<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e66110</article-id><article-id pub-id-type="doi">10.2196/66110</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Use of a Preliminary Artificial Intelligence-Based Laryngeal Cancer Screening Framework for Low-Resource Settings: Development and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Lam</surname><given-names>Shao Wei Sean</given-names></name><degrees>MEng, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Min Hun</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dorosan</surname><given-names>Michael</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Altonji</surname><given-names>Samuel</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tan</surname><given-names>Hiang Khoon</given-names></name><degrees>MBBS, PhD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Walter T</given-names></name><degrees>MD, MHS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Health Services Research Centre, Singapore Health Services Pte Ltd</institution><addr-line>Ngee Ann Kongsi Discovery Tower Level 6, 20 College Road</addr-line><addr-line>Singapore</addr-line><country>Singapore</country></aff><aff id="aff2"><institution>School of Computing and Information Systems, Singapore Management University</institution><addr-line>Singapore</addr-line><country>Singapore</country></aff><aff id="aff3"><institution>Department of Head and Neck Surgery &#x0026; Communication Sciences, Duke University Health System</institution><addr-line>Durham</addr-line><addr-line>NC</addr-line><country>United States</country></aff><aff id="aff4"><institution>Division of Surgery and Surgical Oncology, Singapore General Hospital and National Cancer Centre Singapore</institution><addr-line>Singapore</addr-line><country>Singapore</country></aff><aff id="aff5"><institution>SingHealth Duke-NUS Global Health Institute, Duke-National University of Singapore</institution><addr-line>Singapore</addr-line><country>Singapore</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Gupta</surname><given-names>Gaurav 
Kumar</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Reynoso Aguirre</surname><given-names>Pablo Eliseo</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Arasteh</surname><given-names>Soroosh Tayebi</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Shao Wei Sean Lam, MEng, PhD, Health Services Research Centre, Singapore Health Services Pte Ltd, Ngee Ann Kongsi Discovery Tower Level 6, 20 College Road, Singapore, 169856, Singapore, 65 65767140; <email>gmslasws@nus.edu.sg</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>7</day><month>10</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e66110</elocation-id><history><date date-type="received"><day>23</day><month>09</month><year>2024</year></date><date date-type="rev-recd"><day>31</day><month>07</month><year>2025</year></date><date date-type="accepted"><day>05</day><month>08</month><year>2025</year></date></history><copyright-statement>&#x00A9; Shao Wei Sean Lam, Min Hun Lee, Michael Dorosan, Samuel Altonji, Hiang Khoon Tan, Walter T Lee. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 7.10.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e66110"/><abstract><sec><title>Background</title><p>Early-stage diagnosis of laryngeal cancer significantly improves patient survival and quality of life. However, the scarcity of specialists in low-resource settings hinders the timely review of flexible nasopharyngoscopy (FNS) videos, which are essential for accurate triage of at-risk patients.</p></sec><sec><title>Objective</title><p>We introduce a preliminary AI-based screening framework to address this challenge for the triaging of at-risk patients in low-resource settings. This formative research addresses multiple challenges common in high-dimensional FNS videos: (1) selecting clear, informative images; (2) deriving regions within frames that show an anatomical landmark of interest; and (3) classifying patients into referral grades based on the FNS video frames.</p></sec><sec sec-type="methods"><title>Methods</title><p>The system includes an image quality model (IQM) to identify high-quality endoscopic images, which are then fed into a disease classification model (DCM) trained on efficient convolutional neural network (CNN) modules. To validate our approach, we curated a real-world dataset comprising 132 patients from an academic tertiary care center in the United States.</p></sec><sec sec-type="results"><title>Results</title><p>Based on this dataset, we demonstrated that the IQM quality frame selection achieved an area under the receiver operating characteristic curve (AUROC) of 0.895 and an area under the precision-recall curve (AUPRC) of 0.878. 
When using all the image frames selected by the IQM, the DCM improved its performance by 38% considering the AUROC (from 0.60 to 0.83) and 8% considering the AUPRC (from 0.84 to 0.91). Through an ablation study, it was demonstrated that a minimum of 50 good-quality image frames was required to achieve the improvements. Additionally, an efficient CNN model can achieve 2.5-times-faster inference time than ResNet50.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study demonstrated the feasibility of an AI-based screening framework designed for low-resource settings, showing its capability to triage patients for higher-level care efficiently. This approach promises substantial benefits for health care accessibility and patient outcomes in regions with limited specialist care in outpatient settings. This research provides necessary evidence to continue the development of a fully validated screening system for low-resource settings.</p></sec></abstract><kwd-group><kwd>head and neck cancers</kwd><kwd>flexible nasopharyngoscopy</kwd><kwd>efficient neural nets</kwd><kwd>deep learning</kwd><kwd>cancer triage</kwd><kwd>machine learning</kwd><kwd>artificial intelligence</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Head and neck cancers (HNCs) are the 6th most common cancer worldwide, with a disproportionate growth in incidence and mortality in low- and middle-income countries (LMICs), particularly the West-Pacific and Southeast Asia regions [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. Among HNCs, laryngeal cancer can be challenging to diagnose, with nonspecific and mild symptoms in the early stages. Early-stage diagnosis of laryngeal cancer is crucial to improve survival and quality of life [<xref ref-type="bibr" rid="ref4">4</xref>]. 
Patients presenting with early-stage cancers have a 60%&#x2010;90% chance of cure with local therapy, while those with late-stage cancers have a significantly reduced opportunity for remission [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. In addition, patients with advanced cancers have worse quality of life due to their swallowing, verbal communication, and breathing dysfunctions [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>The early detection of laryngeal cancer requires highly trained health care providers (eg, otolaryngologists) to visualize and interpret the relevant anatomical structures to detect anomalies. In addition, a definitive diagnosis requires downstream histopathological confirmation. Sophisticated endoscopic equipment, such as flexible nasopharyngoscopy (FNS), is necessary to examine the upper aerodigestive tract for abnormalities [<xref ref-type="bibr" rid="ref8">8</xref>]. Experts who can perform this examination and interpret the endoscopic videos are limited in many low- and middle-income countries and in low-resource settings [<xref ref-type="bibr" rid="ref9">9</xref>]. Limited access to specialty care in low- and middle-income countries is apparent for HNCs, with one study estimating the otolaryngologists-to-population ratio in some Asian countries to be as low as 1 per 2,146,000 [<xref ref-type="bibr" rid="ref10">10</xref>]. This results in missed opportunities for early-stage diagnosis [<xref ref-type="bibr" rid="ref2">2</xref>]. Technological advancements, particularly in fiberoptic flexible endoscopy and laser systems, have enabled the shift of many laryngological procedures from the operating room to outpatient clinics [<xref ref-type="bibr" rid="ref11">11</xref>]. In high-volume outpatient settings, trained non-specialists may benefit from artificial intelligence (AI)-based clinical decision support systems encapsulating domain expert knowledge. 
Clinical decision support systems (CDSS) with embedded clinical practice guidelines, rules, and specialist knowledge may more effectively assess and triage the endoscopies performed by non-specialist health care workers while having the advantage of portability and accessibility [<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>AI, specifically machine learning and deep learning, is increasingly used to detect abnormalities in medical images and support cancer clinical decision-making, including screening, diagnosis, and prognosis [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref16">16</xref>]. The early application of deep learning and machine learning models in laryngeal cancer management has demonstrated the potential for detection capabilities comparable to human experts [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Deep convolutional neural networks (DCNNs) have been reported to deal with various data modalities for different use cases across the entire care chain [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. These include real-time lesion detection [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref21">21</xref>] and segmentation [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref21">21</xref>], as well as screening, diagnosis [<xref ref-type="bibr" rid="ref22">22</xref>], management, and prognosis of laryngeal cancers [<xref ref-type="bibr" rid="ref16">16</xref>].</p><p>Deploying these AI models in the imaging practice presents several challenges. Imaging modalities use high-dimensional data. Applications that process single image frames account for the frames&#x2019; pixel resolution and other features (eg, color channels). 
This &#x201C;curse of dimensionality&#x201D; effect, confronted by computational AI models, is compounded when considering video streams that capture many sequential image frames. For FNS procedures, the frame count ranges from hundreds to thousands, depending on the frame capture rate and the procedure&#x2019;s duration.</p><p>Furthermore, the high computational requirements of performant AI-based models must be considered in low-resource settings [<xref ref-type="bibr" rid="ref23">23</xref>-<xref ref-type="bibr" rid="ref28">28</xref>]. Another concern, especially for video-based procedures, such as screening for laryngeal cancer through FNS, is that the frames of interest may only lie within a range of non-blurry, contiguous frames that capture the anatomical landmark of interest (ie, the region of interest within each frame). This adds the challenge of localizing decision-making to a few clear and relevant regions and frames that best inform case escalation to more advanced diagnostic and treatment procedures. A previous study has proposed manually filtering frames to exclude low-quality frames (ie, blurry, noisy) before making an assessment [<xref ref-type="bibr" rid="ref17">17</xref>]. Others have suggested various preprocessing steps to improve the quality of input images [<xref ref-type="bibr" rid="ref29">29</xref>-<xref ref-type="bibr" rid="ref32">32</xref>]; for instance, Huang et al [<xref ref-type="bibr" rid="ref31">31</xref>] suggested using the grayscale adaptive entropy value for setting the threshold to eliminate unclear images and recognize vocal fold disorders.</p><p>This formative study introduces an AI-based framework that denoises high-dimensional FNS videos, selects relevant frames, and suggests care escalation decisions through a referral grade classification task. 
To handle noisy real-world data and select relevant frames, our framework proposes an image quality module (IQM) that conducts a two-step procedure of filtering redundant images using a histogram of gradient-based threshold model and selecting good quality frames using supervised DCNN models. This IQM is used in conjunction with a disease classification module (DCM) that outputs a probability that a case should be escalated to appropriate downstream test and treat procedures. We aim to explore the use of efficient DCNN models and validate whether the proposed framework enhances the performance of correctly classifying cases to appropriate referral grades to address the resource constraints envisioned in less well-resourced settings [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref33">33</xref>].</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data Acquisition</title><p>Our study dataset has 132 full-color FNS videos of varying lengths collected from laryngoscopy procedures conducted in the Duke University Health System from December 2019 to December 2020. The shortest video was 5 seconds, while the longest was 165 seconds. The video clips were captured with various orientations, movements, and variable lighting and contrast conditions during the procedure. Some of these patients were healthy (no laryngeal pathology), some had benign disease processes, and some had laryngeal cancer. Patients were excluded if the videos were taken post-laryngectomy or if the larynx was not visualized on the video. Expert clinicians annotated the videos with medical conditions and referral levels for training classification models. 
The medical conditions were classified into three referral levels by a panel of 4 clinicians (two senior and two junior specialists): Grade 1, no referral required; Grade 2, non-urgent referral or close follow-up in 3&#x2010;4 weeks; and Grade 3, urgent referral.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>All FNS videos were de-identified before analysis to protect patient privacy and confidentiality. The study was approved by the Duke University Health System Institutional Review Board (No. Pro00106209). The IRB granted a waiver of informed consent, as the study involved only de-identified data and posed no risk to participants. No compensation was provided to participants. No identifiable individuals appear in any images or materials included in the manuscript or supplementary files.</p></sec><sec id="s2-3"><title>AI-Based Framework for Laryngeal Cancer Screening</title><sec id="s2-3-1"><title>Overview</title><p><xref ref-type="fig" rid="figure1">Figure 1</xref> shows the proposed framework for screening patients receiving an FNS procedure. The framework includes two main components: the IQM and the DCM. The IQM filters low-quality and irrelevant images through a histogram of gradients-based threshold compared to an indexed 1st frame of each video. A U-Net model, trained with segmentation masks derived from the open-source benchmark for automatic glottis segmentation (BAGLS) dataset [<xref ref-type="bibr" rid="ref30">30</xref>], was used to generate a labeled dataset for training the IQM to select relevant frames. The trained IQM network is then used to further refine the set of high-quality frames. Using selected frames, we train an efficient AI-based DCM to classify the referral grade. 
<xref ref-type="fig" rid="figure1">Figure 1</xref> shows the schematic of the training and inference process based on the IQM and DCM.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Schematic of the proposed AI-based framework based on the IQM and DCM. AI: artificial intelligence; BAGLS: Benchmark for Automatic Glottis Segmentation; DCM: disease classification module; DHS: Duke University Health System; HOG: histogram of gradients; IQM: image quality module.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e66110_fig01.png"/></fig></sec><sec id="s2-3-2"><title>Image Quality Module</title><p>We developed a two-stage IQM based on (1) histogram of gradients-based similarity filtering, followed by (2) a U-Net-based DCNN module to identify a set of relevant, good-quality images. The correlation (ie, by cosine similarity) of features from a histogram of gradients [<xref ref-type="bibr" rid="ref34">34</xref>] was used to evaluate the similarity of contiguous frames with the indexed frame (which is the first frame of relevance in the FNS process). After similarity filtering, the U-Net model was trained to identify the glottal region using the open-source BAGLS dataset [<xref ref-type="bibr" rid="ref30">30</xref>]. Good-quality images are frames where the glottal area is entirely visible, regardless of the image&#x2019;s position and the glottis&#x2019;s size. We then assigned positive and negative quality labels to the Duke University Health System dataset using the U-Net model&#x2019;s predictions. Negative labels indicate poor quality due to obscuration and blurring by natural bodily secretions and movements or irrelevance, that is, frames not of the glottal region.</p></sec><sec id="s2-3-3"><title>Disease Classification Module</title><p>The 132 unique patient videos were filtered into &#x201C;good&#x201D; and &#x201C;poor&#x201D; quality frames with the IQM. 
Sixteen videos were excluded due to insufficient good-quality image frames. The remaining 116 videos were used to train the DCM model using an 80&#x2010;20 patient-level train-test split to avoid data leakage. The DCM classifies patients into binary referral grades: non-referral (Grade 1) versus referral (Grades 2 to 3).</p><p>To develop the DCM, we compared a baseline CNN model [<xref ref-type="bibr" rid="ref35">35</xref>], a ResNet50-based model [<xref ref-type="bibr" rid="ref36">36</xref>], a MobileNetV2-based model [<xref ref-type="bibr" rid="ref33">33</xref>], and a GhostNet-based model [<xref ref-type="bibr" rid="ref26">26</xref>] across six validation metrics. The baseline CNN model has six convolutional layers with maximum pooling and batch normalization, adding dropout [<xref ref-type="bibr" rid="ref37">37</xref>] to the last two layers. ResNet50 employs deep residual learning with skip connections, enabling training of very deep neural networks without the challenge of vanishing gradients [<xref ref-type="bibr" rid="ref38">38</xref>]. MobileNetV2 employs inverted residuals and depthwise separable convolutions for more efficient performance, intended for mobile and embedded devices. Lastly, GhostNet further enhances the efficiency of computations by generating more feature maps from efficient operations; this results in an even more efficient DCNN suitable for lightweight applications.</p></sec><sec id="s2-3-4"><title>Model Evaluation</title><p>The classification performance for IQM at the image level and DCM at the video (ie, patient) level was evaluated across six validation metrics. The train-test split was determined at the patient level to avoid data leakage. For the patient-level classification, we utilized the concept of bootstrap aggregation to evaluate the average classification probability of image frames within the same video. 
The primary metrics that describe the quality of the model predictions are accuracy, weighted F1 score, area under the receiver operating characteristic (AUROC), and area under the precision-recall curve (AUPRC). The secondary metrics that describe the efficiency of the models are the total number of floating-point operations in the order of 10<sup>9</sup> (GFLOPs) [<xref ref-type="bibr" rid="ref39">39</xref>] and inference times. These outcome metrics describe the efficiency and effectiveness of the algorithm for model training, validation, and inference [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. In addition, to address potential data imbalance that may hinder the classification model&#x2019;s ability to learn minor class patterns, the final selected model was further evaluated with training sample augmentation, and binary focal cross entropy loss [<xref ref-type="bibr" rid="ref40">40</xref>]. To assess the impact of the IQM in the overall framework, we performed an ablation study [<xref ref-type="bibr" rid="ref41">41</xref>] in which the DCM classifier was evaluated both with and without IQM-based preprocessing. Specifically, we trained and tested the DCM using input sequences that had undergone the IQM step. In addition, we systematically varied the number of high-quality frames provided to the DCM to examine the effect of input frame count on the classification performance.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Comparison of the BAGLS Dataset and the Study Dataset</title><p><xref ref-type="table" rid="table1">Table 1</xref> summarizes the BAGLS and the study dataset. The BAGLS dataset has approximately 60% healthy patients, whereas our dataset has 30% healthy patients. The number of frames derived from the patient-level videos is roughly the same ratio. 
We used the entire BAGLS cohort, comprising 59,250 frames, to label informative frames. The raw dataset comprised 190,978 images derived from 132 patients.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Summary of cohorts from the BAGLS<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> dataset and our dataset.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="2">BAGLS</td><td align="left" valign="bottom" colspan="2">Study dataset</td></tr><tr><td align="left" valign="top">Disorder Status</td><td align="left" valign="top">Frame count (%)</td><td align="left" valign="top">Patient count (%)</td><td align="left" valign="top">Frame count (%)</td><td align="left" valign="top">Patient count (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Healthy (Grade 1)</td><td align="left" valign="top">35,400 (59.7)</td><td align="left" valign="top">382 (59.7)</td><td align="left" valign="top">49,282 (25.8)</td><td align="left" valign="top">40 (30.3)</td></tr><tr><td align="left" valign="top">Unhealthy (Grade 2/3)</td><td align="left" valign="top">23,850 (40.3)</td><td align="left" valign="top">258 (40.3)</td><td align="left" valign="top">141,696 (74.2)</td><td align="left" valign="top">92 (69.7)</td></tr><tr><td align="left" valign="top">Total</td><td align="left" valign="top">59,250 (100)</td><td align="left" valign="top">640 (100)</td><td align="left" valign="top">190,978 (100)</td><td align="left" valign="top">132 (100)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>BAGLS: Benchmark for Automatic Glottis Segmentation.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Performance of the Image Quality Module</title><p><xref ref-type="table" rid="table2">Table 2</xref> compares the test performance of the baseline CNN model [<xref ref-type="bibr" rid="ref35">35</xref>], ResNet50-based model 
[<xref ref-type="bibr" rid="ref36">36</xref>], and GhostNet-based model [<xref ref-type="bibr" rid="ref26">26</xref>] for the IQM. Although the ResNet50-based model had the best accuracy of 0.833, the best F1 score of 0.832, and the best AUPRC of 0.957, the GhostNet model had comparable performance with the ResNet50 model and the best AUROC score of 0.895 with the fewest GFLOPs for computation.</p><p>The IQM model generated 20,040 good-quality frames from 116 patients in the study dataset cohort for the DCM training and test sets. Of these, 34/116 patients (29.3%) were classified as having Grade 1 disease, while the remaining 82/116 (70.7%) were classified as having Grade 2/3 disease. GhostNet resulted in the highest AUROC and AUPRC, while being the most efficient, that is, the lowest GFLOPs.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Comparison of the performance of the different deep convolutional neural network architectures used in the image quality module (IQM).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">F1 score</td><td align="left" valign="bottom">AUROC<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="bottom">AUPRC<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="bottom">GFLOPs<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Baseline convolutional neural network</td><td align="char" char="." valign="top">0.699</td><td align="char" char="." valign="top">0.673</td><td align="char" char="." valign="top">0.724</td><td align="char" char="." valign="top">0.729</td><td align="char" char="." valign="top">50.0</td></tr><tr><td align="left" valign="top">ResNet50</td><td align="char" char="." valign="top">0.833</td><td align="char" char="." valign="top">0.832</td><td align="char" char="." valign="top">0.746</td><td align="char" char="." valign="top">0.957</td><td align="char" char="." valign="top">245.0</td></tr><tr><td align="left" valign="top">GhostNet</td><td align="char" char="." valign="top">0.829</td><td align="char" char="." valign="top">0.827</td><td align="char" char="." valign="top">0.895</td><td align="char" char="." valign="top">0.878</td><td align="char" char="." valign="top">8.7</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table2fn2"><p><sup>b</sup>AUPRC: area under the precision-recall curve.</p></fn><fn id="table2fn3"><p><sup>c</sup>GFLOP: number of floating-point operations in the order of 10<sup>9</sup>.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Performance of the Disease Classification Module</title><p>The ResNet50 and GhostNet DCM achieved accuracy, optimal F1-scores, and AUPRC exceeding 80% at the video-level classification (<xref ref-type="table" rid="table3">Table 3</xref>). The ResNet50 model&#x2019;s inference time was 20.44 s, nearly 2.5 times slower than that of GhostNet (7.95 s per batch). 
Using an inference batch size of 64, 224-pixel-sized images (ie, height and width), ResNet50 had 245.0 GFLOPs, 40 times more than the GhostNet model with 8.7 GFLOPs.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Performance comparison of different disease classification module (DCM) classifiers at the patient level.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">F1 score</td><td align="left" valign="bottom">AUROC<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom">AUPRC<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="bottom">Inference time (s)</td><td align="left" valign="bottom">GFLOPs<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Convolutional neural network</td><td align="left" valign="top">0.652</td><td align="left" valign="top">0.624</td><td align="left" valign="top">0.595</td><td align="left" valign="top">0.805</td><td align="left" valign="top">8.09</td><td align="left" valign="top">50.0</td></tr><tr><td align="left" valign="top">ResNet50</td><td align="left" valign="top">0.739</td><td align="left" valign="top">0.697</td><td align="left" valign="top">0.667</td><td align="left" valign="top">0.850</td><td align="left" valign="top">16.71</td><td align="left" valign="top">245.0</td></tr><tr><td align="left" valign="top">MobileNetV2</td><td align="left" valign="top">0.696</td><td align="left" valign="top">0.629</td><td align="left" valign="top">0.611</td><td align="left" valign="top">0.833</td><td align="left" valign="top">8.62</td><td align="left" valign="top">20.3</td></tr><tr><td align="left" valign="top">GhostNet</td><td align="left" valign="top">0.870</td><td align="left" valign="top">0.863</td><td align="left" 
valign="top">0.833</td><td align="left" valign="top">0.912</td><td align="left" valign="top">7.95</td><td align="left" valign="top">8.7</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table3fn2"><p><sup>b</sup>AUPRC: area under the precision-recall curve.</p></fn><fn id="table3fn3"><p><sup>c</sup>GFLOP: number of floating-point operations in the order of 10<sup>9</sup>.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Ablation Study</title><p>As the video-level prediction is based on bootstrap aggregation or bagging [<xref ref-type="bibr" rid="ref42">42</xref>], the number of frames available to generate disease predictions (post-IQM) will be sensitive to the number of good-quality frames available per patient.</p><p><xref ref-type="table" rid="table4">Table 4</xref> shows the sensitivity of predictive quality across the number of high-quality frames. When using all the image frames selected by the IQM, the DCM improved its performance by 38% considering the AUROC (from 0.60 to 0.83) and 8% considering the AUPRC (from 0.84 to 0.91). 
Our results showed that 50 good-quality frames per patient video were required to outperform the model&#x2019;s base case without IQM.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Ablation study results show GhostNet-based disease classification module performance at varying numbers (n) of good-quality frames per patient selected by the image quality module (IQM).</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">F1 score</td><td align="left" valign="bottom">AUROC<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="bottom">AUPRC<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="bottom">Without IQM</td><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/></tr></thead><tbody><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Original number</td><td align="left" valign="top">0.704</td><td align="left" valign="top">0.633</td><td align="left" valign="top">0.600</td><td align="left" valign="top">0.840</td></tr><tr><td align="left" valign="top">With IQM, n</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>10</td><td align="left" valign="top">0.676</td><td align="left" valign="top">0.545</td><td align="left" valign="top">0.500</td><td align="left" valign="top">0.839</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>30</td><td align="left" valign="top">0.622</td><td align="left" 
valign="top">0.625</td><td align="left" valign="top">0.698</td><td align="left" valign="top">0.877</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>50</td><td align="left" valign="top">0.784</td><td align="left" valign="top">0.770</td><td align="left" valign="top">0.710</td><td align="left" valign="top">0.884</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No limit<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">0.870</td><td align="left" valign="top">0.863</td><td align="left" valign="top">0.833</td><td align="left" valign="top">0.912</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table4fn2"><p><sup>b</sup>AUPRC: area under the precision-recall curve.</p></fn><fn id="table4fn3"><p><sup>c</sup>All frames classified as good quality by the IQM are used.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study showed the feasibility of an efficient AI-based screening framework incorporating an image quality filtering module to select high-quality and relevant image frames from FNS videos. Our ablation study demonstrated that the integration of IQM resulted in higher-quality DCM predictions across all the performance metrics at the patient level. Using a minimum of 50 high-quality frames, the DCM showed better predictive performance across all the metrics compared to the base model, where all the image frames were used without the IQM. Addressing the challenges of selecting informative image frames has been identified as a key impediment in developing laryngeal cancer screening algorithms [<xref ref-type="bibr" rid="ref17">17</xref>]. 
Our formative research highlights the IQM&#x2019;s potential to enhance training and inference through effective frame selection.</p><p>We leveraged the efficient GhostNet architecture for our IQM and DCM as an alternative to the more resource-intensive ResNet50 model. GhostNet-based models have demonstrated performance comparable to those using less efficient architectures, such as ResNet50 [<xref ref-type="bibr" rid="ref9">9</xref>]. In our study, the GhostNet-based DCM produced the best model across the validation metrics. The model achieved an accuracy of 87% and a high AUROC (0.833) and AUPRC (0.912) for classification at the patient level, with the optimal F1 score of 0.863 (<xref ref-type="table" rid="table3">Table 3</xref>). This level of performance, combined with the model&#x2019;s efficiency, makes it more suitable for integration into low-cost FNS facilities and screening equipment.</p><p>Our dataset, comprising 132 patients with 190,978 frames, is smaller than the dataset in a prior study [<xref ref-type="bibr" rid="ref21">21</xref>], which trained and validated a segmentation model on data from 557 patients with 3933 frames and tested on two additional datasets. Nonetheless, limited patient datasets are common in this field. A related study [<xref ref-type="bibr" rid="ref43">43</xref>] evaluated a CNN model on 100 patients with 170 images, while another study [<xref ref-type="bibr" rid="ref44">44</xref>] used data from just 33 patients with 1320 images to assess machine learning algorithms.</p><p>Current state-of-the-art computer vision models use transformer-based AI models to classify images, segment pixels, or localize objects within images [<xref ref-type="bibr" rid="ref45">45</xref>]. While achieving high performance scores on established benchmarks, these models are computationally costly, with computational workloads exceeding those of the ResNet models [<xref ref-type="bibr" rid="ref46">46</xref>-<xref ref-type="bibr" rid="ref48">48</xref>]. 
Studies that compared traditional DCNNs, like those explored in this study, with transformer-based models highlighted greater computational costs and dependence on large training datasets [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>]. Given these limitations, particularly in the context of deployment in low-resource clinical settings, there remains a strong case for exploring simpler, more efficient architectures. This study focused on efficient DCNNs to develop and validate the AI-based IQM-DCM screening framework for laryngeal cancer, emphasizing practical feasibility and predictive performance.</p><p>While our FNS videos reflect a realistic clinical setting, they may not fully represent the constraints of low-resource environments. This study serves as a preliminary step toward demonstrating the feasibility of the AI-based IQM-DCM screening framework. Acknowledging the limitations of our dataset, we are actively expanding data collection efforts with multiple partners to further enhance the framework&#x2019;s robustness and generalizability across diverse low-resource clinical contexts [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref51">51</xref>]. Recent developments in efficient transformer network models will also be evaluated further to refine the dual-stage screening framework [<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref53">53</xref>]. Cost-effectiveness analysis and implementation studies will also be conducted to achieve the envisioned system, which can support referral decisions in low-resource settings [<xref ref-type="bibr" rid="ref16">16</xref>].</p></sec><sec id="s4-2"><title>Conclusion</title><p>This study demonstrates the potential of the IQM-DCM framework to be embedded in an AI-based system to support early screening and triaging of patients at risk of laryngeal cancer. 
This preliminary work provides early evidence supporting the feasibility of this approach. Notably, the IQM-DCM framework, leveraged on lightweight neural network architectures, is shown to outperform conventional CNN models across various effectiveness and efficiency metrics. Future work will expand the dataset, incorporate recent advances in efficient network architectures, and validate the framework across more diverse populations to enhance its generalizability and real-world clinical applicability.</p></sec></sec></body><back><ack><p>The study team would like to thank Ms. Zhenzhi Chen for managing this project and facilitating the necessary agreements that enabled its progress. This project is funded by the Duke/Duke-NUS Innovation Collaboration Pilot Grant (Duke/Duke-NUS/ICA(Pilot)/2020/0003). WTL wishes to acknowledge his joint appointment with the Duke-National University of Singapore.</p></ack><notes><sec><title>Data Availability</title><p>Access to data used in this study is restricted to approved research collaborators due to concerns about privacy and institutional ethics policies.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">AUPRC</term><def><p>area under precision-recall curve</p></def></def-item><def-item><term id="abb3">AUROC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb4">BAGLS</term><def><p>benchmark for automatic glottis segmentation</p></def></def-item><def-item><term id="abb5">CNN</term><def><p>convolutional neural network</p></def></def-item><def-item><term id="abb6">DCM</term><def><p>disease classification module</p></def></def-item><def-item><term id="abb7">DCNN</term><def><p>deep convolutional neural network</p></def></def-item><def-item><term 
id="abb8">FNS</term><def><p>flexible nasopharyngoscopy</p></def></def-item><def-item><term id="abb9">GFLOP</term><def><p>number of floating-point operations in the order of 10<sup>9</sup></p></def></def-item><def-item><term id="abb10">HNC</term><def><p>head and neck cancer</p></def></def-item><def-item><term id="abb11">IQM</term><def><p>image quality module</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gupta</surname><given-names>B</given-names> </name><name name-style="western"><surname>Johnson</surname><given-names>NW</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>N</given-names> </name></person-group><article-title>Global epidemiology of head and neck cancers: a continuing challenge</article-title><source>Oncology (Williston Park, NY)</source><year>2016</year><volume>91</volume><issue>1</issue><fpage>13</fpage><lpage>23</lpage><pub-id pub-id-type="doi">10.1159/000446117</pub-id><pub-id pub-id-type="medline">27245686</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Patterson</surname><given-names>RH</given-names> </name><name name-style="western"><surname>Fischman</surname><given-names>VG</given-names> </name><name name-style="western"><surname>Wasserman</surname><given-names>I</given-names> </name><etal/></person-group><article-title>Global burden of head and neck cancer: economic consequences, health, and the role of surgery</article-title><source>Otolaryngol Head Neck Surg</source><year>2020</year><month>03</month><volume>162</volume><issue>3</issue><fpage>296</fpage><lpage>303</lpage><pub-id pub-id-type="doi">10.1177/0194599819897265</pub-id><pub-id pub-id-type="medline">31906785</pub-id></nlm-citation></ref><ref 
id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schultz</surname><given-names>P</given-names> </name></person-group><article-title>Vocal fold cancer</article-title><source>Eur Ann Otorhinolaryngol Head Neck Dis</source><year>2011</year><month>12</month><volume>128</volume><issue>6</issue><fpage>301</fpage><lpage>308</lpage><pub-id pub-id-type="doi">10.1016/j.anorl.2011.04.004</pub-id><pub-id pub-id-type="medline">21959270</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Unger</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lohscheller</surname><given-names>J</given-names> </name><name name-style="western"><surname>Reiter</surname><given-names>M</given-names> </name><name name-style="western"><surname>Eder</surname><given-names>K</given-names> </name><name name-style="western"><surname>Betz</surname><given-names>CS</given-names> </name><name name-style="western"><surname>Schuster</surname><given-names>M</given-names> </name></person-group><article-title>A noninvasive procedure for early-stage discrimination of malignant and precancerous vocal fold lesions based on laryngeal dynamics analysis</article-title><source>Cancer Res</source><year>2015</year><month>01</month><day>1</day><volume>75</volume><issue>1</issue><fpage>31</fpage><lpage>39</lpage><pub-id pub-id-type="doi">10.1158/0008-5472.CAN-14-1458</pub-id><pub-id pub-id-type="medline">25371410</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>D&#x2019;cruz</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Anand</surname><given-names>AK</given-names> </name><etal/></person-group><article-title>Consensus recommendations for management of head and neck cancer in Asian countries: a review of international guidelines</article-title><source>Oral Oncol</source><year>2013</year><month>09</month><volume>49</volume><issue>9</issue><fpage>872</fpage><lpage>877</lpage><pub-id pub-id-type="doi">10.1016/j.oraloncology.2013.05.010</pub-id><pub-id pub-id-type="medline">23830839</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chaturvedi</surname><given-names>P</given-names> </name><name name-style="western"><surname>Singhavi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Malik</surname><given-names>A</given-names> </name><name name-style="western"><surname>Nair</surname><given-names>D</given-names> </name></person-group><article-title>Outcome of head and neck squamous cell cancers in low-resource settings: challenges and opportunities</article-title><source>Otolaryngol Clin North Am</source><year>2018</year><month>06</month><volume>51</volume><issue>3</issue><fpage>619</fpage><lpage>629</lpage><pub-id pub-id-type="doi">10.1016/j.otc.2018.01.008</pub-id><pub-id pub-id-type="medline">29501327</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hoare</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Thomson</surname><given-names>HG</given-names> </name><name name-style="western"><surname>Proops</surname><given-names>DW</given-names> </name></person-group><article-title>Detection of laryngeal cancer--the case for early specialist assessment</article-title><source>J R Soc 
Med</source><year>1993</year><month>07</month><volume>86</volume><issue>7</issue><fpage>390</fpage><lpage>392</lpage><pub-id pub-id-type="doi">10.1177/014107689308600707</pub-id><pub-id pub-id-type="medline">8053995</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Alvi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Harsha</surname><given-names>P</given-names> </name></person-group><source>Flexible Nasopharyngoscopy</source><year>2022</year><access-date>2025-09-23</access-date><publisher-name>StatPearls Publishing</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/books/NBK539740/">https://www.ncbi.nlm.nih.gov/books/NBK539740/</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ara&#x00FA;jo</surname><given-names>T</given-names> </name><name name-style="western"><surname>Santos</surname><given-names>CP</given-names> </name><name name-style="western"><surname>De Momi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Moccia</surname><given-names>S</given-names> </name></person-group><article-title>Learned and handcrafted features for early-stage laryngeal SCC diagnosis</article-title><source>Med Biol Eng Comput</source><year>2019</year><month>12</month><volume>57</volume><issue>12</issue><fpage>2683</fpage><lpage>2692</lpage><pub-id pub-id-type="doi">10.1007/s11517-019-02051-5</pub-id><pub-id pub-id-type="medline">31728933</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Smith</surname><given-names>JD</given-names> </name><name 
name-style="western"><surname>Tan</surname><given-names>KL</given-names> </name></person-group><article-title>Workforce considerations, training, and diseases of the Asia-Pacific region</article-title><source>Otolaryngol Clin North Am</source><year>2018</year><month>06</month><volume>51</volume><issue>3</issue><fpage>659</fpage><lpage>665</lpage><pub-id pub-id-type="doi">10.1016/j.otc.2018.01.010</pub-id><pub-id pub-id-type="medline">29482921</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Khalid</surname><given-names>H</given-names> </name><name name-style="western"><surname>Alvaran</surname><given-names>KAB</given-names> </name><name name-style="western"><surname>Hey</surname><given-names>S</given-names> </name><name name-style="western"><surname>Watson</surname><given-names>N</given-names> </name><name name-style="western"><surname>Karagama</surname><given-names>Y</given-names> </name></person-group><article-title>Improving laryngeal procedure workflow: moving from the operating room to the outpatient setting</article-title><source>Laryngoscope</source><year>2025</year><month>03</month><volume>135</volume><issue>3</issue><fpage>1132</fpage><lpage>1142</lpage><pub-id pub-id-type="doi">10.1002/lary.31849</pub-id><pub-id pub-id-type="medline">39460684</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yue</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zeng</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>H</given-names> </name><etal/></person-group><article-title>A deep learning based smartphone application for early detection of 
nasopharyngeal carcinoma using endoscopic images</article-title><source>NPJ Digit Med</source><year>2024</year><month>12</month><day>31</day><volume>7</volume><issue>1</issue><fpage>384</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01403-2</pub-id><pub-id pub-id-type="medline">39738998</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koh</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Papanikolaou</surname><given-names>N</given-names> </name><name name-style="western"><surname>Bick</surname><given-names>U</given-names> </name><etal/></person-group><article-title>Artificial intelligence and machine learning in cancer imaging</article-title><source>Commun Med (Lond)</source><year>2022</year><volume>2</volume><issue>1</issue><fpage>133</fpage><pub-id pub-id-type="doi">10.1038/s43856-022-00199-0</pub-id><pub-id pub-id-type="medline">36310650</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kourou</surname><given-names>K</given-names> </name><name name-style="western"><surname>Exarchos</surname><given-names>KP</given-names> </name><name name-style="western"><surname>Papaloukas</surname><given-names>C</given-names> </name><name name-style="western"><surname>Sakaloglou</surname><given-names>P</given-names> </name><name name-style="western"><surname>Exarchos</surname><given-names>T</given-names> </name><name name-style="western"><surname>Fotiadis</surname><given-names>DI</given-names> </name></person-group><article-title>Applied machine learning in cancer research: a systematic review for patient diagnosis, classification and prognosis</article-title><source>Comput Struct Biotechnol J</source><year>2021</year><volume>19</volume><fpage>5546</fpage><lpage>5555</lpage><pub-id 
pub-id-type="doi">10.1016/j.csbj.2021.10.006</pub-id><pub-id pub-id-type="medline">34712399</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kumar</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>S</given-names> </name><name name-style="western"><surname>Singla</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>YC</given-names> </name></person-group><article-title>A systematic review of artificial intelligence techniques in cancer prediction and diagnosis</article-title><source>Arch Comput Methods Eng</source><year>2022</year><volume>29</volume><issue>4</issue><fpage>2043</fpage><lpage>2070</lpage><pub-id pub-id-type="doi">10.1007/s11831-021-09648-w</pub-id><pub-id pub-id-type="medline">34602811</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bensoussan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Vanstrum</surname><given-names>EB</given-names> </name><name name-style="western"><surname>Johns</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Rameau</surname><given-names>A</given-names> </name></person-group><article-title>Artificial intelligence and laryngeal cancer: from screening to prognosis: a state of the art review</article-title><source>Otolaryngol Head Neck Surg</source><year>2023</year><month>03</month><volume>168</volume><issue>3</issue><fpage>319</fpage><lpage>329</lpage><pub-id pub-id-type="doi">10.1177/01945998221110839</pub-id><pub-id pub-id-type="medline">35787073</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Xiong</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>P</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>JG</given-names> </name><etal/></person-group><article-title>Computer-aided diagnosis of laryngeal cancer via deep learning based on laryngoscopic images</article-title><source>EBioMedicine</source><year>2019</year><month>10</month><volume>48</volume><fpage>92</fpage><lpage>99</lpage><pub-id pub-id-type="doi">10.1016/j.ebiom.2019.08.075</pub-id><pub-id pub-id-type="medline">31594753</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhong</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name></person-group><article-title>Hierarchical dynamic convolutional neural network for laryngeal disease classification</article-title><source>Sci Rep</source><year>2022</year><volume>12</volume><issue>1</issue><fpage>13914</fpage><pub-id pub-id-type="doi">10.1038/s41598-022-18217-5</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>R</given-names> </name><name name-style="western"><surname>Jie</surname><given-names>P</given-names> </name><name name-style="western"><surname>Hou</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Real-time artificial intelligence-assisted detection and segmentation of nasopharyngeal carcinoma 
using multimodal endoscopic data: a multi-center, prospective study</article-title><source>EClinicalMedicine</source><year>2025</year><month>03</month><volume>81</volume><fpage>103120</fpage><pub-id pub-id-type="doi">10.1016/j.eclinm.2025.103120</pub-id><pub-id pub-id-type="medline">40026832</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Baldini</surname><given-names>C</given-names> </name><name name-style="western"><surname>Migliorelli</surname><given-names>L</given-names> </name><name name-style="western"><surname>Berardini</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Improving real-time detection of laryngeal lesions in endoscopic images using a decoupled super-resolution enhanced YOLO</article-title><source>Comput Methods Programs Biomed</source><year>2025</year><month>03</month><volume>260</volume><fpage>108539</fpage><pub-id pub-id-type="doi">10.1016/j.cmpb.2024.108539</pub-id><pub-id pub-id-type="medline">39689500</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sampieri</surname><given-names>C</given-names> </name><name name-style="western"><surname>Azam</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Ioppi</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Real-time laryngeal cancer boundaries delineation on white light and narrow-band imaging laryngoscopy with deep learning</article-title><source>Laryngoscope</source><year>2024</year><month>06</month><volume>134</volume><issue>6</issue><fpage>2826</fpage><lpage>2834</lpage><pub-id pub-id-type="doi">10.1002/lary.31255</pub-id><pub-id pub-id-type="medline">38174772</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>ZH</given-names> </name><name name-style="western"><surname>Fan</surname><given-names>DG</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>JQ</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>YZ</given-names> </name></person-group><article-title>Computer-aided diagnosis of laryngeal cancer based on deep learning with laryngoscopic images</article-title><source>Diagnostics (Basel)</source><year>2023</year><month>12</month><day>14</day><volume>13</volume><issue>24</issue><fpage>3669</fpage><pub-id pub-id-type="doi">10.3390/diagnostics13243669</pub-id><pub-id pub-id-type="medline">38132254</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ha</surname><given-names>DS</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>F</given-names> </name><name name-style="western"><surname>Yi</surname><given-names>Y</given-names> </name></person-group><article-title>Efficient neural networks for edge devices</article-title><source>Comput Electr Eng</source><year>2021</year><month>06</month><volume>92</volume><fpage>107121</fpage><pub-id pub-id-type="doi">10.1016/j.compeleceng.2021.107121</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>R</given-names> 
</name><name name-style="western"><surname>Tai</surname><given-names>YW</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>CK</given-names> </name></person-group><article-title>Network trimming: a data-driven neuron pruning approach towards efficient deep architectures</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 12, 2016</comment><pub-id pub-id-type="doi">10.48550/arXiv.1607.03250</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Jacob</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kligys</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Quantization and training of neural networks for efficient integer-arithmetic-only inference</article-title><conf-name>2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jun 18-23, 2018</conf-date><conf-loc>Salt Lake City, UT</conf-loc><pub-id pub-id-type="doi">10.1109/CVPR.2018.00286</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Han</surname><given-names>K</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tian</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>C</given-names> </name></person-group><article-title>GhostNet: more features from cheap operations</article-title><year>2020</year><conf-name>2020 
IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jun 13-19, 2020</conf-date><conf-loc>Seattle, WA, USA</conf-loc><fpage>1577</fpage><lpage>1586</lpage><pub-id pub-id-type="doi">10.1109/CVPR42600.2020.00165</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>J</given-names> </name></person-group><article-title>ShuffleNet: an extremely efficient convolutional neural network for mobile devices</article-title><conf-name>2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jun 18-23, 2018</conf-date><conf-loc>Salt Lake City, UT</conf-loc><pub-id pub-id-type="doi">10.1109/CVPR.2018.00716</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shi</surname><given-names>W</given-names> </name><name name-style="western"><surname>Cao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>L</given-names> </name></person-group><article-title>Edge computing: vision and challenges</article-title><source>IEEE Internet Things J</source><year>2016</year><volume>3</volume><issue>5</issue><fpage>637</fpage><lpage>646</lpage><pub-id pub-id-type="doi">10.1109/JIOT.2016.2579198</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dunham</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Kong</surname><given-names>KA</given-names> </name><name name-style="western"><surname>McWhorter</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Adkins</surname><given-names>LK</given-names> </name></person-group><article-title>Optical biopsy: automated classification of airway endoscopic findings using a convolutional neural network</article-title><source>Laryngoscope</source><year>2022</year><month>02</month><volume>132 Suppl 4</volume><fpage>S1</fpage><lpage>S8</lpage><pub-id pub-id-type="doi">10.1002/lary.28708</pub-id><pub-id pub-id-type="medline">32343434</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>G&#x00F3;mez</surname><given-names>P</given-names> </name><name name-style="western"><surname>Kist</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Schlegel</surname><given-names>P</given-names> </name><etal/></person-group><article-title>BAGLS, a multihospital benchmark for automatic glottis segmentation</article-title><source>Sci Data</source><year>2020</year><month>06</month><day>19</day><volume>7</volume><issue>1</issue><fpage>186</fpage><pub-id pub-id-type="doi">10.1038/s41597-020-0526-3</pub-id><pub-id pub-id-type="medline">32561845</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Leu</surname><given-names>YS</given-names> </name><name name-style="western"><surname>Kuo</surname><given-names>CFJ</given-names> </name><name 
name-style="western"><surname>Chu</surname><given-names>WL</given-names> </name><name name-style="western"><surname>Chu</surname><given-names>YH</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>HC</given-names> </name></person-group><article-title>Automatic recognizing of vocal fold disorders from glottis images</article-title><source>Proc Inst Mech Eng H</source><year>2014</year><month>09</month><volume>228</volume><issue>9</issue><fpage>952</fpage><lpage>961</lpage><pub-id pub-id-type="doi">10.1177/0954411914551851</pub-id><pub-id pub-id-type="medline">25313026</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kuo</surname><given-names>CFJ</given-names> </name><name name-style="western"><surname>Lai</surname><given-names>WS</given-names> </name><name name-style="western"><surname>Barman</surname><given-names>J</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>SC</given-names> </name></person-group><article-title>Quantitative laryngoscopy with computer-aided diagnostic system for laryngeal lesions</article-title><source>Sci Rep</source><year>2021</year><month>05</month><day>12</day><volume>11</volume><issue>1</issue><fpage>10147</fpage><pub-id pub-id-type="doi">10.1038/s41598-021-89680-9</pub-id><pub-id pub-id-type="medline">33980940</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sandler</surname><given-names>M</given-names> </name><name name-style="western"><surname>Howard</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhmoginov</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Chen</surname><given-names>LC</given-names> </name></person-group><article-title>MobileNetV2: inverted residuals and linear bottlenecks</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 13, 2018</comment><pub-id pub-id-type="doi">10.48550/arXiv.1801.04381</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Dalal</surname><given-names>N</given-names> </name><name name-style="western"><surname>Triggs</surname><given-names>B</given-names> </name></person-group><article-title>Histograms of oriented gradients for human detection</article-title><conf-name>2005 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR&#x2019;05)</conf-name><conf-date>Jun 20-25, 2005</conf-date><conf-loc>San Diego, CA, USA</conf-loc><fpage>886</fpage><lpage>893</lpage><pub-id pub-id-type="doi">10.1109/CVPR.2005.177</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Indolia</surname><given-names>S</given-names> </name><name name-style="western"><surname>Goswami</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Mishra</surname><given-names>SP</given-names> </name><name name-style="western"><surname>Asopa</surname><given-names>P</given-names> </name></person-group><article-title>Conceptual understanding of convolutional neural network- a deep learning approach</article-title><source>Procedia Comput Sci</source><year>2018</year><volume>132</volume><fpage>679</fpage><lpage>688</lpage><pub-id pub-id-type="doi">10.1016/j.procs.2018.05.069</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>He</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Ren</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>J</given-names> </name></person-group><article-title>Deep residual learning for image recognition</article-title><conf-name>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jun 27-30, 2016</conf-date><conf-loc>Las Vegas, NV, USA</conf-loc><fpage>770</fpage><lpage>778</lpage><pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Srivastava</surname><given-names>N</given-names> </name><name name-style="western"><surname>Hinton</surname><given-names>G</given-names> </name><name name-style="western"><surname>Krizhevsky</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name><name name-style="western"><surname>Salakhutdinov</surname><given-names>R</given-names> </name></person-group><article-title>Dropout: a simple way to prevent neural networks from overfitting</article-title><source>J Mach Learn Res</source><year>2014</year><access-date>2025-09-23</access-date><volume>15</volume><issue>56</issue><fpage>1929</fpage><lpage>1958</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.5555/2627435.2670313">https://dl.acm.org/doi/abs/10.5555/2627435.2670313</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hochreiter</surname><given-names>S</given-names> </name></person-group><article-title>The vanishing 
gradient problem during learning recurrent neural nets and problem solutions</article-title><source>Int J Unc Fuzz Knowl Based Syst</source><year>1998</year><month>04</month><volume>06</volume><issue>2</issue><fpage>107</fpage><lpage>116</lpage><pub-id pub-id-type="doi">10.1142/S0218488598000094</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bangalore Vijayakumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chitty-Venkata</surname><given-names>KT</given-names> </name><name name-style="western"><surname>Arya</surname><given-names>K</given-names> </name><name name-style="western"><surname>Somani</surname><given-names>AK</given-names> </name></person-group><article-title>ConVision benchmark: a contemporary framework to benchmark CNN and ViT models</article-title><source>AI</source><year>2024</year><volume>5</volume><issue>3</issue><fpage>1132</fpage><lpage>1171</lpage><pub-id pub-id-type="doi">10.3390/ai5030056</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Jadon</surname><given-names>S</given-names> </name></person-group><article-title>A survey of loss functions for semantic segmentation</article-title><access-date>2025-09-23</access-date><conf-name>2020 IEEE Conference on Computational Intelligence in Bioinformatics and Computational Biology (CIBCB)</conf-name><conf-date>Oct 27-29, 2020</conf-date><conf-loc>Via del Mar, Chile</conf-loc><fpage>1</fpage><lpage>7</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=9276489">https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=9276489</ext-link></comment><pub-id pub-id-type="doi">10.1109/CIBCB48159.2020.9277638</pub-id></nlm-citation></ref><ref 
id="ref41"><label>41</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Meyes</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>M</given-names> </name><name name-style="western"><surname>de Puiseau</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Meisen</surname><given-names>T</given-names> </name></person-group><article-title>Ablation studies in artificial neural networks</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 24, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1901.08644</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Breiman</surname><given-names>L</given-names> </name></person-group><article-title>Bagging predictors</article-title><source>Mach Learn</source><year>1996</year><month>08</month><volume>24</volume><issue>2</issue><fpage>123</fpage><lpage>140</lpage><pub-id pub-id-type="doi">10.1023/A:1018054314350</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bengs</surname><given-names>M</given-names> </name><name name-style="western"><surname>Westermann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gessert</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Spatio-spectral deep learning methods for in-vivo hyperspectral laryngeal cancer detection</article-title><source>Computer-Aided Diagnosis</source><year>2020</year><fpage>369</fpage><lpage>374</lpage><pub-id pub-id-type="doi">10.1117/12.2549251</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Moccia</surname><given-names>S</given-names> </name><name name-style="western"><surname>De Momi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Guarnaschelli</surname><given-names>M</given-names> </name><name name-style="western"><surname>Savazzi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Laborai</surname><given-names>A</given-names> </name></person-group><article-title>Confident texture-based laryngeal tissue classification for early stage diagnosis support</article-title><source>J Med Imag</source><year>2017</year><month>09</month><volume>4</volume><issue>3</issue><fpage>1</fpage><pub-id pub-id-type="doi">10.1117/1.JMI.4.3.034502</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Takahashi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sakaguchi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kouno</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Comparison of vision transformers and convolutional neural networks in medical image analysis: a systematic review</article-title><source>J Med Syst</source><year>2024</year><month>09</month><day>12</day><volume>48</volume><issue>1</issue><fpage>84</fpage><pub-id pub-id-type="doi">10.1007/s10916-024-02105-8</pub-id><pub-id pub-id-type="medline">39264388</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>LL</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Yang</surname><given-names>M</given-names> </name></person-group><article-title>Towards efficient vision transformer inference: a first study of transformers on mobile devices</article-title><access-date>2025-10-01</access-date><conf-name>Proceedings of the 23rd Annual International Workshop on Mobile Computing Systems and Applications</conf-name><conf-date>Mar 9-10, 2022</conf-date><conf-loc>New York, NY, USA</conf-loc><fpage>1</fpage><lpage>7</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/10.1145/3508396">https://dl.acm.org/doi/10.1145/3508396</ext-link></comment><pub-id pub-id-type="doi">10.1145/3508396.3512869</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Youn</surname><given-names>E</given-names> </name><name name-style="western"><surname>Prabhu</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name></person-group><article-title>Compressing vision transformers for low-resource visual learning</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 5, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.02617</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Habib</surname><given-names>G</given-names> </name><name name-style="western"><surname>Saleem</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Lall</surname><given-names>B</given-names> </name></person-group><article-title>Knowledge distillation in vision transformers: a critical review</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 4, 2023</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2302.02108</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maur&#x00ED;cio</surname><given-names>J</given-names> </name><name name-style="western"><surname>Domingues</surname><given-names>I</given-names> </name><name name-style="western"><surname>Bernardino</surname><given-names>J</given-names> </name></person-group><article-title>Comparing vision transformers and convolutional neural networks for image classification: a literature review</article-title><source>Appl Sci (Basel)</source><year>2023</year><month>01</month><volume>13</volume><issue>9</issue><fpage>5521</fpage><pub-id pub-id-type="doi">10.3390/app13095521</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Landman</surname><given-names>BA</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>SK</given-names> </name></person-group><article-title>Transforming medical imaging with transformers? 
A comparative review of key properties, current progresses, and future perspectives</article-title><source>Med Image Anal</source><year>2023</year><month>04</month><volume>85</volume><fpage>102762</fpage><pub-id pub-id-type="doi">10.1016/j.media.2023.102762</pub-id><pub-id pub-id-type="medline">36738650</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Futoma</surname><given-names>J</given-names> </name><name name-style="western"><surname>Simons</surname><given-names>M</given-names> </name><name name-style="western"><surname>Panch</surname><given-names>T</given-names> </name><name name-style="western"><surname>Doshi-Velez</surname><given-names>F</given-names> </name><name name-style="western"><surname>Celi</surname><given-names>LA</given-names> </name></person-group><article-title>The myth of generalisability in clinical research and machine learning in health care</article-title><source>Lancet Digit Health</source><year>2020</year><month>09</month><volume>2</volume><issue>9</issue><fpage>e489</fpage><lpage>e492</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(20)30186-2</pub-id><pub-id pub-id-type="medline">32864600</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name></person-group><article-title>Efficient visual transformer by learnable token merging</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 21, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.15219</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Mehta</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rastegari</surname><given-names>M</given-names> </name></person-group><article-title>MobileViT: light-weight, general-purpose, and mobile-friendly vision transformer</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 5, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2110.02178</pub-id></nlm-citation></ref></ref-list></back></article>