<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e70070</article-id><article-id pub-id-type="doi">10.2196/70070</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Exploring Generative Pre-Trained Transformer-4-Vision for Nystagmus Classification: Development and Validation of a Pupil-Tracking Process</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Noda</surname><given-names>Masao</given-names></name><degrees>MD, MBA, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Koshu</surname><given-names>Ryota</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tsunoda</surname><given-names>Reiko</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Ogihara</surname><given-names>Hirofumi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kamo</surname><given-names>Tomohiko</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ito</surname><given-names>Makoto</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Fushiki</surname><given-names>Hiroaki</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Otolaryngology, Mejiro University Ear Institute Clinic</institution><addr-line>320 Ukiya</addr-line><addr-line>Iwatsuki-ku, Saitama-shi, Saitama</addr-line><country>Japan</country></aff><aff id="aff2"><institution>Department of Otolaryngology, Jichi Medical University</institution><addr-line>Shimotsuke</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Chandrashekar</surname><given-names>Pramod Bharadwaj</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Raza</surname><given-names>Shaina</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Masao Noda, MD, MBA, PhD, Department of Otolaryngology, Mejiro University Ear Institute Clinic, 320 Ukiya, Iwatsuki-ku, Saitama-shi, Saitama, 339-8501, Japan, 81 48 797 3341; <email>doforanabdosuc@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date 
pub-type="epub"><day>6</day><month>6</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e70070</elocation-id><history><date date-type="received"><day>14</day><month>12</month><year>2024</year></date><date date-type="rev-recd"><day>02</day><month>03</month><year>2025</year></date><date date-type="accepted"><day>07</day><month>04</month><year>2025</year></date></history><copyright-statement>&#x00A9; Masao Noda, Ryota Koshu, Reiko Tsunoda, Hirofumi Ogihara, Tomohiko Kamo, Makoto Ito, Hiroaki Fushiki. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 6.6.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e70070"/><abstract><sec><title>Background</title><p>Conventional nystagmus classification methods often rely on subjective observation by specialists, which is time-consuming and variable among clinicians. Recently, deep learning techniques have been used to automate nystagmus classification using convolutional and recurrent neural networks. 
These networks can accurately classify nystagmus patterns using video data. However, associated challenges include the need for large datasets when creating models, limited applicability to specific image conditions, and the complexity associated with using these models.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate a novel approach for nystagmus classification that used the Generative Pre-trained Transformer 4 Vision (GPT-4V) model, which is a state-of-the-art large-scale language model with powerful image recognition capabilities.</p></sec><sec sec-type="methods"><title>Methods</title><p>We developed a pupil-tracking process using a nystagmus-recording video and verified the optimization model&#x2019;s accuracy using GPT-4V classification and nystagmus recording. We tested whether the created optimization model could be evaluated in six categories of nystagmus: right horizontal, left horizontal, upward, downward, right torsional, and left torsional. The traced trajectory was input as two-dimensional coordinate data or an image, and multiple in-context learning methods were evaluated.</p></sec><sec sec-type="results"><title>Results</title><p>The developed model showed an overall classification accuracy of 37% when using pupil-traced images and a maximum accuracy of 24.6% when pupil coordinates were used as input. Regarding orientation, we achieved a maximum accuracy of 69% for the classification of horizontal nystagmus patterns but a lower accuracy for the vertical and torsional components.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>We demonstrated the potential of versatile vertigo management in a generative artificial intelligence model that improves the accuracy and efficiency of nystagmus classification. We also highlighted areas for further improvement, such as expanding the dataset size and enhancing input modalities, to improve classification performance across all nystagmus types. 
The GPT-4V model validated only for recognizing still images can be linked to video classification and proposed as a novel method.</p></sec></abstract><kwd-group><kwd>nystagmus</kwd><kwd>GPT-4Vision</kwd><kwd>generative AI</kwd><kwd>deep learning</kwd><kwd>dizziness</kwd><kwd>artificial intelligence</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Equilibrium function in vertigo practice can be evaluated through nystagmus assessment. Nystagmus is characterized by rhythmically repeated rapid and slow eye movements and serves as a valuable clinical indicator for diagnosing various neurological and vestibular disorders. Nystagmus can influence the normal function of the cerebellum, semicircular canals, and integrated eye movements, and thus is of great diagnostic and therapeutic importance [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. The direction of nystagmus can be horizontal, vertical, or torsional (rotational), depending on the axis of the eye movement. The evaluation of nystagmus patterns provides essential insight into the function of the visual and vestibular systems by identifying underlying foci and guiding treatment strategies. Traditionally, nystagmus classification has relied heavily on subjective observation by trained specialists, which is time-consuming, prone to variability among clinicians, and can be difficult to perform in the emergency department [<xref ref-type="bibr" rid="ref4">4</xref>]. 
Furthermore, delayed or inaccurate diagnosis can contribute to improper treatment, increasing the risk of falls or decreasing daily physical activity levels [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>In recent years, advances in artificial intelligence (AI) and machine learning technologies have provided promising means to capture eye movements [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>] and automate nystagmus classification, thereby improving the accuracy and efficiency of diagnosis [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. In particular, deep learning methods such as convolutional and recurrent neural networks are increasingly being used to analyze medical imaging data, including videos that capture eye movements [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. Although these techniques have shown success in tasks such as image classification, object detection, and segmentation, applying deep learning to nystagmus classification remains challenging given the temporal changes inherent in eye movement patterns. Interestingly, recent studies have demonstrated promising outcomes by using deep learning techniques to annotate scenes or detect spatiotemporal features. This suggests the potential of deep learning algorithms to classify nystagmus patterns; however, the capture and interpretation of these temporal characteristics require sophisticated modeling methods that can effectively process ordinal data. Notably, several reports have used deep learning to enable a unified evaluation of other perceptions; however, creating models requires large amounts of data, and it is unclear how such models can be used.</p><p>A large-scale language processing model, known as a large language model (LLM), is a highly versatile system trained on extensive text data using a transformer architecture. 
It has demonstrated high accuracy in medical classification and text recognition [<xref ref-type="bibr" rid="ref13">13</xref>]. In one such model, the Generative Pre-Trained Transformer (GPT), the advent of GPT-4Vision (GPT-4V) made it possible to combine image recognition by devising prompts and classification without requiring image training data [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. Unlike convolutional neural networks or recurrent neural networks, GPT-4V leverages in-context learning, allowing it to classify visual and multimodal data without extensive training datasets [<xref ref-type="bibr" rid="ref16">16</xref>]. In this study, we developed a novel nystagmus classification approach that leverages the capabilities of the GPT-4V model. We aimed to develop a classification system that can accurately identify different nystagmus patterns and validate its accuracy by integrating GPT-4V with an eye movement tracking algorithm from eye movement video data. This study is one of the first to evaluate the feasibility of GPT-4V for nystagmus classification, particularly in scenarios where dataset limitations exist.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>In this study, we developed a pupil-tracking process using nystagmus recording videos and verified the accuracy of the optimization model using GPT-4V classification and nystagmus recording (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Study overview. 
GPT-4V: Generative Pre-trained Transformer 4 Vision.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e70070_fig01.png"/></fig><sec id="s2-1"><title>Developing the Pupil Tracking Process</title><p>First, the eyeballs were recognized from the video data, and pupil movement co-ordinate data were created using an eye movement tracking algorithm that showed the eyeball trajectory. Video data were recorded using a charge-coupled device-based camera eyeball rotation imaging device, ET-60LW2 (Newopto), with a focal length of 6 mm, and a horizontal resolution &#x003E;500 television lines. The sensor size is 1/3 inch and intermittent synchronous lighting occurred every 1/30 second. A single video contained at least three nystagmus in the same direction and was 3 to 5 seconds in duration.</p><p>Based on these data, 190 trace images were created for each video from patients whose main complaint was dizziness. Videos were created using the same algorithm used to generate trace images of the same individual, with no more than two nystagmus events. After applying the exclusion criteria, trace data were successfully generated for 139 patients. The exclusion criteria included cases in which the edges of the eyes extended beyond the screen, images that were too unclear to be evaluated, and the presence of foreign objects or masses on the eyelid or conjunctiva.</p></sec><sec id="s2-2"><title>Pupil Movement Tracking Algorithm</title><p>An eye movement tracking algorithm based on video data was developed using the Haar cascade classifier and OpenCV [<xref ref-type="bibr" rid="ref17">17</xref>]. Although deep learning&#x2013;based approaches may offer higher accuracy for detecting facial features, they often require extensive data or a pretrained model, as well as comparatively intensive inference processes. 
The outline of the pupil and its center within the screen were detected, enabling the tracking of the center&#x2019;s coordinates. In contrast, the Haar cascade classifier, as part of the OpenCV suite, offers a more streamlined and efficient alternative, enabling the tracking of the eye&#x2019;s trajectory at a rate of 40 frames per second with significantly reduced complexity and setup time, as reported previously [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>]. The algorithm was designed to superimpose trajectory data onto the first frames of eye movement videos, marking the starting and ending points of the pupil center&#x2019;s paths. If the algorithm failed to detect the eye due to occlusions or rapid movements, the trajectories were interpolated, and measurement points that could not be captured were omitted from the trajectory path. Additionally, the movement data comprising the x and y coordinates of the pupil at each measurement time point were systematically converted into CSV files for further analysis using LLMs.</p><p>This approach proved strategic for our study&#x2019;s requirements, allowing for rapid development and ease of modification. This was particularly beneficial in our context, in which real-time processing was prioritized over the incremental gains in accuracy afforded by more computationally intensive models.</p></sec><sec id="s2-3"><title>GPT-4V Classification With Nystagmus Recording</title><p>We developed GPT-4V models to classify eye movement trajectories by generating still images tracked by the algorithm and then inputting these images or CSV data into the GPT-4V model to obtain answers for the classification. When only traced images or coordinates were input, no significant or advantageous responses were obtained. 
For CSV data inputs containing the pupils&#x2019; coordinates (X and Y axes) and their respective measurements, we embedded these data directly into the prompts for the GPT-4 model. We tested three combinations of inputs: only CSV, which used GPT-4 (GPT-4 Turbo); only still images; and a combination of CSV and still images, with the latter two using GPT-4V (GPT-4 Turbo with vision).</p><p>A feature of LLMs is in-context learning through prompting. The model was developed using prompts based on previous studies [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>] and the chain-of-thoughts (CoT) prompting technique, which allows LLMs to make complex inferences by entering thought processes into the prompts to facilitate the inference process and reasoning [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>] (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). We also used methods, such as metarecognition (MR) and the Rule of 26, which complicates the thought process by making the user aware of the content in reaction to their responses [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. We tested whether the created optimization model could be evaluated in six nystagmus categories: right horizontal, left horizontal, upward, downward, right torsional, and left torsional. Of the trace images obtained from the pupil-tracking process, those with trace points other than the pupil, traces beyond the eyeline, or some areas that could not be traced were excluded by visual inspection by experts. The correctness rate was evaluated for 139 data points (78, 26, and 35 in the horizontal, vertical, and torsional directions, respectively).</p><p>An application programming interface was set up for model validation, and the temperature parameter was set to 0 to account for variations in responses. 
Experts with &#x003E;20 years of vertigo practice experience judged whether the answers and the explanatory content were appropriate. Even if the final answer was correct, the details that led to the answer were checked, and if they were incorrect, the answer was considered incorrect. Data collected from the video recordings of eye movements exhibiting various nystagmus patterns were used. Each video clip was independently reviewed and annotated by two skilled vertigo specialists to identify the presence and type of nystagmus pattern.</p></sec><sec id="s2-4"><title>Ethical Considerations</title><p>This study was approved by the Medical Research Ethics Committee of Mejiro University (approval number: Medical 20&#x2010;007). Informed consent was obtained from all subjects involved in the study. Written informed consent has been obtained from the patients to publish this paper, as applicable. In accordance with ethical guidelines, an opt-out approach was adopted. Detailed information regarding the study&#x2014;including its purpose, data handling procedures, and measures for protecting personal information&#x2014;was made publicly available, and participants were given the opportunity to decline participation if they wished. All data collected in this study were anonymized, ensuring that no personally identifiable information was included in the analysis or publication. No financial or material compensation was provided to the participants.</p></sec><sec id="s2-5"><title>Study/Clinical Setting of Recruitment</title><p>Participants were recruited in this study from the otolaryngology outpatient clinic at Mejiro University, a tertiary center specializing in the diagnosis and treatment of vestibular disorders. The patients included in this study were referred to the clinic after experiencing episodes of vertigo. 
During the clinical examination, ocular movements were recorded and the resulting video data were used for this study.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>In this study, we developed a system to classify nystagmus using GPT-4V and obtained the following results.</p><sec id="s3-1"><title>Pupil Tracking Process</title><p>The eye movement tracking program accurately recognized the eyeballs in the video and depicted their trajectories (<xref ref-type="fig" rid="figure2">Figure 2</xref>). The program successfully tracked eye movements and generated datasets for each nystagmus category.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Representative tracing images of six nystagmus types (ie, right horizontal, left horizontal, upward, downward, right torsional, and left torsional) obtained from video-based pupil tracking. Data were collected from patients with vestibular disorders. The images illustrate pupil movement trajectories detected using an AI-based classification model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e70070_fig02.png"/></fig></sec><sec id="s3-2"><title>GPT-4V Prompting</title><p>A nystagmus classification model was constructed by setting and training appropriate prompts on the GPT-4V model. Optimizing the prompts improved the classification accuracy and adapted the model&#x2019;s response to specific nystagmus patterns. We used both basic and additional prompts for CoT and MR (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s3-3"><title>Validation of Nystagmus With GPT-4V</title><p>For the &#x201C;image only&#x201D; input category, the &#x201C;basic&#x201D; prompt method yielded a correct response in 43 (30.9%) instances, whereas no response was recorded in 24 (17.3%) instances (<xref ref-type="table" rid="table1">Table 1</xref>). 
The &#x201C;CSV only&#x201D; input category under the &#x201C;basic&#x201D; prompt approach resulted in 27 (19.4%) correct responses, with 14 (10.1%) instances of no response. When both images and CSV input were used, the &#x201C;basic&#x201D; prompt method delivered 37 (26.6%) correct responses, with no response in 12 (8.6%) instances.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Performance of GPT-4V in classifying nystagmus patterns based on video-recorded eye movements. The table presents the classification accuracy for six nystagmus types across different input modalities (image only, CSV only, and image+CSV). Data were collected from 139 patients diagnosed with vestibular disorders.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Prompting techniques</td><td align="left" valign="bottom" colspan="9">Input modalities and classification accuracy (N=139)</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="3">Image only</td><td align="left" valign="bottom" colspan="3">CSV only</td><td align="left" valign="bottom" colspan="3">Image+CSV</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct,<break/>n (%)</td><td align="left" valign="top">No response,<break/>n (%)</td><td align="left" valign="top">Correct response (%)</td><td align="left" valign="top">Correct,<break/>n (%)</td><td align="left" valign="top">No response,<break/>n (%)</td><td align="left" valign="top">Correct response (%)</td><td align="left" valign="top">Correct,<break/>n (%)</td><td align="left" valign="top">No response,<break/>n (%)</td><td align="left" valign="top">Correct response (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Basic</td><td align="left" valign="top">43 (30.9)</td><td align="left" valign="top">24 (17.3)</td><td align="left" valign="top">&#x2003;37.4</td><td align="left" valign="top">27 (19.4)</td><td align="left" 
valign="top">14 (10.1)</td><td align="left" valign="top">&#x2003;21.6</td><td align="left" valign="top">37 (26.6)</td><td align="left" valign="top">12 (8.6)</td><td align="left" valign="top">&#x2003;29.1</td></tr><tr><td align="left" valign="top">CoT</td><td align="left" valign="top">47 (33.8)</td><td align="left" valign="top">13 (9.4)</td><td align="left" valign="top">&#x2003;37.3</td><td align="left" valign="top">20 (14.4)</td><td align="left" valign="top">26 (18.7)</td><td align="left" valign="top">&#x2003;17.7</td><td align="left" valign="top">33 (23.7)</td><td align="left" valign="top">32 (23.0)</td><td align="left" valign="top">&#x2003;30.8</td></tr><tr><td align="left" valign="top">MR</td><td align="left" valign="top">50 (36.0)</td><td align="left" valign="top">5 (3.6)</td><td align="left" valign="top">&#x2003;37.3</td><td align="left" valign="top">28 (20.1)</td><td align="left" valign="top">25 (18.0)</td><td align="left" valign="top">&#x2003;24.6</td><td align="left" valign="top">30 (21.6)</td><td align="left" valign="top">33 (23.7)</td><td align="left" valign="top">&#x2003;28.3</td></tr><tr><td align="left" valign="top">CoT+ MR</td><td align="left" valign="top">48 (34.5)</td><td align="left" valign="top">12 (8.6)</td><td align="left" valign="top">&#x2003;37.8</td><td align="left" valign="top">15 (10.8)</td><td align="left" valign="top">69 (49.6)</td><td align="left" valign="top">&#x2003;21.4</td><td align="left" valign="top">35 (25.2)</td><td align="left" valign="top">32 (23.0)</td><td align="left" valign="top">&#x2003;32.7</td></tr></tbody></table></table-wrap><p>The other methods showed similar trends, with the CoT prompt approach slightly improving in the &#x201C;image only&#x201D; input category with 47 (33.8%) correct responses. Conversely, the MR prompt method outperformed the others in the &#x201C;image only&#x201D; input domain, with 50 (36.0%) correct classifications. 
When combining the CoT and MR prompts, the &#x201C;image only&#x201D; input domain showed 48 (34.5%) correct classifications. However, there was a modest improvement in the &#x201C;image+CSV&#x201D; input category, with 35 (25.2%) correct responses.</p><p><xref ref-type="table" rid="table2">Table 2</xref> shows the outcome of the GPT-4V classification accuracy in the presence of nystagmus-recording data segregated according to each nystagmus direction. Four types of prompting classification strategies were assessed: basic, CoT, MR, and a composite of CoT and MR. The evaluation was further stratified into three data inputs: image only, CSV only, and Image+CSV.</p><p>For downward and upward nystagmus, the highest correct classification rates were 37.5% and 27.8%, with a total data count of 8 and 18 instances, respectively.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>GPT-4V classification with nystagmus recording for each direction.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Nystagmus direction</td><td align="left" valign="bottom" colspan="8">Classification strategies and data input categories</td><td align="left" valign="bottom" colspan="4"/></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="4">Image only (correct classification rates), n (%)</td><td align="left" valign="bottom" colspan="4">CSV only (correct classification rates), n (%)</td><td align="left" valign="bottom" colspan="4">Image+CSV (correct classification rates), n (%)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Basic</td><td align="left" valign="top">CoT</td><td align="left" valign="top">MR</td><td align="left" valign="top">CoT+MR</td><td align="left" valign="top">Basic</td><td align="left" valign="top">CoT</td><td align="left" valign="top">MR</td><td align="left" valign="top">CoT+MR</td><td align="left" valign="top">Basic</td><td align="left" 
valign="top">CoT</td><td align="left" valign="top">MR</td><td align="left" valign="top">CoT+MR</td></tr></thead><tbody><tr><td align="left" valign="top">Downward (n=8)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, n (%)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (12.5)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">2 (25)</td><td align="left" valign="top">2 (25)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">2 (25)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (12.5)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">3 (37.5)</td></tr><tr><td align="left" valign="top">Left horizontal (n=36)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, n (%)</td><td align="left" valign="top">14 (38.9)</td><td align="left" valign="top">14 (38.9)</td><td align="left" valign="top">20 (55.6)</td><td align="left" valign="top">14 (38.9)</td><td align="left" valign="top">3 (8.3)</td><td align="left" valign="top">2 (5.6)</td><td align="left" valign="top">3 (8.3)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">9 (25)</td><td align="left" valign="top">7 (19.4)</td><td align="left" valign="top">9 (25)</td><td align="left" valign="top">7 (19.4)</td></tr><tr><td align="left" valign="top">Left torsional (n=16)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, n (%)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (6.3)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">3 (18.8)</td><td align="left" valign="top">1 (6.3)</td><td align="left" valign="top">3 (18.8)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (6.3)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 
(0)</td></tr><tr><td align="left" valign="top">Right horizontal (n=42)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, n (%)</td><td align="left" valign="top">27 (64.3)</td><td align="left" valign="top">27 (64.3)</td><td align="left" valign="top">25 (59.5)</td><td align="left" valign="top">29 (69.0)</td><td align="left" valign="top">20 (47.6)</td><td align="left" valign="top">15 (35.7)</td><td align="left" valign="top">17 (40.5)</td><td align="left" valign="top">10 (23.8)</td><td align="left" valign="top">23 (54.8)</td><td align="left" valign="top">23 (54.8)</td><td align="left" valign="top">17 (40.5)</td><td align="left" valign="top">22 (52.4)</td></tr><tr><td align="left" valign="top">Right torsional (n=19)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, n (%)</td><td align="left" valign="top">1 (5.3)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (5.3)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">Upward (n=18)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, n (%)</td><td align="left" valign="top">1 (5.6)</td><td align="left" valign="top">5 (27.8)</td><td align="left" valign="top">4 (22.2)</td><td align="left" valign="top">3 (16.7)</td><td align="left" valign="top">2 (11.1)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">5 (27.8)</td><td align="left" valign="top">1 (5.6)</td><td align="left" valign="top">4 (22.2)</td><td align="left" valign="top">2 (11.1)</td><td align="left" valign="top">4 (22.2)</td><td align="left" valign="top">3 (16.7)</td></tr></tbody></table><table-wrap-foot><fn 
id="table2fn1"><p><sup>a</sup>Total number of nystagmus cases examined.</p></fn></table-wrap-foot></table-wrap><p>For left and right torsional nystagmus, the highest correct classification rates were 6.3% and 5.3% with the &#x201C;image only&#x201D; inputs, 18.8% and 5.3% with &#x201C;CSV only&#x201D; inputs, and 6.3% and 0% with &#x201C;image+CSV&#x201D; inputs, respectively.</p><p>For the left horizontal nystagmus with 36 data instances, the &#x201C;image only&#x201D; input showed superior performance with 38.9% correct classifications using the basic prompt, improving incrementally with the MR prompt at 55.6%. In the right horizontal category with 42 instances, the &#x201C;image only&#x201D; input showed a higher correct rate compared to the other input types, with the basic and CoT prompts demonstrating an accuracy rate of 64.3%, while the combined CoT+MR prompt exhibited the highest accuracy rate at 69%.</p><p>The &#x201C;CSV only&#x201D; mode indicated a generally lower correct classification rate across all directions and methodologies. Notably, the right torsional and left torsional classifications demonstrated zero correct classifications with basic prompts.</p><p>Overall, <xref ref-type="table" rid="table2">Table 2</xref> highlights the varying degrees of classification accuracy depending on the direction of nystagmus, data presentation format, and classification method. Inputs containing image data generally showed improved classification performance compared to using the &#x201C;CSV only&#x201D; input.</p><p>The response trend for each type of data input was analyzed in terms of each direction (<xref ref-type="fig" rid="figure3">Figure 3</xref>). The classification performance of the GPT model was evaluated using three different input modalities: image only, CSV only, and image+CSV. 
When using the &#x201C;image only&#x201D; input, the model achieved an accuracy of 0.289, with a precision of 0.211, a recall of 0.264, and an <italic>F</italic><sub>1</sub>-score of 0.169. For the &#x201C;CSV only&#x201D; input, the accuracy was 0.247, with a precision of 0.218, a recall of 0.210, and an <italic>F</italic><sub>1</sub>-score of 0.178. The combination of image and CSV inputs (image+ CSV) resulted in the highest performance among the three input types, with an accuracy of 0.356, a precision of 0.186, a recall of 0.222, and an <italic>F</italic><sub>1</sub>-score of 0.191. The confusion matrix showed a high frequency of horizontal responses, and this tendency was greater for &#x201C;image only&#x201D; and &#x201C;image+ CSV&#x201D; inputs. For inputs containing images, horizontal nystagmus tended to be misclassified more often, as horizontal nystagmus in the opposite direction occurred more frequently than in the other components. Results for the &#x201C;CSV Only&#x201D; input showed a broader distribution of misclassifications across multiple categories, with no specific tendency toward a particular type of nystagmus. Additionally, a notably high number of responses were categorized as &#x201C;others,&#x201D; indicating a difficulty in making definitive classifications using CSV data alone. 
In comparison, the &#x201C;Image&#x201D; input demonstrated a reduction in &#x201C;Others&#x201D; responses, highlighting improved performance and specificity when combining data modalities.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Confusion matrix of GPT-4V classification with nystagmus recording for each direction.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e70070_fig03.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In this study, we created and verified the accuracy of a nystagmus classification model using the GPT-4V. The results revealed several interesting insights. First, it became clear that nystagmus classification is possible using AI generated from a LLM model. Furthermore, the model&#x2019;s accuracy varied with different prompting adjustments, indicating that the accuracy varied with each nystagmus pattern, suggesting room for improvement in the model&#x2019;s performance without retraining the model or parameter tuning. Second, it was revealed that inputting images rather than tracking the pupil coordinates resulted in higher accuracy. Thus, the GPT-4V&#x2013;based nystagmus classification model achieved a certain level of success, and this study serves as the first step toward validating its potential for video analysis.</p><p>The GPT-4V-based nystagmus classification model derived from LLM demonstrated the ability to distinguish and classify different nystagmus patterns from video data. Overall, an accuracy rate of 17&#x2010;38% was achieved, with the classification of horizontal nystagmus patterns showing a 70% accuracy rate. This indicates that the GPT-4V model can effectively capture subtle differences in horizontal eye movements and suggests specific neurological or vestibular states. 
The fact that nystagmus classification is possible with an LLM model implies the potential for further development of nystagmus classification models previously tackled using deep learning models. The classification accuracy of deep learning models has been reported to be between 60&#x2010;90%, averaging approximately 80%, whereas GPT-4V, similar to other LLMs, has achieved a certain level of accuracy for horizontal nystagmus [<xref ref-type="bibr" rid="ref27">27</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. As LLMs do not require image training data and can be used in conversations, they can be readily used alongside other information during clinical assessments to confirm answers and diagnostics. At present, LLMs including GPT have improved learning accuracy and can generate still images and audio; however, video recognition has not yet been reported. In the future, our methodology can be applied to videos using the LLM.</p><p>Notably, accuracy varies with prompts and conditions [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. We also report that the accuracy of medical licensing and otolaryngology expertise examinations can be improved by presenting choices in English and confirming the examiner&#x2019;s status [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. For the classification of nystagmus, we tested CoT, MR, and their combination, but found no significant improvement in accuracy. GPT is a pretrained general-purpose LLM, and its accuracy is believed to depend on factors such as the number of model parameters, amount of training data, and scale of computational resources [<xref ref-type="bibr" rid="ref32">32</xref>]. When provided with appropriate prompts, the model may use a greater portion of its parameters more efficiently, generating optimal outputs. 
Designing appropriate prompts for specific nystagmus patterns is necessary to further increase the model&#x2019;s discrimination accuracy. Regarding the input methods, the &#x201C;CSV only&#x201D; input showed a notable tendency toward instances where the model frequently failed to classify any type of nystagmus. In contrast, inputs that included image data consistently resulted in producing a specific classification. This indicates that inputs containing images are more effective in enabling the model to provide responses and perform accurate classifications. Moreover, providing multimodal information, such as still images, patient information, and head position, is expected to increase the accuracy of balance function tests.</p><p>For specific nystagmus patterns, classification was possible for horizontal movements; however, the accuracy of classifying vertical and torsional components was low, regardless of the image or coordinate input. This can be attributed to the evaluation of two-dimensional movements, which makes torsional assessment challenging, similar to the limitations of deep learning using two-dimensional video capture [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. Additionally, the amplitude of vertical movements was smaller than that of horizontal movements, which could be another reason for lower accuracy. Improving the accuracy and pupil tracking methods with the development of prompts specialized for vertical domains may also be effective for higher precision. Responses showed higher accuracy for identifying horizontal nystagmus compared to other types. These results suggest that the accuracy can be further improved by increasing the frame rate of the input images, performing more detailed preprocessing, such as changing the settings for each direction, or taking coordinates in three-dimensional directions. 
This study shows that GPT-4V, an LLM trained on extensive data, can achieve a certain level of accuracy through in-context learning methods such as CoT and MR, which highlights the broad applicability of this model.</p><p>Clinically, nystagmus findings are crucial indicators for evaluating vestibular function and are indispensable in clinical settings for neurological and otologic diseases [<xref ref-type="bibr" rid="ref5">5</xref>]. However, variability in physician assessments and the occurrence of findings only during vertigo attacks necessitate a stable evaluation model. Electronystagmography, a common method for recording nystagmus clinically, records eye movements as corneoretinal potentials but cannot measure torsional eye movements and has the disadvantage of difficulty in capturing three-dimensional movements. Additionally, the need for specialized equipment makes real-time recording during vertigo attacks challenging, limiting its frequent use in clinical practice. In contrast, methods for recording eye movements using video are becoming widespread [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>], and with recent advancements in deep learning technology, consistent assessments may become possible. The use of highly versatile LLMs can further expand their application. For example, LLM can be used in written exchanges to confirm repetitions, reasons for thinking so, and corresponding details. The GPT-4V model, with its real-time clinical setting applications, demonstrated faster inference time and lower computational complexity than conventional deep learning techniques. Since some hallucinations may yield incorrect answers as if they were correct, a human must make the decision considering the AI responses [<xref ref-type="bibr" rid="ref35">35</xref>]. The model&#x2019;s accuracy, especially in this case, should be limited to cases where a human confirms the answer. 
Further improvement of the model&#x2019;s accuracy is required in the future.</p><p>As a limitation, this verification was specialized for classifying six types of nystagmus in videos and did not evaluate abnormal detections during regular examinations or nystagmus containing multiple components. Additionally, one limitation is that the images being evaluated depend on the nystagmus in videos obtained during the examination, leading to variability in the amount of data depending on the type of nystagmus. Therefore, future studies should include improvements in the classification accuracy of nystagmus patterns and verification of the model&#x2019;s adaptability to mixed types of clinical data, such as horizontal- and vertical-torsional nystagmus. Moreover, the issue of hallucinations in LLMs is crucial, and how they are used is essential. At a minimum, educational purposes such as training medical professionals and pre-evaluation before doctors can judge the videos, could be effective. Devices capable of recording nystagmus using smartphones exist, and considerations must be made to record and assess patients without medical intervention [<xref ref-type="bibr" rid="ref36">36</xref>].</p><p>The GPT-4V&#x2013;based nystagmus classification model represents significant advancements in medical imaging and diagnostic techniques. Its high accuracy, efficiency, and potential for real-time application make it a valuable tool for improving the diagnosis and management of nystagmus. Continuous research and development in this area are essential for improving the model and maximizing its clinical utility.</p></sec><sec id="s4-2"><title>Conclusions</title><p>In this study, we developed a nystagmus classification model using GPT-4V and evaluated its performance. Unlike previous deep learning models, GPT-4V, centered on an LLM, presents a promising method for classifying nystagmus in video data and is expected to contribute to improved accuracy and efficiency in medical diagnoses. 
This represents a significant advance in medical AI, and it is crucial to continue refining the model and consider its clinical applications to fully realize the potential benefits that AI technology brings to the medical field.</p></sec></sec></body><back><ack><p>The authors would like to thank Editage for their assistance in the English language editing process.</p><p>This research received no external funding.</p></ack><notes><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: MN, HF</p><p>Data curation: MN</p><p>Investigation: MN, HF</p><p>Methodology: MN, HF</p><p>Project administration: MN, HF</p><p>Resources: RT, HF</p><p>Supervision: HF</p><p>Validation: MN, HF</p><p>Visualization: MN, HF</p><p>Writing &#x2013; original draft: MN</p><p>Writing &#x2013; review &#x0026; editing: RK, RT, HO, TK, MI, HF</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CoT</term><def><p>Chain of Thoughts</p></def></def-item><def-item><term id="abb3">GPT-4V</term><def><p>Generative Pre-trained Transformer-4-Vision</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">MR</term><def><p>metarecognition</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ospina</surname><given-names>LH</given-names> </name></person-group><article-title>Dealing with nystagmus</article-title><source>J Binocul Vis Ocul 
Motil</source><year>2018</year><volume>68</volume><issue>4</issue><fpage>99</fpage><lpage>109</lpage><pub-id pub-id-type="doi">10.1080/2576117X.2018.1493311</pub-id><pub-id pub-id-type="medline">30322349</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kates</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Beal</surname><given-names>CJ</given-names> </name></person-group><article-title>Nystagmus</article-title><source>JAMA</source><year>2021</year><month>02</month><day>23</day><volume>325</volume><issue>8</issue><fpage>798</fpage><pub-id pub-id-type="doi">10.1001/jama.2020.3911</pub-id><pub-id pub-id-type="medline">33620408</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gottlob</surname><given-names>I</given-names> </name></person-group><article-title>Nystagmus</article-title><source>Curr Opin Ophthalmol</source><year>2001</year><month>10</month><volume>12</volume><issue>5</issue><fpage>378</fpage><lpage>383</lpage><pub-id pub-id-type="doi">10.1097/00055735-200110000-00010</pub-id><pub-id pub-id-type="medline">11588502</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Newman-Toker</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Camargo</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Hsieh</surname><given-names>YH</given-names> </name><name name-style="western"><surname>Pelletier</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Edlow</surname><given-names>JA</given-names> </name></person-group><article-title>Disconnect between charted vestibular diagnoses and 
emergency department management decisions: a cross-sectional analysis from a nationally representative sample</article-title><source>Acad Emerg Med</source><year>2009</year><month>10</month><volume>16</volume><issue>10</issue><fpage>970</fpage><lpage>977</lpage><pub-id pub-id-type="doi">10.1111/j.1553-2712.2009.00523.x</pub-id><pub-id pub-id-type="medline">19799573</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bhattacharyya</surname><given-names>N</given-names> </name><name name-style="western"><surname>Gubbels</surname><given-names>SP</given-names> </name><name name-style="western"><surname>Schwartz</surname><given-names>SR</given-names> </name><etal/></person-group><article-title>Clinical practice guideline: Benign paroxysmal positional vertigo (update)</article-title><source>Otolaryngol--head neck surg</source><year>2017</year><month>03</month><volume>156</volume><issue>S3</issue><fpage>S1</fpage><lpage>S47</lpage><pub-id pub-id-type="doi">10.1177/0194599816689667</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lopez-Escamez</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Gamiz</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Fernandez-Perez</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gomez-Fi&#x00F1;ana</surname><given-names>M</given-names> </name></person-group><article-title>Long-term outcome and health-related quality of life in benign paroxysmal positional vertigo</article-title><source>Eur Arch Otorhinolaryngol</source><year>2005</year><month>06</month><volume>262</volume><issue>6</issue><fpage>507</fpage><lpage>511</lpage><pub-id pub-id-type="doi">10.1007/s00405-004-0841-x</pub-id><pub-id 
pub-id-type="medline">15942805</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Santini</surname><given-names>T</given-names> </name><name name-style="western"><surname>Fuhl</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kasneci</surname><given-names>E</given-names> </name></person-group><article-title>PuReST: robust pupil tracking for real-time pervasive eye tracking</article-title><conf-name>ETRA &#x2019;18: 2018 Symposium on Eye Tracking Research and Applications</conf-name><conf-date>Jun 14-17, 2018</conf-date><conf-loc>Warsaw Poland</conf-loc><publisher-name>ACM</publisher-name><fpage>1</fpage><lpage>5</lpage><pub-id pub-id-type="doi">10.1145/3204493.3204578</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Eivazi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Santini</surname><given-names>T</given-names> </name><name name-style="western"><surname>Keshavarzi</surname><given-names>A</given-names> </name><name name-style="western"><surname>K&#x00FC;bler</surname><given-names>TC</given-names> </name><name name-style="western"><surname>Mazzei</surname><given-names>A</given-names> </name></person-group><article-title>Improving real-time CNN-based pupil detection through domain-specific data augmentation</article-title><year>2019</year><month>06</month><day>25</day><conf-name>ETRA &#x2019;19</conf-name><conf-date>Jun 25-28, 2019</conf-date><conf-loc>Denver Colorado</conf-loc><publisher-name>ACM</publisher-name><fpage>1</fpage><lpage>6</lpage><pub-id pub-id-type="doi">10.1145/3314111.3319914</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Otero-Millan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Roberts</surname><given-names>DC</given-names> </name><name name-style="western"><surname>Lasker</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zee</surname><given-names>DS</given-names> </name><name name-style="western"><surname>Kheradmand</surname><given-names>A</given-names> </name></person-group><article-title>Knowing what the brain is seeing in three dimensions: A novel, noninvasive, sensitive, accurate, and low-noise technique for measuring ocular torsion</article-title><source>J Vis</source><year>2015</year><volume>15</volume><issue>14</issue><fpage>11</fpage><pub-id pub-id-type="doi">10.1167/15.14.11</pub-id><pub-id pub-id-type="medline">26587699</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yiu</surname><given-names>YH</given-names> </name><name name-style="western"><surname>Aboulatta</surname><given-names>M</given-names> </name><name name-style="western"><surname>Raiser</surname><given-names>T</given-names> </name><etal/></person-group><article-title>DeepVOG: Open-source pupil segmentation and gaze estimation in neuroscience using deep learning</article-title><source>J Neurosci Methods</source><year>2019</year><month>08</month><day>1</day><volume>324</volume><fpage>108307</fpage><pub-id pub-id-type="doi">10.1016/j.jneumeth.2019.05.016</pub-id><pub-id pub-id-type="medline">31176683</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> 
</name><etal/></person-group><article-title>Deep learning based torsional nystagmus detection for dizziness and vertigo diagnosis</article-title><source>Biomed Signal Process Control</source><year>2021</year><month>07</month><volume>68</volume><fpage>102616</fpage><pub-id pub-id-type="doi">10.1016/j.bspc.2021.102616</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kermany</surname><given-names>DS</given-names> </name><name name-style="western"><surname>Goldbaum</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cai</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Identifying medical diagnoses and treatable diseases by image-based deep learning</article-title><source>Cell</source><year>2018</year><month>02</month><day>22</day><volume>172</volume><issue>5</issue><fpage>1122</fpage><lpage>1131</lpage><pub-id pub-id-type="doi">10.1016/j.cell.2018.02.010</pub-id><pub-id pub-id-type="medline">29474911</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jahan</surname><given-names>I</given-names> </name><name name-style="western"><surname>Laskar</surname><given-names>MTR</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>JX</given-names> </name></person-group><article-title>A comprehensive evaluation of large language models on benchmark biomedical text processing tasks</article-title><source>Comput Biol Med</source><year>2024</year><month>03</month><volume>171</volume><fpage>108189</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108189</pub-id><pub-id pub-id-type="medline">38447502</pub-id></nlm-citation></ref><ref 
id="ref14"><label>14</label><nlm-citation citation-type="web"><article-title>GPT-4V(ision) system card</article-title><source>OpenAI</source><year>2023</year><access-date>2023-09-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/research/gpt-4v-system-card">https://openai.com/research/gpt-4v-system-card</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Noda</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ueno</surname><given-names>T</given-names> </name><name name-style="western"><surname>Koshu</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Performance of GPT-4V in answering the Japanese Otolaryngology Board Certification Examination questions: evaluation study</article-title><source>JMIR Med Educ</source><year>2024</year><month>03</month><day>28</day><volume>10</volume><fpage>e57054</fpage><pub-id pub-id-type="doi">10.2196/57054</pub-id><pub-id pub-id-type="medline">38546736</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Harsha</surname><given-names>N</given-names> </name><name name-style="western"><surname>Yin</surname><given-names>TL</given-names> </name><name name-style="western"><surname>Sheng</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Dean</surname><given-names>C</given-names> </name><name name-style="western"><surname>Richard</surname><given-names>E</given-names> </name><name name-style="western"><surname>Nicolo</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Can generalist foundation models outcompete special-purpose tuning? 
Case study in medicine</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 28, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2311.16452</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Nipun</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Sulaiman</surname><given-names>RB</given-names> </name><name name-style="western"><surname>Kareem</surname><given-names>A</given-names> </name></person-group><article-title>Efficiency comparison of AI classification algorithms for image detection and recognition in real-time</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 12, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2206.05842</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Tian</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kanade</surname><given-names>T</given-names> </name><name name-style="western"><surname>Cohn</surname><given-names>JF</given-names> </name></person-group><article-title>Dual-state parametric eye tracking</article-title><access-date>2025-04-26</access-date><conf-name>Proceedings Fourth IEEE International Conference on Automatic Face and Gesture Recognition</conf-name><conf-date>Mar 28-30, 2000</conf-date><conf-loc>Grenoble, France</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://www.cs.cmu.edu/~face/Papers/fg1camera.pdf">https://www.cs.cmu.edu/~face/Papers/fg1camera.pdf</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Padilla</surname><given-names>R</given-names> </name><name name-style="western"><surname>Costa 
Filho</surname><given-names>CF</given-names> </name><name name-style="western"><surname>Costa</surname><given-names>M</given-names> </name></person-group><article-title>Evaluation of Haar cascade classifiers designed for face detection</article-title><source>World Acad Sci Eng Technol</source><year>2012</year><volume>6</volume><issue>4</issue><fpage>466</fpage><lpage>469</lpage><pub-id pub-id-type="doi">10.5281/zenodo.1058133</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wilson</surname><given-names>PI</given-names> </name><name name-style="western"><surname>Fernandez</surname><given-names>J</given-names> </name></person-group><article-title>Facial feature detection using Haar classifiers</article-title><source>J Comput Sci Coll</source><year>2006</year><access-date>2025-04-26</access-date><volume>21</volume><fpage>127</fpage><lpage>133</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://api.semanticscholar.org/CorpusID:426703">https://api.semanticscholar.org/CorpusID:426703</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tanaka</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Nakata</surname><given-names>T</given-names> </name><name name-style="western"><surname>Aiga</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Performance of generative pretrained transformer on the National Medical Licensing Examination in Japan</article-title><source>PLOS Digit Health</source><year>2024</year><month>01</month><volume>3</volume><issue>1</issue><fpage>e0000433</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000433</pub-id><pub-id pub-id-type="medline">38261580</pub-id></nlm-citation></ref><ref 
id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Noda</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ueno</surname><given-names>T</given-names> </name><name name-style="western"><surname>Koshu</surname><given-names>R</given-names> </name><etal/></person-group><article-title>A study of the performance of the generative pretrained transformer in the Japanese Otorhinolaryngology Specialty examination</article-title><source>Nippon Jibiinkoka Tokeibugeka Gakkai Kaiho(Tokyo)</source><year>2023</year><volume>126</volume><issue>11</issue><fpage>1217</fpage><lpage>1223</lpage><pub-id pub-id-type="doi">10.3950/jibiinkotokeibu.126.11_1217</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Bsharat</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Myrzakhan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Z</given-names> </name></person-group><article-title>Principled instructions are all you need for questioning LLaMA-1/2, GPT-3.5/4</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 18, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2312.16171</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Schuurmans</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bosma</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ichter</surname><given-names>B</given-names> 
</name><name name-style="western"><surname>Xia</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title><source>arXiv</source><comment>Preprint posted online on Jan 10, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2201.11903</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lavril</surname><given-names>T</given-names> </name><name name-style="western"><surname>Izacard</surname><given-names>G</given-names> </name><name name-style="western"><surname>Martinet</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lachaux</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Lacroix</surname><given-names>T</given-names> </name><etal/></person-group><article-title>LLaMA: open and efficient foundation language models</article-title><source>arXiv</source><comment>Preprint posted online on Feb 27, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2302.13971</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><name name-style="western"><surname>Subbiah</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kaplan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dhariwal</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Language models are few-shot 
learners</article-title><source>arXiv</source><comment>Preprint posted online on Jul 22, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2005.14165</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wagle</surname><given-names>N</given-names> </name><name name-style="western"><surname>Morkos</surname><given-names>J</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><etal/></person-group><article-title>aEYE: A deep learning system for video nystagmus detection</article-title><source>Front Neurol</source><year>2022</year><volume>13</volume><fpage>963968</fpage><pub-id pub-id-type="doi">10.3389/fneur.2022.963968</pub-id><pub-id pub-id-type="medline">36034311</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lim</surname><given-names>EC</given-names> </name><name name-style="western"><surname>Park</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Jeon</surname><given-names>HJ</given-names> </name><etal/></person-group><article-title>Developing a diagnostic decision support system for benign paroxysmal positional vertigo using a deep-learning model</article-title><source>J Clin Med</source><year>2019</year><month>05</month><day>8</day><volume>8</volume><issue>5</issue><fpage>633</fpage><pub-id pub-id-type="doi">10.3390/jcm8050633</pub-id><pub-id pub-id-type="medline">31072056</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Han</surname><given-names>J</given-names> </name><name name-style="western"><surname>Seo</surname><given-names>YJ</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>S</given-names> </name></person-group><article-title>A nystagmus extraction system using artificial intelligence for video-nystagmography</article-title><source>Sci Rep</source><year>2023</year><volume>13</volume><issue>1</issue><fpage>11975</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-39104-7</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rao</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kamineni</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Evaluating GPT as an adjunct for radiologic decision making: GPT-4 versus GPT-3.5 in a breast imaging pilot</article-title><source>J Am Coll Radiol</source><year>2023</year><month>10</month><volume>20</volume><issue>10</issue><fpage>990</fpage><lpage>997</lpage><pub-id pub-id-type="doi">10.1016/j.jacr.2023.05.003</pub-id><pub-id pub-id-type="medline">37356806</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chiarelli</surname><given-names>G</given-names> </name><name name-style="western"><surname>Stephens</surname><given-names>A</given-names> </name><name name-style="western"><surname>Finati</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Adequacy of prostate cancer prevention and screening recommendations provided by an artificial intelligence-powered large language model</article-title><source>Int Urol 
Nephrol</source><year>2024</year><month>08</month><volume>56</volume><issue>8</issue><fpage>2589</fpage><lpage>2595</lpage><pub-id pub-id-type="doi">10.1007/s11255-024-04009-5</pub-id><pub-id pub-id-type="medline">38564079</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kaplan</surname><given-names>J</given-names> </name><name name-style="western"><surname>McCandlish</surname><given-names>S</given-names> </name><name name-style="western"><surname>Henighan</surname><given-names>T</given-names> </name><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Chess</surname><given-names>B</given-names> </name><name name-style="western"><surname>Child</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Scaling laws for neural language models</article-title><source>arXiv</source><access-date>2025-04-26</access-date><comment>Preprint posted online on Jan 23, 2020</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/2001.08361">https://arxiv.org/pdf/2001.08361</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bozomitu</surname><given-names>RG</given-names> </name><name name-style="western"><surname>P&#x0103;s&#x0103;ric&#x0103;</surname><given-names>A</given-names> </name><name name-style="western"><surname>T&#x0103;rniceriu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Rotariu</surname><given-names>C</given-names> </name></person-group><article-title>Development of an eye tracking-based human-computer interface for real-time applications</article-title><source>Sensors 
(Basel)</source><year>2019</year><month>08</month><day>20</day><volume>19</volume><issue>16</issue><fpage>3630</fpage><pub-id pub-id-type="doi">10.3390/s19163630</pub-id><pub-id pub-id-type="medline">31434358</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cristina</surname><given-names>S</given-names> </name><name name-style="western"><surname>Camilleri</surname><given-names>KP</given-names> </name></person-group><article-title>Unobtrusive and pervasive video-based eye-gaze tracking</article-title><source>Image Vis Comput</source><year>2018</year><month>06</month><volume>74</volume><fpage>21</fpage><lpage>40</lpage><pub-id pub-id-type="doi">10.1016/j.imavis.2018.04.002</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Williams</surname><given-names>CYK</given-names> </name><name name-style="western"><surname>Bains</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Evaluating large language models for drafting emergency department discharge summaries</article-title><source>medRxiv</source><year>2024</year><month>04</month><day>4</day><fpage>2024.04.03.24305088</fpage><pub-id pub-id-type="doi">10.1101/2024.04.03.24305088</pub-id><pub-id pub-id-type="medline">38633805</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fournier-Tombs</surname><given-names>E</given-names> </name><name name-style="western"><surname>McHardy</surname><given-names>J</given-names> </name></person-group><article-title>A medical ethics framework for conversational artificial 
intelligence</article-title><source>J Med Internet Res</source><year>2023</year><month>07</month><day>26</day><volume>25</volume><fpage>e43068</fpage><pub-id pub-id-type="doi">10.2196/43068</pub-id><pub-id pub-id-type="medline">37224277</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Sample of prompt.</p><media xlink:href="formative_v9i1e70070_app1.docx" xlink:title="DOCX File, 191 KB"/></supplementary-material></app-group></back></article>