<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e76896</article-id><article-id pub-id-type="doi">10.2196/76896</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Evaluating Locally Run Large Language Models (Gemma 2, Mistral Nemo, and Llama 3) for Outpatient Otorhinolaryngology Care: Retrospective Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Buhr</surname><given-names>Christoph Raphael</given-names></name><degrees>MSc, MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Seifen</surname><given-names>Christopher</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bahr-Hamm</surname><given-names>Katharina</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Huppertz</surname><given-names>Tilman</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Pordzik</surname><given-names>Johannes</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Smith</surname><given-names>Harry</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kelsey</surname><given-names>Tom</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Blaikie</surname><given-names>Andrew</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Matthias</surname><given-names>Christoph</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kuhn</surname><given-names>Sebastian</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Eckrich</surname><given-names>Jonas</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Otorhinolaryngology, University Medical Center of the Johannes Gutenberg-University Mainz</institution><addr-line>Langenbeckstra&#x00DF;e 1</addr-line><addr-line>Mainz</addr-line><country>Germany</country></aff><aff id="aff2"><institution>School of Medicine, University of St Andrews</institution><addr-line>St Andrews</addr-line><country>United Kingdom</country></aff><aff id="aff3"><institution>School of Computer 
Science, University of St Andrews</institution><addr-line>St Andrews</addr-line><country>United Kingdom</country></aff><aff id="aff4"><institution>Institute for Digital Medicine, Philipps University Marburg, University Hospital Giessen and Marburg</institution><addr-line>Marburg</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Lotfinia</surname><given-names>Mahshad</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Christoph Raphael Buhr, MSc, MD, Department of Otorhinolaryngology, University Medical Center of the Johannes Gutenberg-University Mainz, Langenbeckstra&#x00DF;e 1, Mainz, 55131, Germany, +49 6131 17 7362; <email>buhrchri@uni-mainz.de</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>25</day><month>11</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e76896</elocation-id><history><date date-type="received"><day>03</day><month>05</month><year>2025</year></date><date date-type="rev-recd"><day>08</day><month>10</month><year>2025</year></date><date date-type="accepted"><day>09</day><month>10</month><year>2025</year></date></history><copyright-statement>&#x00A9; Christoph Raphael Buhr, Christopher Seifen, Katharina Bahr-Hamm, Tilman Huppertz, Johannes Pordzik, Harry Smith, Tom Kelsey, Andrew Blaikie, Christoph Matthias, Sebastian Kuhn, Jonas Eckrich. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 25.11.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e76896"/><abstract><sec><title>Background</title><p>Large language models (LLMs) have great potential to improve and make the work of clinicians more efficient. Previous studies have mainly focused on web-based services, such as ChatGPT, often with simulated cases. For the processing of personalized patient data, web-based services have major data protection concerns. Ensuring compliance with data protection and medical device regulations therefore remains a critical challenge for adopting LLMs in clinical settings.</p></sec><sec><title>Objective</title><p>This retrospective single-center study aimed to evaluate locally run LLMs (Gemma 2, Mistral Nemo, and Llama 3) in providing diagnosis and treatment recommendation for real-world outpatient cases in otorhinolaryngology (ORL).</p></sec><sec sec-type="methods"><title>Methods</title><p>Outpatient cases (n=30) from regular consultation hours and the emergency service at a university hospital ORL outpatient department were randomly selected. 
Documentation by ORL doctors, including anamnesis and examination results, was passed to the locally run LLMs (Gemma 2, Mistral Nemo, and Llama 3), which were asked to provide diagnostic and treatment strategies. Recommendations of the LLMs and the treating ORL doctors were rated by 3 experienced ORL consultants on a 6-point Likert scale for medical adequacy, conciseness, coherence, and comprehensibility. Moreover, consultants were asked whether the answers pose a risk to the patient&#x2019;s safety. A modified Turing test was performed to distinguish responses generated by LLMs from those of doctors. Finally, the potential influence of the information generated by the LLMs on the raters&#x2019; own diagnosis and treatment opinions was evaluated.</p></sec><sec sec-type="results"><title>Results</title><p>Over all categories, ORL doctors achieved superior (<italic>P</italic>&#x003C;.0005) ratings compared to locally run LLMs (Llama 3, Mistral Nemo, and Gemma 2). ORL doctors&#x2019; responses were considered hazardous for patients in only 1% of the ratings, whereas recommendations by Llama 3, Gemma 2, and Mistral Nemo were considered hazardous in 54%, 47%, and 32% of cases, respectively. According to the raters, the LLM&#x2019;s information rarely influenced their judgment, with Mistral Nemo, Gemma 2, and Llama 3 achieving 1%, 3%, and 4% of the ratings, respectively.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Although locally run LLM models still underperform compared with their web-based counterparts, they achieved respectable results on outpatient treatment in this study. Nevertheless, the retrospective and single-center nature of the study, along with the clinicians&#x2019; documentation style, may have introduced bias in favor of human recommendations. In the future, locally run LLMs will help address data protection concerns; however, further refinement and prospective validation are still needed to meet strict medical device requirements. 
As locally run LLMs continue to evolve, they are likely to become comparably powerful to web-based LLMs and become established as useful tools to support doctors in clinical practice.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>artificial intelligence</kwd><kwd>otorhinolaryngology</kwd><kwd>digital health</kwd><kwd>chatbot</kwd><kwd>global health</kwd><kwd>low- and middle-income countries</kwd><kwd>telemedicine</kwd><kwd>telehealth</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Potential of Large Language Models in Medicine</title><p>The introduction of new large language models (LLMs), such as ChatGPT (OpenAI, California, USA), has disrupted the traditional perceptions of artificial intelligence (AI). Rather than requiring extensive coding skills to achieve a task, LLMs understand natural human language input, making their capabilities accessible to much broader tasks. Instead of being trained for a specific purpose, these models are &#x2018;all-rounders&#x2019; capable of accomplishing a wide range of different tasks, including medical queries [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. When required, further fine-tuning can improve their performance for certain fields of action while achieving higher cost-efficiency than training a whole new model for a specific purpose. These characteristics are essential for medical applications. First, medical data such as anamnesis and examination findings are usually recorded in a semistructured manner in human language. Second, the broadly spread competence of LLMs is helpful for understanding the complex and interlinked issues patients present to health systems. 
Third, with regard to increasing economic pressure in the health care system, LLM support might present a promising solution to increase efficacy, improve outcomes, and reduce costs.</p></sec><sec id="s1-2"><title>LLMs in Otorhinolaryngology</title><p>The application of LLMs in otorhinolaryngology (ORL) is the subject of current research. Suggested areas of application include research and clinical use. Clinical uses range from patient education to improving electronic medical records, triage, patient classification, clinical education, and decision support [<xref ref-type="bibr" rid="ref3">3</xref>]. More specifically, recent studies have evaluated LLMs in answering patients&#x2019; questions [<xref ref-type="bibr" rid="ref4">4</xref>] over the analysis of polysomnographic results [<xref ref-type="bibr" rid="ref5">5</xref>] to tumor board augmentation [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. Nevertheless, the most frequent touch points between ORL doctors and patients occur in the outpatient care sector.</p><p>Thus, this area of application offers one of the most relevant instances where support of ORL doctors by LLMs may be useful. Previous studies in this area highlighted the promising performance of LLMs, while stating the overall superiority of ORL consultants. A study from 2023 compared LLM recommendations on case-based questions with those of experienced ORL consultants. In this case, the LLMs received high ratings in semantic categories and promising ratings on medical adequacy. Furthermore, the study revealed a significant improvement in medical adequacy between 2 tested versions (ie, ChatGPT 3 and ChatGPT 4) [<xref ref-type="bibr" rid="ref8">8</xref>]. A further, similarly designed study assessed differences between various LLMs. 
Here, Claude 2 (Anthropic, California, USA) and ChatGPT 4 achieved the highest ratings on medical adequacy, whereas answers from ChatGPT 4 proved to be the most secure for the patients [<xref ref-type="bibr" rid="ref9">9</xref>]. While both studies cited earlier used rated categories as performance metrics (ie, medical adequacy, coherence, comprehensibility, and conciseness, each rated on a Likert scale), another study primarily evaluated agreement between physicians&#x2019; recommendations and those of the LLMs. In this study, the agreement of physicians and ChatGPT on physician-written clinical vignettes in otolaryngology was rated using a 5-point Likert scale. The authors showed high agreement of ChatGPT and the physicians on differential diagnosis and treatment plans, with no association between vignette difficulty and agreement with differential diagnosis or treatment [<xref ref-type="bibr" rid="ref10">10</xref>].</p></sec><sec id="s1-3"><title>Conquering Data Protection Challenges: The Potential of Locally Run LLMs</title><p>Despite the promising findings these studies offer, they often use highly structured (and simulated) input data, mainly focusing on web-based LLMs, such as ChatGPT. This approach has obvious data protection limitations when it comes to real-world implementation. Data protection requirements are the key constraint when using web-based LLMs, as sharing highly sensitive patient data with a third party raises confidentiality concerns. While the General Data Protection Regulation [<xref ref-type="bibr" rid="ref11">11</xref>] regulates secure data processing in the European Union, the Health Insurance Portability and Accountability Act [<xref ref-type="bibr" rid="ref12">12</xref>] is of utmost importance in the United States. Unlike web-based LLMs, locally operated LLMs can mitigate data protection concerns by avoiding external data transfers. 
To date, locally run LLMs have received little attention for use in medical queries, especially for outpatient treatment, despite the potential benefits they could provide. Thus, this study evaluates different locally run open-access LLMs (ie, Llama 3 (Meta, California, USA), Mistral Nemo (Mistral AI, Paris, France), and Gemma 2 (Google, California, USA)) in providing diagnosis and therapy recommendation for real outpatient cases with no data sharing and confidentiality concerns.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design and Patient Case Selection</title><p>The workflow of the study is visualized in <xref ref-type="fig" rid="figure1">Figure 1</xref>. Thirty outpatient cases from the archive of our regular consultation hours and the emergency service at the university hospital ORL outpatient office were retrieved for the study. Cases were selected at random from attendance lists. The selection was designed to be as broad as possible, and duplication was avoided as the frequency of different diseases differs. In case of duplication (same disease as one in the previous case), a new case was selected. 
The retrieved cases involved the following diagnoses: chronic sinusitis and frontal osteoma, tinnitus and conductive hearing loss on both sides, nasal polyp on the left side, deviated septum, conchal hyperplasia, functional voice disorder, dysosmia, recurrent chronic sinusitis, chronic sinusitis, median neck cyst, vocal cord/fold polyp, chronic otitis media, tympanic effusions on both sides, suspected adenoid vegetations, nasal bone fracture, nasal bridge laceration, severe sensorineural hearing loss, vallecula cyst, tumor in the parotid gland, chronic mesotympanic otitis media, lymphadenopathy, benign paroxysmal positional vertigo and vestibulopathy, tonsillar carcinoma, benign paroxysmal positional vertigo, acute bilateral tonsillitis, otitis externa, peritonsillar abscess, foreign body esophagus, foreign body ear, epistaxis after septoplasty, obstructive ear wax, acute unilateral vestibulopathy, and otitis media.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Workflow of the study. LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e76896_fig01.png"/></fig></sec><sec id="s2-2"><title>Data Preparation and Prompting</title><p>The selected cases were saved in a text document (Word; Microsoft, Redmond, Washington, USA). The entry also included the patient&#x2019;s gender and year of birth. Generally, medical documentation of the ORL doctors in our clinic is subdivided into anamnesis, examination findings, diagnosis, and treatment recommendation. Therefore, the diagnoses and treatment recommendation section was removed before each patient case was passed to the locally run LLMs (ie, Llama 3, Mistral Nemo, and Gemma 2). All cases were passed to the different locally run LLMs using the basic prompt shown in <xref ref-type="fig" rid="figure2">Figure 2</xref> in the German language. 
The prompt refers to ORL doctors&#x2019; documentation style in our clinic and explicitly requests the diagnosis and treatment recommendation, limiting the answer to 100 words. The prompt has been kept deliberately simple to test the LLM&#x2019;s baseline performance. The instructions provided aimed to ensure that the response resembles the doctors&#x2019; documentation style.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Prompt used within this study.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e76896_fig02.png"/></fig></sec><sec id="s2-3"><title>LLM Execution</title><p>The LLMs were run locally on a standard Hewlett-Packard (Palo Alto, California, USA) Notebook (Intel Core i7&#x2010;1255U, 4.7 GHz, DDR4, 16 GB [2&#x00D7;8 GB], Windows 10 Pro, LM Studio 0.3.5) using the prompt shown in <xref ref-type="fig" rid="figure2">Figure 2</xref> as stated earlier.</p><p>The selection of LLMs was guided by their open-source availability and model size. Consequently, only models below 10 GB were included. The default settings of LLM Studio were not modified, and all model configurations&#x2014;including parameter sizes or variants, quantization settings, prompt templates and system prompts, as well as temperature, top-p, maximum token limits, and random seeds&#x2014;were kept unchanged. 
The specific models evaluated in this study are as follows: Meta-Llama-3-8B-Instruct-GGUF (GPU Offload 32/32; CPU Thread Pool Size 4, Evaluation Batch Size 512, Context Length 4096, Random Seed), Gemma-2-9b-it-GGUF (GPU Offload 37/42; CPU Thread Pool Size 4, Evaluation Batch Size 512, Context Length 4096, Random Seed), and Mistral-Nemo-Instruct-2407-GGUF (GPU Offload 27/40; CPU Thread Pool Size 4, Evaluation Batch Size 512, Context Length 4096, Random Seed) (all published by lmstudio-community).</p></sec><sec id="s2-4"><title>Evaluation and Rating</title><p>To evaluate the results, all cases were collected in a single text document. In particular, the doctor&#x2019;s documentation (eg, medical history and examination findings) was presented first for each case. Subsequently, the diagnoses and treatment recommendations of the 3 LLMs and the treating ORL doctor were presented in random order for the specific case. The evaluation criteria stated below were filled in under each answer (Likert scale and binary rating) by each rater.</p><p>The recommendation of the LLMs and the original documentation of the treating ORL doctors were rated by 3 experienced ORL consultants on a 6-point Likert scale (1=very poor and 6=excellent) for medical adequacy, conciseness, coherence, and comprehensibility.</p><p>The key assessment metrics were defined as follows:</p><list list-type="bullet"><list-item><p><italic>Medical adequacy</italic> refers to the accuracy and appropriateness of medical content relative to established clinical guidelines and expert consensus.</p></list-item><list-item><p><italic>Coherence</italic> refers to the logical flow and consistency of the information presented. 
This was assessed by reviewers based on the clarity of connections between ideas and the absence of contradictory statements.</p></list-item><list-item><p><italic>Comprehensibility</italic> measures the ease with which the information can be understood.</p></list-item><list-item><p><italic>Conciseness</italic> evaluates whether the information is sufficiently succinct without omitting critical details.</p></list-item></list><p>Furthermore, <italic>hazardous for patients</italic> was defined as the presence of information that could directly or indirectly lead to patient harm if followed. This includes advice that contradicts established clinical guidelines, promotes unsafe practices, misrepresents risks or benefits, or could result in delayed diagnosis, inappropriate treatment, or adverse outcomes.</p><p>Moreover, ORL consultants were asked (binary rating) whether the answers pose a risk to the patient&#x2019;s safety, whether the response originates from an LLM or a doctor (modified Turing test) [<xref ref-type="bibr" rid="ref13">13</xref>], and whether their own diagnosis and treatment opinion would be influenced by the respective answer. In total, for each included case, the 3 experienced ORL consultants rated the response (diagnosis and treatment recommendation) of the treating ORL doctor and every tested LLM.</p></sec><sec id="s2-5"><title>Benchmarking Against Web-Based LLMs</title><p>Web-based LLMs have been previously evaluated as mentioned earlier [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>] and have shown sufficient capabilities in evaluating case-based questions. To benchmark the locally run LLMs with an established web-based LLM, 10 simulated cases were comparatively evaluated following the same workflow during the course of the study. 
Accordingly, the 10 simulated cases were processed by an ORL doctor and submitted to the 3 locally run LLMs (ie, Llama 3, Mistral Nemo, and Gemma 2), as well as a web-based LLM (ie, ChatGPT-4o).</p></sec><sec id="s2-6"><title>Statistical Analysis</title><p>All responses from the raters were transferred to a Microsoft Excel spreadsheet and sorted according to the entity being evaluated. Statistical analysis was performed with GraphPad Prism for macOS (version 10.3.0; GraphPad Software, La Jolla, CA, USA). The data did not meet normality assumptions, as confirmed by the D&#x2019;Agostino and Pearson test.</p><p>Further statistical analysis was performed using Python (version 3.11.13) in the Google Colab environment. The data were imported from Excel files. Packages used include pandas (data processing), statsmodels, and pingouin.</p><p>For each rating category (ie, medical adequacy, conciseness, coherence, and comprehensibility), a linear mixed effects model was fitted, with entity as a fixed effect and case and rater as random intercepts to account for clustering within cases and raters. Furthermore, interrater reliability was examined using evaluations of the same responses by multiple raters.</p><p>The following measures were used to calculate reliability:</p><list list-type="bullet"><list-item><p>Fleiss &#x03BA;, a measure of agreement in categorical evaluations, was determined using the fleiss_kappa function from the statsmodels package.</p></list-item><list-item><p>The percentage agreement was calculated as the proportion of cases in which all raters gave exactly the same rating.</p></list-item><list-item><p>The intraclass correlation coefficient (ICC) for quantifying agreement in ordinal ratings was calculated using the intraclass_corr function from the pingouin package. 
For this purpose, the data were transformed into a long format containing target objects, raters, and ratings.</p></list-item></list></sec><sec id="s2-7"><title>Ethical Considerations</title><p>Ethical approval was obtained from the ethics committee of the state medical association (request/approval number: 2023&#x2010;17385-retrospektiv). Informed consent was not required as this was a retrospective study. The data were processed anonymously to ensure privacy, and compensation was not necessary due to the retrospective nature of the study.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Rating Results</title><p>ORL doctors showed significantly (<italic>P</italic>&#x003C;.05) higher ratings compared to the locally run LLMs (ie, Llama 3, Mistral Nemo, and Gemma 2) over all categories (<xref ref-type="fig" rid="figure3">Figure 3</xref>). Altogether, the doctors achieved (median 6, interquartile range [IQR] 5&#x2010;6) the highest rating in every category. On medical adequacy, Gemma 2 and Mistral Nemo received similar ratings (Mistral Nemo: median 4, IQR 3-5; and Gemma: median 4, IQR 3-4), whereas Llama 3 performed worse, achieving a median of 3 (IQR 3-4). For conciseness, all locally run LLMs tested received similar ratings (Gemma 2 and Mistral Nemo: median 4, IQR 4-5; and Llama 3: median 4, IQR 3-4). Mistral Nemo (median 5, IQR 4-5) outperformed Gemma 2 and Llama 3 on ratings for coherence and comprehensibility. Both Gemma 2 and Llama 3 received similar ratings in these categories. Detailed rating results are presented in <xref ref-type="table" rid="table1">Table 1</xref>. Details of the linear mixed effects model for predicting ratings are presented in <xref ref-type="table" rid="table2">Table 2</xref>, and <italic>P</italic> values for entity coefficients were all &#x003C;.001, indicating a significant difference between ORL doctors and LLMs. 
Boxplots of the ratings categorized for each model can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> .</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Rating for locally run LLMs and ORL consultants on providing diagnosis and treatment strategy (30 real-world cases) regarding (A) medical adequacy, (B) conciseness, (C) coherence, and (D) comprehensibility rated on a 6-point Likert scale. Normality distribution was tested with the D&#x2019;Agostino and Pearson test. Comparisons were performed using a linear mixed-effects model. Nonsignificant &#x003E;0.05, *<italic>P</italic>&#x003C;.05, **<italic>P</italic>&#x003C;.005, ***<italic>P</italic>&#x003C;.001. LLM: large language models; ORL: otorhinolaryngology.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e76896_fig03.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Results of the rating by ORL<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> consultants.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Values, mean (SD)</td><td align="left" valign="bottom">Rating, median (IQR<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>)</td><td align="left" valign="bottom">95% CI of median</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4"><bold>Medical adequacy</bold></td></tr><tr><td align="left" valign="top">&#x2003;ORL doctors</td><td align="left" valign="top">5.6 (0.6)</td><td align="left" valign="top">6 (5-6)</td><td align="left" valign="top">6-6</td></tr><tr><td align="left" valign="top">&#x2003;Gemma 2</td><td align="left" valign="top">3.5 (1.2)</td><td align="left" valign="top">4 (3-4)</td><td align="left" valign="top">3-4</td></tr><tr><td align="left" valign="top">&#x2003;Mistral Nemo</td><td align="left" 
valign="top">4 (1.2)</td><td align="left" valign="top">4 (3-5)</td><td align="left" valign="top">4-4</td></tr><tr><td align="left" valign="top">&#x2003;Llama 3</td><td align="left" valign="top">3.4 (1.2)</td><td align="left" valign="top">3 (3-4)</td><td align="left" valign="top">3-4</td></tr><tr><td align="left" valign="top" colspan="4"><bold>Conciseness</bold></td></tr><tr><td align="left" valign="top">&#x2003;ORL doctors</td><td align="left" valign="top">5.7 (0.5)</td><td align="left" valign="top">6 (5-6)</td><td align="left" valign="top">6-6</td></tr><tr><td align="left" valign="top">&#x2003;Gemma 2</td><td align="left" valign="top">4.2 (0.9)</td><td align="left" valign="top">4 (4-5)</td><td align="left" valign="top">4-4</td></tr><tr><td align="left" valign="top">&#x2003;Mistral Nemo</td><td align="left" valign="top">4.3 (1)</td><td align="left" valign="top">4 (4-5)</td><td align="left" valign="top">4-5</td></tr><tr><td align="left" valign="top">&#x2003;Llama 3</td><td align="left" valign="top">3.8 (0.8)</td><td align="left" valign="top">4 (3-4)</td><td align="left" valign="top">4-4</td></tr><tr><td align="left" valign="top" colspan="4"><bold>Coherence</bold></td></tr><tr><td align="left" valign="top">&#x2003;ORL doctors</td><td align="left" valign="top">5.7 (0.6)</td><td align="left" valign="top">6 (5-6)</td><td align="left" valign="top">6-6</td></tr><tr><td align="left" valign="top">&#x2003;Gemma 2</td><td align="left" valign="top">4.2 (1.1)</td><td align="left" valign="top">4 (3-5)</td><td align="left" valign="top">4-5</td></tr><tr><td align="left" valign="top">&#x2003;Mistral Nemo</td><td align="left" valign="top">4.4 (1.2)</td><td align="left" valign="top">5 (4-5)</td><td align="left" valign="top">5-5</td></tr><tr><td align="left" valign="top">&#x2003;Llama 3</td><td align="left" valign="top">3.8 (1.1)</td><td align="left" valign="top">4 (3-5)</td><td align="left" valign="top">4-4</td></tr><tr><td align="left" valign="top" 
colspan="4"><bold>Comprehensibility</bold></td></tr><tr><td align="left" valign="top">&#x2003;ORL doctors</td><td align="left" valign="top">5.7 (0.5)</td><td align="left" valign="top">6 (6)</td><td align="left" valign="top">6-6</td></tr><tr><td align="left" valign="top">&#x2003;Gemma 2</td><td align="left" valign="top">4.3 (1)</td><td align="left" valign="top">4 (4-5)</td><td align="left" valign="top">4-5</td></tr><tr><td align="left" valign="top">&#x2003;Mistral Nemo</td><td align="left" valign="top">4.5 (1.1)</td><td align="left" valign="top">5 (4-5)</td><td align="left" valign="top">4-5</td></tr><tr><td align="left" valign="top">&#x2003;Llama 3</td><td align="left" valign="top">3.9 (1)</td><td align="left" valign="top">4 (3-5)</td><td align="left" valign="top">4-4</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>ORL: otorhinolaryngology.</p></fn><fn id="table1fn2"><p><sup>b</sup>IQR: interquartile range.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Linear mixed effects model for ratings.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Estimate</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom">ci_lower</td><td align="left" valign="bottom">ci_upper</td></tr></thead><tbody><tr><td align="left" valign="top"><bold>Medical adequacy</bold></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Intercept</td><td align="left" valign="top">5.633</td><td align="left" valign="top">0</td><td align="left" valign="top">5.370</td><td align="left" valign="top">5.896</td></tr><tr><td align="left" valign="top">&#x2003;entity[T.Gemma 2]</td><td align="left" 
valign="top">&#x2212;2.1</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">&#x2212;2.385</td><td align="left" valign="top">&#x2212;1.814</td></tr><tr><td align="left" valign="top">&#x2003;entity[T.Mistral Nemo]</td><td align="left" valign="top">&#x2212;1.633</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">&#x2212;1.918</td><td align="left" valign="top">&#x2212;1.348</td></tr><tr><td align="left" valign="top">&#x2003;entity[T.Llama 3]</td><td align="left" valign="top">&#x2212;2.278</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">&#x2212;2.563</td><td align="left" valign="top">&#x2212;1.993</td></tr><tr><td align="left" valign="top">&#x2003;Group Var</td><td align="left" valign="top">0.223</td><td align="left" valign="top">.01</td><td align="left" valign="top">0.045</td><td align="left" valign="top">0.402</td></tr><tr><td align="left" valign="top">&#x2003;rater Var</td><td align="left" valign="top">0.033</td><td align="left" valign="top">.517</td><td align="left" valign="top">&#x2212;0.067</td><td align="left" valign="top">0.134</td></tr><tr><td align="left" valign="top"><bold>Conciseness</bold></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Intercept</td><td align="left" valign="top">5.711</td><td align="left" valign="top">0</td><td align="left" valign="top">5.524</td><td align="left" valign="top">5.899</td></tr><tr><td align="left" valign="top">&#x2003;entity[T.Gemma 2]</td><td align="left" valign="top">&#x2212;1.556</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">&#x2212;1.773</td><td align="left" valign="top">&#x2212;1.338</td></tr><tr><td align="left" valign="top">&#x2003;entity [T.Mistral Nemo]</td><td align="left" valign="top">&#x2212;1.456</td><td align="left" valign="top">&#x003C;.001</td><td align="left" 
valign="top">&#x2212;1.673</td><td align="left" valign="top">&#x2212;1.238</td></tr><tr><td align="left" valign="top">&#x2003;entity[T.Llama 3]</td><td align="left" valign="top">&#x2212;1.922</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">&#x2212;2.14</td><td align="left" valign="top">&#x2212;1.705</td></tr><tr><td align="left" valign="top">&#x2003;Group Var</td><td align="left" valign="top">0.106</td><td align="left" valign="top">.127</td><td align="left" valign="top">&#x2212;0.03</td><td align="left" valign="top">0.242</td></tr><tr><td align="left" valign="top">&#x2003;rater Var</td><td align="left" valign="top">0.164</td><td align="left" valign="top">.05</td><td align="left" valign="top">&#x003C;0.001</td><td align="left" valign="top">0.329</td></tr><tr><td align="left" valign="top"><bold>Coherence</bold></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Intercept</td><td align="left" valign="top">5.677</td><td align="left" valign="top">0</td><td align="left" valign="top">5.469</td><td align="left" valign="top">5.886</td></tr><tr><td align="left" valign="top">&#x2003;entity[T.Gemma 2]</td><td align="left" valign="top">&#x2212;1.5</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">&#x2212;1.768</td><td align="left" valign="top">&#x2212;1.232</td></tr><tr><td align="left" valign="top">&#x2003;entity[T.Mistral Nemo]</td><td align="left" valign="top">&#x2212;1.244</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">&#x2212;1.512</td><td align="left" valign="top">&#x2212;0.976</td></tr><tr><td align="left" valign="top">&#x2003;entity[T.Llama 3]</td><td align="left" valign="top">&#x2212;1.844</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">&#x2212;2.112</td><td align="left" valign="top">&#x2212;1.576</td></tr><tr><td 
align="left" valign="top">&#x2003;Group Var</td><td align="left" valign="top">0.014</td><td align="left" valign="top">.716</td><td align="left" valign="top">&#x2212;0.064</td><td align="left" valign="top">0.092</td></tr><tr><td align="left" valign="top">&#x2003;rater Var</td><td align="left" valign="top">0.166</td><td align="left" valign="top">.08</td><td align="left" valign="top">&#x2212;0.021</td><td align="left" valign="top">0.352</td></tr><tr><td align="left" valign="top"><bold>Comprehensibility</bold></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Intercept</td><td align="left" valign="top">5.744</td><td align="left" valign="top">0</td><td align="left" valign="top">5.526</td><td align="left" valign="top">5.963</td></tr><tr><td align="left" valign="top">&#x2003;entity[T.Gemma 2]</td><td align="left" valign="top">&#x2212;1.422</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">&#x2212;1.670</td><td align="left" valign="top">&#x2212;1.174</td></tr><tr><td align="left" valign="top">&#x2003;entity[T.Mistral Nemo]</td><td align="left" valign="top">&#x2212;1.267</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">&#x2212;1.515</td><td align="left" valign="top">&#x2212;1.019</td></tr><tr><td align="left" valign="top">&#x2003;entity[T.Llama 3]</td><td align="left" valign="top">&#x2212;1.856</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">&#x2212;2.104</td><td align="left" valign="top">&#x2212;1.607</td></tr><tr><td align="left" valign="top">&#x2003;Group Var</td><td align="left" valign="top">0.121</td><td align="left" valign="top">.109</td><td align="left" valign="top">&#x2212;0.027</td><td align="left" valign="top">0.268</td></tr><tr><td align="left" valign="top">&#x2003;rater Var</td><td align="left" valign="top">0.183</td><td align="left" 
valign="top">.036</td><td align="left" valign="top">0.012</td><td align="left" valign="top">0.355</td></tr></tbody></table></table-wrap></sec><sec id="s3-2"><title>Modified Turing Test</title><p>Regarding the modified Turing test, Gemma 2 was recognized as a machine in 99% (89/90), Mistral Nemo in 98% (88/90), and Llama 3 in 97% (87/90) of the ratings. Conversely, the ORL doctors were recognized as a human being in 99% (89/90) of the ratings.</p></sec><sec id="s3-3"><title>Assessment of Patient Safety</title><p>While the raters considered the ORL doctors&#x2019; recommendations as potentially unsafe for the patient in only 1% (1/90) of cases, Llama 3&#x2019;s recommendations were considered potentially hazardous in 54% (49/90), Gemma 2&#x2019;s in 47% (42/90), and Mistral Nemo&#x2019;s in 32% (29/90) of cases.</p></sec><sec id="s3-4"><title>Influence on Raters&#x2019; Own Decision-Making</title><p>Whereas the ORL consultants stated that the information provided by the ORL doctors might have influenced their own decision in 2% (2/90), Mistral Nemo was considered influential in 1% (1/90), Gemma 2 in 3% (3/90), and Llama 3 in 4% (4/90) of cases for this assessment.</p></sec><sec id="s3-5"><title>Number of Words</title><p>The number of words used by the ORL doctors and the locally run LLMs is visualized in <xref ref-type="fig" rid="figure4">Figure 4</xref>. 
Llama 3 (median 99, range 19&#x2010;191, IQR 83&#x2010;124) spent the most words on providing a diagnosis and treatment recommendations, followed by Gemma 2 (median 53, range 33&#x2010;83, IQR 45&#x2010;68) and Mistral Nemo (median 53, range 13&#x2010;127, IQR 35&#x2010;68) and the ORL doctors (median 20, range 3&#x2010;53, IQR 13&#x2010;37).</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Number of words used by ORL doctors and the locally run LLMs (ie, Gemma 2, Mistral Nemo, and Llama 3) on providing diagnosis and treatment recommendations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e76896_fig04.png"/></fig></sec><sec id="s3-6"><title>Interrater Reliability</title><p>The interrater reliability analysis using Fleiss &#x03BA; and percentage match is presented in <xref ref-type="table" rid="table3">Table 3</xref>. A further interrater reliability analysis applying the ICC is illustrated in <xref ref-type="table" rid="table3">Table 3</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Overall interrater reliability: Fleiss &#x03BA; and percentage match.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category</td><td align="left" valign="bottom">Fleiss &#x03BA;<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup><break/>Rater 1&#x2010;3<break/>170 responses evaluated</td><td align="left" valign="bottom">Percentage of homolog ratings between all raters<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>:<break/><inline-formula><mml:math id="ieqn1"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mfrac><mml:mrow><mml:mrow><mml:mi mathvariant="normal">n</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">u</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">m</mml:mi></mml:mrow><mml:mrow><mml:mi 
mathvariant="normal">b</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">e</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">r</mml:mi></mml:mrow><mml:mrow><mml:mtext>&#x00A0;</mml:mtext></mml:mrow><mml:mrow><mml:mi mathvariant="normal">o</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">f</mml:mi></mml:mrow><mml:mrow><mml:mtext>&#x00A0;</mml:mtext></mml:mrow><mml:mrow><mml:mi mathvariant="normal">h</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">o</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">m</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">o</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">l</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">o</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">g</mml:mi></mml:mrow><mml:mrow><mml:mtext>&#x00A0;</mml:mtext></mml:mrow><mml:mrow><mml:mi mathvariant="normal">r</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">a</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">t</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">n</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">g</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">s</mml:mi></mml:mrow><mml:mrow><mml:mtext>&#x00A0;</mml:mtext></mml:mrow><mml:mrow><mml:mi mathvariant="normal">b</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">e</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">t</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">w</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">e</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">e</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">n</mml:mi></mml:mrow><mml:mrow><mml:mtext>&#x00A0;</mml:mtext></mml:mrow><mml:mrow><mml:mi mathvariant="normal">a</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">l</mml:mi></mml:mrow><mml:mrow><mml:mi 
mathvariant="normal">l</mml:mi></mml:mrow><mml:mrow><mml:mtext>&#x00A0;</mml:mtext></mml:mrow><mml:mrow><mml:mi mathvariant="normal">r</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">a</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">t</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">e</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">r</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">s</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">n</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">u</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">m</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">b</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">e</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">r</mml:mi></mml:mrow><mml:mrow><mml:mtext>&#x00A0;</mml:mtext></mml:mrow><mml:mrow><mml:mi mathvariant="normal">o</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">f</mml:mi></mml:mrow><mml:mrow><mml:mtext>&#x00A0;</mml:mtext></mml:mrow><mml:mrow><mml:mi mathvariant="normal">c</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">a</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">s</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">e</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">s</mml:mi></mml:mrow></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula></td></tr></thead><tbody><tr><td align="left" valign="top">Modified turing test</td><td align="left" valign="top">0.906</td><td align="left" valign="top">0.947</td></tr><tr><td align="left" valign="top">Hazardous for patients</td><td align="left" valign="top">0.419</td><td align="left" valign="top">0.635</td></tr><tr><td align="left" valign="top">Influence by the respective answer</td><td align="left" valign="top">&#x2212;0.028</td><td align="left" valign="top">0.917</td></tr><tr><td align="left" valign="top">Medical adequacy</td><td align="left" 
valign="top">0.241</td><td align="left" valign="top">0.206</td></tr><tr><td align="left" valign="top">Conciseness</td><td align="left" valign="top">0.167</td><td align="left" valign="top">0.165</td></tr><tr><td align="left" valign="top">Coherence</td><td align="left" valign="top">0.152</td><td align="left" valign="top">0.159</td></tr><tr><td align="left" valign="top">Comprehensibility</td><td align="left" valign="top">0.186</td><td align="left" valign="top">0.188</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Interpretation of Cohen&#x2019;s &#x03BA; adapted from Landis and Koch (1977).</p></fn><fn id="table3fn2"><p><sup>b</sup>&#x003C;0: worse than chance; 0.00-0.20: low agreement; 0.21-0.40: moderate agreement; 0.41-0.60: satisfactory correlation; 0.61-0.80: good correlation; 0.81-1.00: very good to perfect correlation.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Context of the study</title><p>Previous studies assessing LLMs on outpatient treatment in ORL focused on web-based LLMs mainly using simulated cases [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. While web-based LLMs show impressive linguistic and professional capabilities, this setup has obvious limitations due to data sharing concerns. These concerns could be circumvented using LLMs that have no connection to the internet working on a local computer. Hence, this study evaluates the capabilities of 3 different locally run LLMs (ie, Llama 3, Mistral Nemo, and Gemma 2) on real patient data from outpatient consultations of a University Medical Center in Germany.</p><p>Despite ORL doctors receiving superior ratings over all categories, this study demonstrates the ability of locally run LLMs to process outpatient data and provide useful diagnostic and treatment information. 
Regarding medical adequacy, Gemma 2 and Mistral Nemo received similar high ratings (Mistral Nemo: median 4, IQR 3-5; and Gemma: median 4, IQR 3-4), whereas Llama 3 performed inferiorly, achieving a median of 3 (IQR 3-4). Comparing with previous studies, the performance of Gemma 2 and Mistral Nemo is similar to ChatGPT 3&#x2019;s performance in a study from 2023 reaching a median rating of 4 (IQR 4-5) on medical adequacy [<xref ref-type="bibr" rid="ref8">8</xref>]. However, a comparison with a more recent study reveals an inferior rating on medical adequacy. In the mentioned study from 2024, the web-based LLMs Claude 2 (median 5, IQR 4-6), ChatGPT 4 (median 5, IQR 4-6), and Bard 2023.07.13 (median 5, IQR 3-5) achieved higher ratings on medical adequacy than the best performing locally run LLMs (ie, Mistral Nemo and Gemma 2) in this study [<xref ref-type="bibr" rid="ref9">9</xref>]. Regarding conciseness, all locally run LLMs received similar ratings (Gemma 2 and Mistral Nemo: median 4, IQR 4-5; and Llama 3: median 4, IQR 3-4). These performance levels correspond with the results from prior studies [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. Mistral Nemo (median 5, IQR 4-5) outperformed Gemma 2 and Llama 3 on ratings for coherence and comprehensibility, which received similar ratings in these categories. With regard to the modified Turing test, the ORL doctors were recognized as human beings in 99% (89/90) of the ratings, whereas the LLMs were almost always correctly identified as machines. Gemma 2 was recognized as a machine in 99% (89/90), Mistral Nemo in 98% (88/90), and Llama 3 in 97% (87/90) of the ratings. 
Again, these ratings from this study for coherence and comprehensibility are in line with results from previous studies [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>].</p></sec><sec id="s4-2"><title>Hazardous for Patients</title><p>It is important to examine the cases marked as hazardous in more detail to better understand the risks posed by LLMs. The only risk identified by the treating doctor is in the case of a patient with a large nasal polyp, for whom the polyp&#x2019;s removal under local anesthesia was recommended. The reason why this was assessed as a potential risk to the patient remains unclear, but it may be explained by the fact that the patient is taking anticoagulants (acetylsalicylic acid and clopidogrel). Overall, Llama 3&#x2019;s recommendations were considered potentially hazardous in 54% (49/90), Gemma 2&#x2019;s in 47% (42/90), and Mistral Nemo&#x2019;s in 32% (29/90) of cases. A typical example of an LLM shortcoming is a case involving a patient with chronic sinusitis who reports being largely symptom-free while undergoing local cortisone therapy. While the doctor and Mistral Nemo recommend continuing with the cortisone therapy alone, Llama 3 and Gemma 2 recommend adding antibiotics to the treatment. In this case, the patient would be prescribed antibiotics unnecessarily, posing a risk to the patient, particularly given that the examination findings do not mention any pus. In another case involving a 3-year-old girl with suspected adenoids and tympanic fluid, Gemma 2 recommends myringoplasty, Mistral Nemo recommends a wait-and-see approach, and Llama 3 recommends speech therapy and a tonsillectomy. In contrast, the doctor recommends an adenotomy and the insertion of tympanic drains. Recommending the wrong operation is certainly one of the most extreme misjudgments. 
This demonstrates the importance of careful examination and shows that, despite their eloquence, LLMs should not be trusted unconditionally.</p></sec><sec id="s4-3"><title>Benchmarking With Web-Based LLMs</title><p>The comparison with preceding studies underscores the improvement of LLMs over time [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. To benchmark the locally run LLMs with their current web-based counterparts, an assessment on 10 separate simulated cases (simulated to meet data protection constraints) was performed. Here, the web-based LLM ChatGPT-4o and ORL doctors achieved similar high ratings on medical adequacy (ORL doctors: median 6, IQR 6; ChatGPT-4o: median 6, IQR 5-6). Regarding patients&#x2019; security, a potential hazard for patients was noted in a single rating for ChatGPT-4o (1/30, 3%), in 53% (16/30) for Llama 3, and in 23% (7/30) for Mistral Nemo and Gemma 2. None of the ORL doctors&#x2019; recommendations was considered as potentially hazardous for patients. In addition, the raters found ChatGPT-4o to potentially influence their own judgment in 10% (3/30) of the cases. Among the locally run LLMs, only Llama 3 received a single positive rating in this dimension (3%; detailed data regarding the web-based LLM assessment are present in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Interestingly, ChatGPT-4o has only been accused of jeopardizing a patient in one case. In this case, the patient had a suspicious cervical lymph node, and the doctor recommended extirpation. However, ChatGPT-4o recommended fine needle aspiration, laboratory tests, a magnetic resonance imaging scan, and a follow-up appointment. The reason for flagging this answer as risky is probably that the LLM did not commit to any procedure. Mistral Nemo recommended a biopsy or excision, Gemma recommended a biopsy and a blood test, and Llama suggested a check-up in 4 weeks. 
Although the assessment of the web-based LLMs included only 10 simulated cases rather than real-world data, it does demonstrate the high-performance level of current web-based LLMs and their leading edge over the tested locally run competitors.</p><p>Recent developments in advanced reasoning LLMs (often referred to as chain-of-thought or system-prompt&#x2013;augmented models) could further improve diagnostic accuracy and reduce risky recommendations by simulating more nuanced clinical reasoning processes. These models leverage step-by-step logical reasoning, potentially enabling them to identify and correct errors before delivering final outputs.</p></sec><sec id="s4-4"><title>Interrater Reliability</title><p>Analyzing interrater reliability can be challenging. In this study, the evaluating ORL consultants demonstrated an excellent Fleiss &#x03BA; correlation of 0.906 in the modified Turing test (with binary answers). For the assessment of hazards to patients (also binary answers), a Fleiss &#x03BA; of 0.419 was found, indicating a satisfactory level of correlation. Furthermore, ORL consultants were asked whether the presented answer would have influenced their own decision (binary rating). This produced a Fleiss &#x03BA; of &#x2212;0.028, which formally corresponds to poorer correlation than expected by chance. However, assessment of the raw data showed high concordance between raters; therefore, we include the percentage of homolog ratings between all raters, which was 91.7% (number of homolog ratings between all raters/number of cases). Taking these measures into consideration, the low Fleiss &#x03BA; value is therefore most likely caused by the high uniformity of the ratings provided.</p><p>Fleiss &#x03BA; and the percentage of homolog ratings between all raters suggested low interrater reliability for ratings in the categories of medical adequacy, conciseness, coherence, and comprehensibility. The same applied for the absolute agreement. 
However, an examination of the raw data revealed that the raters&#x2019; assessments were in fact highly consistent throughout. Although the raters often did not award exactly the same points, the scores awarded were close to each other, for example, rating 5 points and rating 6 points for the same answer by different raters. Hence, we calculated ICCs. The ICC analysis shows a moderate (ICC&#x2248;0.46&#x2010;0.65) interrater agreement in the abovementioned categories (<xref ref-type="table" rid="table4">Table 4</xref>). The aggregated assessment of several raters (ICCk) achieved moderate to good values in all categories (ICCk &#x2248;0.72&#x2010;0.85). The highest agreement was for medical adequacy, the lowest for comprehensibility. All ICC values were highly significant (<italic>P</italic>&#x003C;.001), with narrow CIs, indicating stable estimates. In conclusion, despite the fact that all rating ORL consultants rarely chose exactly the same rating for the responses given, they highly agreed within the tendency of ratings (<xref ref-type="table" rid="table4">Table 4</xref>). 
Overall, this analysis reveals a high consistency among ratings.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Overall interrater reliability: intraclass correlation coefficient (ICC).</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Category</td><td align="left" valign="top">Description</td><td align="left" valign="top">ICC</td><td align="left" valign="top"><italic>F</italic> statistic</td><td align="left" valign="top">df1<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">df2<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top"><italic>P</italic> value</td><td align="left" valign="top">95% CI<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top"><bold>Medical adequacy</bold></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;ICC1<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">Single raters absolute</td><td align="left" valign="top">0.653</td><td align="left" valign="top">6.643</td><td align="left" valign="top">169</td><td align="left" valign="top">340</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.58-0.72</td></tr><tr><td align="left" valign="top">&#x2003;ICC2<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" valign="top">Single random raters</td><td align="left" valign="top">0.653</td><td align="left" valign="top">6.614</td><td align="left" valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.58-0.72</td></tr><tr><td 
align="left" valign="top">&#x2003;ICC3<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup></td><td align="left" valign="top">Single fixed raters</td><td align="left" valign="top">0.652</td><td align="left" valign="top">6.614</td><td align="left" valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.58-0.72</td></tr><tr><td align="left" valign="top">&#x2003;ICC1k<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">Average raters absolute</td><td align="left" valign="top">0.849</td><td align="left" valign="top">6.643</td><td align="left" valign="top">169</td><td align="left" valign="top">340</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.81-0.88</td></tr><tr><td align="left" valign="top">&#x2003;ICC2k<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">Average random raters</td><td align="left" valign="top">0.849</td><td align="left" valign="top">6.614</td><td align="left" valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.81-0.88</td></tr><tr><td align="left" valign="top">&#x2003;ICC3k<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">Average fixed raters</td><td align="left" valign="top">0.849</td><td align="left" valign="top">6.614</td><td align="left" valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.8-0.88</td></tr><tr><td align="left" valign="top"><bold>Conciseness</bold></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" 
valign="top">&#x2003;ICC1</td><td align="left" valign="top">Single raters absolute</td><td align="left" valign="top">0.544</td><td align="left" valign="top">4.575</td><td align="left" valign="top">169</td><td align="left" valign="top">340</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.46-0.62</td></tr><tr><td align="left" valign="top">&#x2003;ICC2</td><td align="left" valign="top">Single random raters</td><td align="left" valign="top">0.552</td><td align="left" valign="top">5.221</td><td align="left" valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.45-0.64</td></tr><tr><td align="left" valign="top">&#x2003;ICC3</td><td align="left" valign="top">Single fixed raters</td><td align="left" valign="top">0.585</td><td align="left" valign="top">5.221</td><td align="left" valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.5-0.66</td></tr><tr><td align="left" valign="top">&#x2003;ICC1k</td><td align="left" valign="top">Average raters absolute</td><td align="left" valign="top">0.781</td><td align="left" valign="top">4.575</td><td align="left" valign="top">169</td><td align="left" valign="top">340</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.72, 0.83</td></tr><tr><td align="left" valign="top">&#x2003;ICC2k</td><td align="left" valign="top">Average random raters</td><td align="left" valign="top">0.787</td><td align="left" valign="top">5.221</td><td align="left" valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.71-0.84</td></tr><tr><td align="left" valign="top">&#x2003;ICC3k</td><td align="left" valign="top">Average fixed raters</td><td align="left" valign="top">0.808</td><td align="left" valign="top">5.221</td><td align="left" 
valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.75-0.85</td></tr><tr><td align="left" valign="top"><bold>Coherence</bold></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;ICC1</td><td align="left" valign="top">Single raters absolute</td><td align="left" valign="top">0.500</td><td align="left" valign="top">4.000</td><td align="left" valign="top">169</td><td align="left" valign="top">340</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.41-0.58</td></tr><tr><td align="left" valign="top">&#x2003;ICC2</td><td align="left" valign="top">Single random raters</td><td align="left" valign="top">0.512</td><td align="left" valign="top">4.680</td><td align="left" valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.4-0.61</td></tr><tr><td align="left" valign="top">&#x2003;ICC3</td><td align="left" valign="top">Single fixed raters</td><td align="left" valign="top">0.551</td><td align="left" valign="top">4.680</td><td align="left" valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.47-0.63</td></tr><tr><td align="left" valign="top">&#x2003;ICC1k</td><td align="left" valign="top">Average raters absolute</td><td align="left" valign="top">0.750</td><td align="left" valign="top">4.000</td><td align="left" valign="top">169</td><td align="left" valign="top">340</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.68-0.81</td></tr><tr><td align="left" valign="top">&#x2003;ICC2k</td><td align="left" valign="top">Average random raters</td><td align="left" 
valign="top">0.759</td><td align="left" valign="top">4.680</td><td align="left" valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.66-0.83</td></tr><tr><td align="left" valign="top">&#x2003;ICC3k</td><td align="left" valign="top">Average fixed raters</td><td align="left" valign="top">0.786</td><td align="left" valign="top">4.680</td><td align="left" valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.72-0.84</td></tr><tr><td align="left" valign="top"><bold>Comprehensibility</bold></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;ICC1</td><td align="left" valign="top">Single raters absolute</td><td align="left" valign="top">0.457</td><td align="left" valign="top">3.523</td><td align="left" valign="top">169</td><td align="left" valign="top">340</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.37-0.55</td></tr><tr><td align="left" valign="top">&#x2003;ICC2</td><td align="left" valign="top">Single random raters</td><td align="left" valign="top">0.478</td><td align="left" valign="top">4.524</td><td align="left" valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.33-0.6</td></tr><tr><td align="left" valign="top">&#x2003;ICC3</td><td align="left" valign="top">Single fixed raters</td><td align="left" valign="top">0.540</td><td align="left" valign="top">4.524</td><td align="left" valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.45-0.62</td></tr><tr><td align="left" 
valign="top">&#x2003;ICC1k</td><td align="left" valign="top">Average raters absolute</td><td align="left" valign="top">0.716</td><td align="left" valign="top">3.523</td><td align="left" valign="top">169</td><td align="left" valign="top">340</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.63-0.78</td></tr><tr><td align="left" valign="top">&#x2003;ICC2k</td><td align="left" valign="top">Average random raters</td><td align="left" valign="top">0.733</td><td align="left" valign="top">4.524</td><td align="left" valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.59-0.82</td></tr><tr><td align="left" valign="top">&#x2003;ICC3k</td><td align="left" valign="top">Average fixed raters</td><td align="left" valign="top">0.779</td><td align="left" valign="top">4.524</td><td align="left" valign="top">169</td><td align="left" valign="top">338</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.71-0.83</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>df1, df2: degrees of freedom.</p></fn><fn id="table4fn2"><p><sup>b</sup>Interpretation guidelines (Koo &#x0026; Li, 2016) [<xref ref-type="bibr" rid="ref15">15</xref>]: &#x003C;0.50, poor; 0.50&#x2010;0.75, moderate; 0.75&#x2010;0.90, good; &#x003E;0.90, excellent. 
</p></fn><fn id="table4fn3"><p><sup>c</sup>ICC1: single-rater, 1-way random effects (absolute agreement).</p></fn><fn id="table4fn4"><p><sup>d</sup>ICC2: single-rater, 2-way random effects (absolute agreement).</p></fn><fn id="table4fn5"><p><sup>e</sup>ICC3: single-rater, 2-way mixed effects (absolute agreement).</p></fn><fn id="table4fn6"><p><sup>f</sup>ICC1k, ICC2k, ICC3k: corresponding average measures ICCs.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s4-5"><title>Limitations</title><p>Passing original text documentation to the LLMs is both a strength and a limitation of our study at the same time. Original text input in the local language (German in this study) represents the most realistic application in clinical practice. Despite the advantage of a realistic setup, the use of the German language may have had a negative impact on the results. This is indicated by the fact that some LLMs occasionally responded in English, although the rest of the context was in German. Furthermore, ORL doctors&#x2019; original documentation included grammatical errors and misspellings. Moreover, the documentation provided was limited to the text transcript of a single consultation, not including further diagnostic information, such as imaging diagnostic results. This approach was determined by the input structure of the locally run LLMs assessed in this study. The documentation of the treating ORL doctors may have been created on the basis of further information. Information that the ORL doctors may not include in their text documentation owing to the availability in the clinic&#x2019;s documentation system. However, as the rating ORL consultants had only access to the same documentation as the locally run LLMs, a possible bias is limited.</p><p>Furthermore, documentation style between treating units may vary; the ORL doctors as well as the rating ORL consultants are affiliated with the same institution and thus may prefer a special design of documentation. 
This important advantage in favor of ORL doctors should be taken into account when interpreting the findings. Nonetheless, this limitation is because of the nature of single-center setting, which limits the generalizability of findings to other institutions and patient populations.</p><p>Moreover, clinicians&#x2019; own documentation style&#x2014;crafted primarily for human readability&#x2014;may inadvertently favor human reviewers, who are already accustomed to reading similar clinical notes, rather than locally run LLMs. This could introduce a systematic bias when comparing human- versus model-generated recommendations.</p><p>Overall, regarding documentation style, this study design applied the most realistic scenario using original documentation of treating ORL doctors. Ultimately, any other strategy (ie, adjustment of documentation) would have introduced a new bias. In addition, when implemented in clinical practice, LLMs must be evaluated using original documentation in any case.</p><p>Assessing &#x201C;conciseness&#x201D; poses its own challenges: the measured output is strongly influenced by how prompts are written (eg, whether specific length caps or &#x201C;be concise&#x201D; cues are used). This study aimed to assess the conciseness of the evaluated LLMs while limiting the word count to 100 within the prompt. Despite the narrow limits given, differences emerged. While the doctors and Gemma 2 adhered to the word limit every time, Mistral Nemo exceeded it twice, and Llama 3 exceeded it 14 times. However, future studies should evaluate standardized prompt instructions, clearly defining output constraints, and using consistent methods (eg, information density assessments or rubrics balancing completeness and brevity) to more reliably measure and compare conciseness across different models and scenarios.</p><p>Besides, the model size of locally run LLMs was limited by the computing capacity of the laptop used in this study. 
More powerful machines using larger locally run LLMs may achieve better results. Yet, the laptop used is a standard laptop, which appears realistic for use in clinical practice. This was in line with the aim of the study to test the current performance with the technical possibilities currently available. Moreover, implementing local LLMs on doctors&#x2019; local computers at their workstations has advantages over using large LLMs on the clinic&#x2019;s own servers. First, there is no need to purchase additional computing capacity, as the computers already exist and are available, which saves costs. Furthermore, operation is still possible even if there is no connection to the central server, which makes operation significantly more robust and crisis-proof. Owing to continuous technical progress, it is to be expected that the performance of the comparatively small LLMs tested in this study will continue to increase and will surpass the performance of currently available large models in the near future. Accordingly, web-based LLMs can provide a glimpse of what &#x201C;small&#x201D; local LLMs will be capable of in the future. In this study, this assessment was covered by benchmarking web-based LLMs on 10 simulated cases. With regard to rating capacity of the ORL consultants, this study was limited to 30 original cases from our outpatient department, causing constraints in the statistical analysis. Furthermore, owing to the limitation to 30 original cases, duplicates in diagnoses were excluded from the random sampling of cases, which could have induced a selection bias.</p><p>Finally, the retrospective nature of this study prevents real-time validation of diagnoses and treatment recommendations. 
To further assess the true capabilities of local LLMs, future studies should employ prospective multicenter designs.</p></sec><sec id="s4-6"><title>Local LLMs Conquer Data Protection Challenges</title><p>The utilization of LLMs in medical treatment has obvious medicolegal and ethical constraints. While locally run LLMs mitigate data protection concerns by avoiding external data transfer, compliance with regulations such as the General Data Protection Regulation [<xref ref-type="bibr" rid="ref11">11</xref>] in the European Union or the Health Insurance Portability and Accountability Act [<xref ref-type="bibr" rid="ref12">12</xref>] in the United States remains paramount. Organizations adopting such solutions must ensure that data handling procedures&#x2014;including encryption, secure storage, and privacy-preserving protocols&#x2014;are implemented to meet these regulatory standards. Moreover, establishing robust internal governance, maintaining transparency around how patient data are used, and conducting regular audits are essential for preserving patient privacy and upholding trust in AI-assisted care.</p></sec><sec id="s4-7"><title>Lingering Challenges for Clinical Integration</title><p>Beyond data protection, LLMs used for clinical decision-making may also fall under medical device regulations (eg, European Union Medical Device Regulation [<xref ref-type="bibr" rid="ref16">16</xref>] and Food and Drug Administration guidelines in the United States [<xref ref-type="bibr" rid="ref17">17</xref>]). The dynamic nature of LLMs, their propensity for generating unpredictable responses (so-called &#x201C;hallucinations&#x201D;), and the difficulty of verifying their outputs pose unique challenges in fulfilling strict validation and safety requirements. 
As such, developers and health care institutions must carefully assess whether the LLM&#x2019;s intended use qualifies it as a regulated medical device and, if so, seek the necessary certification or approvals before deploying these technologies at scale.</p><p>However, recent studies also showed patients&#x2019; skepticism regarding AIs' decision-making even when supervised by human specialists [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. These aspects illustrate that many obstacles beyond medical correctness and accuracy remain. These issues should be carefully addressed before an implementation of LLMs into clinical workflows is considered.</p></sec><sec id="s4-8"><title>Potential of a Clinical Integration</title><p>Despite these obvious limitations, this study highlights the broad ability of locally run LLMs to process medical data in ORL and provide diagnostic and treatment strategies without further fine-tuning or training. This is an important finding as LLMs are not trained for a specific narrow purpose, yet they are able to perform satisfactorily in a niche medical subspecialty, such as ORL. LLMs therefore represent a potentially highly cost-efficient means to scale diagnostic support particularly relevant to low-resource settings and financially constrained public health services. While the locally run LLMs failed to meet the high benchmark of ORL doctors, they still provide structured output, which in many cases did provide a suitable treatment strategy for the respective patients. Obviously, locally run LLMs are far from replacing human doctors in health care, especially when considering the relatively high rate of hazardous recommendations of some locally run LLMs. However, their web-based counterparts are capable of providing doctor-like answers with very few mistakes and hazardous potential. 
As technological evolution continues, it is likely that locally run LLMs will catch up to the performance of their web-based counterparts. While the medicolegal and ethical aspects of an implementation of LLMs in clinical practices remain, they provide a promising solution to support medical staff. In a time of chronic shortage of trained health care workers all over the world, the feasibility of LLM use should be considered. Auspicious applications range from the assistance of local health workers in low-resource settings to triage or background support in high-resource settings. A roadmap of an implementation in clinical practice should include further validation of locally run LLMs in larger prospective studies. Besides evaluation by scientists, legislators need to take action. Current regulations such as device regulations show shortcomings for new software-based medical tools, such as LLMs. For instance, fast-track approval mechanisms for minor updates are needed to ensure that relevant updates are not delayed by undergoing an entire approval process. Finally, the implementation of LLMs in clinical practice should be a joint project involving all stakeholders, including patients, manufacturers, physicians/scientists, and authorities.</p></sec><sec id="s4-9"><title>Conclusions</title><p>Considering the generalist nature of locally run LLMs evaluated in this study, their performance in this specific medical field is impressive. However, locally run LLMs are still outperformed by their web-based counterparts and human specialists. In time, however, locally run LLMs are likely to become as powerful as their current web-based equivalents, creating the realistic prospect of practical enhancement of day-to-day clinical diagnostics in both high- and low-resource settings. 
As only locally run LLMs meet the high data protection requirements for medical application, future studies should monitor and evaluate their performance on a larger scale, ideally within prospective multicenter clinical studies.</p></sec></sec></body><back><ack><p><xref ref-type="fig" rid="figure1">Figures 1</xref> and <xref ref-type="fig" rid="figure2">2</xref> were drawn by Jonas Eckrich and Christoph Buhr using Microsoft PowerPoint (Microsoft Corp, Redmond, WA, USA). <xref ref-type="fig" rid="figure3">Figures 3</xref> and <xref ref-type="fig" rid="figure4">4</xref> were assembled by Jonas Eckrich and Christoph Buhr using Prism for Windows, version 9.5.1 (GraphPad Software, La Jolla, California, USA).</p></ack><notes><sec><title>Funding</title><p>There was no funding received for this study.</p></sec><sec><title>Data Availability</title><p>Owing to the sensitive nature of the clinical data used in this study and in accordance with institutional and legal data protection requirements, the datasets generated and analyzed cannot be made publicly available.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: CRB (lead), HS, TK, AB, SK, JE (co-lead)</p><p>Data curation: CRB, CS, KB-H, TH, JP, TK, CM, SK, JE</p><p>Formal analysis: CRB, HS, TK, AB, JE</p><p>Investigation: CRB, AB, SK, JE</p><p>Methodology: CRB, CS, JP, HS, TK, JE</p><p>Project administration: CRB, AB, SK, JE</p><p>Resources: CRB, TK, SK, JE</p><p>Supervision: CRB, AB, CM, SK, JE</p><p>Validation: CRB, CS, KB-H, TH, JP, HS, TK, CM, SK, JE</p><p>Visualization: CRB, TK, AB, JE</p><p>Writing &#x2013; original draft: CRB, AB, SK, JE</p><p>Writing &#x2013; review &#x0026; editing: CRB, CS, KB-H, TH, JP, HS, TK, CM, SK, JE</p></fn><fn fn-type="conflict"><p>SK is a Founder &#x0026; shareholder of MED.digital. 
All other authors declare no conflicts of interest.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language models</p></def></def-item><def-item><term id="abb3">ORL</term><def><p>otorhinolaryngology</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kaddour</surname><given-names>J</given-names> </name><name name-style="western"><surname>Harris</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mozes</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bradley</surname><given-names>H</given-names> </name><name name-style="western"><surname>Raileanu</surname><given-names>R</given-names> </name><name name-style="western"><surname>McHardy</surname><given-names>R</given-names> </name></person-group><article-title>Challenges and applications of large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 19, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2307.10169</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> 
</name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Banyi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>B</given-names> </name><name name-style="western"><surname>Amanian</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bur</surname><given-names>A</given-names> </name><name name-style="western"><surname>Abdalkhani</surname><given-names>A</given-names> </name></person-group><article-title>Applications of natural language processing in otolaryngology: a scoping review</article-title><source>Laryngoscope</source><year>2025</year><month>09</month><volume>135</volume><issue>9</issue><fpage>3049</fpage><lpage>3063</lpage><pub-id pub-id-type="doi">10.1002/lary.32198</pub-id><pub-id pub-id-type="medline">40309961</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pordzik</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bahr-Hamm</surname><given-names>K</given-names> </name><name name-style="western"><surname>Huppertz</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Patient support in obstructive sleep apnoea by a large language model - ChatGPT 4o on answering frequently asked questions on first line positive airway pressure and second line hypoglossal nerve stimulation therapy: a pilot study</article-title><source>Nat Sci 
Sleep</source><year>2024</year><volume>16</volume><fpage>2269</fpage><lpage>2277</lpage><pub-id pub-id-type="doi">10.2147/NSS.S495654</pub-id><pub-id pub-id-type="medline">39741798</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Seifen</surname><given-names>C</given-names> </name><name name-style="western"><surname>Huppertz</surname><given-names>T</given-names> </name><name name-style="western"><surname>Gouveris</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Chasing sleep physicians: ChatGPT-4o on the interpretation of polysomnographic results</article-title><source>Eur Arch Otorhinolaryngol</source><year>2025</year><month>03</month><volume>282</volume><issue>3</issue><fpage>1631</fpage><lpage>1639</lpage><pub-id pub-id-type="doi">10.1007/s00405-024-08985-3</pub-id><pub-id pub-id-type="medline">39427271</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Buhr</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Ernst</surname><given-names>BP</given-names> </name><name name-style="western"><surname>Blaikie</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Assessment of decision-making with locally run and web-based large language models versus human board recommendations in otorhinolaryngology, head and neck surgery</article-title><source>Eur Arch Otorhinolaryngol</source><year>2025</year><month>03</month><volume>282</volume><issue>3</issue><fpage>1593</fpage><lpage>1607</lpage><pub-id pub-id-type="doi">10.1007/s00405-024-09153-3</pub-id><pub-id pub-id-type="medline">39792200</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Lechien</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Chiesa-Estomba</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Baudouin</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hans</surname><given-names>S</given-names> </name></person-group><article-title>Accuracy of ChatGPT in head and neck oncological board decisions: preliminary findings</article-title><source>Eur Arch Otorhinolaryngol</source><year>2024</year><month>04</month><volume>281</volume><issue>4</issue><fpage>2105</fpage><lpage>2114</lpage><pub-id pub-id-type="doi">10.1007/s00405-023-08326-w</pub-id><pub-id pub-id-type="medline">37991498</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Buhr</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>H</given-names> </name><name name-style="western"><surname>Huppertz</surname><given-names>T</given-names> </name><etal/></person-group><article-title>ChatGPT versus consultants: blinded evaluation on answering otorhinolaryngology case-based questions</article-title><source>JMIR Med Educ</source><year>2023</year><month>12</month><day>5</day><volume>9</volume><fpage>e49183</fpage><pub-id pub-id-type="doi">10.2196/49183</pub-id><pub-id pub-id-type="medline">38051578</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Buhr</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>H</given-names> </name><name name-style="western"><surname>Huppertz</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Assessing unknown potential-quality and limitations of 
different large language models in the field of otorhinolaryngology</article-title><source>Acta Otolaryngol</source><year>2024</year><month>03</month><volume>144</volume><issue>3</issue><fpage>237</fpage><lpage>242</lpage><pub-id pub-id-type="doi">10.1080/00016489.2024.2352843</pub-id><pub-id pub-id-type="medline">38781053</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qu</surname><given-names>RW</given-names> </name><name name-style="western"><surname>Qureshi</surname><given-names>U</given-names> </name><name name-style="western"><surname>Petersen</surname><given-names>G</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SC</given-names> </name></person-group><article-title>Diagnostic and management applications of ChatGPT in structured otolaryngology clinical scenarios</article-title><source>OTO Open</source><year>2023</year><volume>7</volume><issue>3</issue><fpage>e67</fpage><pub-id pub-id-type="doi">10.1002/oto2.67</pub-id><pub-id pub-id-type="medline">37614494</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="report"><article-title>Regulation (EU) 2016/679 of the European parliament and of the council of 27 April 2016 on the protection of natural persons with regard to the processing of personal data and on the free movement of such data, and repealing directive 95/46/EC (general data protection regulation) (text with EEA relevance)</article-title><year>2016</year><access-date>2025-11-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://eur-lex.europa.eu/eli/reg/2016/679/oj">https://eur-lex.europa.eu/eli/reg/2016/679/oj</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="report"><article-title>Health insurance portability and accountability act of 
1996</article-title><year>1996</year><access-date>2025-11-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.hhs.gov/hipaa/for-professionals/privacy/index.html">https://www.hhs.gov/hipaa/for-professionals/privacy/index.html</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Turing</surname><given-names>AM</given-names> </name></person-group><article-title>I.&#x2014;Computing machinery and intelligence</article-title><source>Mind</source><year>1950</year><month>10</month><day>1</day><volume>LIX</volume><issue>236</issue><fpage>433</fpage><lpage>460</lpage><pub-id pub-id-type="doi">10.1093/mind/LIX.236.433</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Teixeira-Marques</surname><given-names>F</given-names> </name><name name-style="western"><surname>Medeiros</surname><given-names>N</given-names> </name><name name-style="western"><surname>Nazar&#x00E9;</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Exploring the role of ChatGPT in clinical decision-making in otorhinolaryngology: a ChatGPT designed study</article-title><source>Eur Arch Otorhinolaryngol</source><year>2024</year><month>04</month><volume>281</volume><issue>4</issue><fpage>2023</fpage><lpage>2030</lpage><pub-id pub-id-type="doi">10.1007/s00405-024-08498-z</pub-id><pub-id pub-id-type="medline">38345613</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koo</surname><given-names>TK</given-names> </name><name name-style="western"><surname>Li</surname><given-names>MY</given-names> </name></person-group><article-title>A guideline of selecting and reporting 
intraclass correlation coefficients for reliability research</article-title><source>J Chiropr Med</source><year>2016</year><month>06</month><volume>15</volume><issue>2</issue><fpage>155</fpage><lpage>163</lpage><pub-id pub-id-type="doi">10.1016/j.jcm.2016.02.012</pub-id><pub-id pub-id-type="medline">27330520</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="report"><article-title>Regulation (EU) 2017/745 of the European parliament and of the council of 5 April 2017 on medical devices, amending directive 2001/83/EC</article-title><year>2017</year><access-date>2025-11-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://eur-lex.europa.eu/eli/reg/2017/745/oj">https://eur-lex.europa.eu/eli/reg/2017/745/oj</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="report"><article-title>Policy for device software functions and mobile medical applications guidance for industry and food and drug administration staff</article-title><year>2022</year><access-date>2025-11-03</access-date><publisher-name>FDA</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.fda.gov/regulatory-information/search-fda-guidance-documents/policy-device-software-functions-and-mobile-medical-applications">https://www.fda.gov/regulatory-information/search-fda-guidance-documents/policy-device-software-functions-and-mobile-medical-applications</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Esmaeilzadeh</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mirzaei</surname><given-names>T</given-names> </name><name name-style="western"><surname>Dharanikota</surname><given-names>S</given-names> </name></person-group><article-title>Patients&#x2019; perceptions toward human-artificial intelligence interaction 
in health care: experimental study</article-title><source>J Med Internet Res</source><year>2021</year><month>11</month><day>25</day><volume>23</volume><issue>11</issue><fpage>e25856</fpage><pub-id pub-id-type="doi">10.2196/25856</pub-id><pub-id pub-id-type="medline">34842535</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reis</surname><given-names>M</given-names> </name><name name-style="western"><surname>Reis</surname><given-names>F</given-names> </name><name name-style="western"><surname>Kunde</surname><given-names>W</given-names> </name></person-group><article-title>Influence of believed AI involvement on the perception of digital medical advice</article-title><source>Nat Med</source><year>2024</year><month>11</month><volume>30</volume><issue>11</issue><fpage>3098</fpage><lpage>3100</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03180-7</pub-id><pub-id pub-id-type="medline">39054373</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Boxplots illustrating ratings categorized for each model and detailed data on the web-based LLM assessment.</p><media xlink:href="formative_v9i1e76896_app1.docx" xlink:title="DOCX File, 415 KB"/></supplementary-material></app-group></back></article>