<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e71541</article-id><article-id pub-id-type="doi">10.2196/71541</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Personalized Diabetes Treatment Support Using Large Language Models Fine-Tuned on Electronic Health Records: Development and Evaluation Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>He</surname><given-names>Shengyang</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Zhang</surname><given-names>Yu</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Li</surname><given-names>Jiaxi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" 
rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Clinical Laboratory Medicine, Jinniu Maternity and Child Health Hospital of Chengdu</institution><addr-line>Chengdu</addr-line><country>China</country></aff><aff id="aff2"><institution>Information Center, West China Hospital of Sichuan University</institution><addr-line>Chengdu</addr-line><addr-line>Sichuan</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Schwartz</surname><given-names>Amy</given-names></name></contrib><contrib contrib-type="editor"><name name-style="western"><surname>Balcarras</surname><given-names>Matthew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Shen</surname><given-names>Bairong</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Zhang</surname><given-names>Chi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Xu</surname><given-names>He</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Jiaxi Li, PhD, Department of Clinical Laboratory Medicine, Jinniu Maternity and Child Health Hospital of Chengdu, Chengdu, 610041, China, 86 18227387870; <email>576213658@qq.com</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>all authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>9</day><month>2</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e71541</elocation-id><history><date date-type="received"><day>21</day><month>01</month><year>2025</year></date><date date-type="rev-recd"><day>11</day><month>12</month><year>2025</year></date><date 
date-type="accepted"><day>11</day><month>12</month><year>2025</year></date></history><copyright-statement>&#x00A9; Shengyang He, Yu Zhang, Jiaxi Li. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 9.2.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e71541"/><related-article related-article-type="correction-forward" ext-link-type="doi" xlink:href="10.2196/93482" xlink:title="This is a corrected version. See correction statement in" xlink:type="simple">https://formative.jmir.org/2026/1/e93482</related-article><abstract><sec><title>Background</title><p>Effective diabetes management requires individualized treatment strategies tailored to patients&#x2019; clinical characteristics. 
With recent advances in artificial intelligence, large language models (LLMs) offer new opportunities to enhance clinical decision support, particularly in generating personalized recommendations.</p></sec><sec><title>Objective</title><p>This study aimed to develop and evaluate an LLM-based outpatient treatment support system for diabetes and examine its potential value in routine clinical decision-making.</p></sec><sec sec-type="methods"><title>Methods</title><p>Three compact LLMs (Llama 3.1-8B, Qwen3-8B, and GLM4-9B) were fine-tuned on deidentified outpatient electronic health records using a parameter-efficient low-rank adaptation approach. The optimized models were embedded into a prototype hospital information system via a retrieval-augmented generation framework to generate individualized treatment recommendations, laboratory test suggestions, and medication prompts based on demographic and clinical data.</p></sec><sec sec-type="results"><title>Results</title><p>Among the models evaluated, the fine-tuned GLM4-9B demonstrated the strongest performance, producing clinically reasonable treatment plans and appropriate laboratory test recommendations and medication suggestions. It achieved a mean Bilingual Evaluation Understudy for 4-grams score of 67.93 (SD 2.74) and mean scores of 44.30 (SD 3.91) for Recall-Oriented Understudy for Gisting Evaluation for overlap of unigrams, 27.34 (SD 1.85) for Recall-Oriented Understudy for Gisting Evaluation for overlap of bigrams, and 37.67 (SD 2.88) for Recall-Oriented Understudy for Gisting Evaluation for Longest Common Subsequence.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The fine-tuned GLM4-9B shows strong potential as a clinical decision support tool for personalized diabetes care. It can provide reference recommendations that may improve clinician efficiency and support decision quality. 
Future work should focus on enhancing medication guidance, expanding data sources, and improving adaptability in cases involving complex comorbidities.</p></sec></abstract><kwd-group><kwd>GLM4-9B</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>large language model</kwd><kwd>diabetes</kwd><kwd>electronic health record</kwd><kwd>EHR</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Diabetes is a chronic metabolic disorder characterized by elevated blood glucose levels, which over time can cause serious damage to the heart, blood vessels, eyes, kidneys, and nervous system. The most common is type 2 diabetes, which accounts for approximately 90% to 95% of diabetes cases [<xref ref-type="bibr" rid="ref1">1</xref>] and affects mainly adults. According to the World Health Organization, approximately 422 million people worldwide have diabetes, most in low- and middle-income countries, and 1.5 million people die each year as a direct result of diabetes [<xref ref-type="bibr" rid="ref2">2</xref>]. The incidence and prevalence of diabetes have been steadily increasing over the last few decades [<xref ref-type="bibr" rid="ref1">1</xref>]. As patients with diabetes require long-term medication to control blood glucose levels and prevent complications [<xref ref-type="bibr" rid="ref3">3</xref>], they can face several challenges during the treatment process, such as medication selection, dosage adjustment, and management of adverse effects. Failure to address these issues in a timely manner can compromise the efficacy of medication and even pose a threat to patients&#x2019; lives [<xref ref-type="bibr" rid="ref4">4</xref>]. Therefore, people with diabetes need timely medication advice, health education, and nutrition support to help them use their medicines correctly, safely, and effectively, thereby improving adherence and quality of life. 
To better serve patients and increase the efficiency of health care professionals, we aim to optimize the management of patients with diabetes through the application of artificial intelligence.</p><p>With the significant success of ChatGPT in tasks related to understanding and generating humanlike responses [<xref ref-type="bibr" rid="ref5">5</xref>], large language models (LLMs) have attracted considerable attention. They have shown strong performance in various natural language processing tasks and the ability to generalize to unfamiliar tasks, demonstrating their potential as a unified solution for natural language understanding, text generation, and dialogue. Although ChatGPT has shown promising results in medical document summarization and decision support [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>], as well as in passing the US Medical Licensing Examination Steps 1 and 2 [<xref ref-type="bibr" rid="ref8">8</xref>], the exploration of these broad-domain LLMs in the medical field is still relatively limited [<xref ref-type="bibr" rid="ref9">9</xref>]. Currently, there is a lack of specifically trained LLMs in the field of health care. To address this gap, we plan to fine-tune an LLM using deidentified data from patients with diabetes with the aim of exploring its application in diabetes management. In addition, harnessing the potential of LLMs will open up new opportunities for medical research and practice and drive advances and innovation in health care technology.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This retrospective study used encrypted and deidentified data from West China Hospital involving no patient privacy&#x2013;sensitive information or clinical interventions. 
Data extraction and study procedures were approved by the ethics committee of West China Hospital, Sichuan University (approval 2024-126), in accordance with informed consent requirements.</p></sec><sec id="s2-2"><title>Data Collection</title><p>This study used the big data integration platform of West China Hospital as the primary source of data [<xref ref-type="bibr" rid="ref10">10</xref>]. We collected electronic health record data from patients diagnosed with diabetes who visited the outpatient department from January 2022 to February 2022. The collected data included information such as the patients&#x2019; department of visit, age, gender, chief concern, present illness history, and diagnosis, which served as input for the model. In addition, we obtained data on patients&#x2019; outpatient medications, laboratory test items, examinations, and physician opinions, which were used as model outputs. Patient data with missing chief concerns and present illness history were excluded to ensure data quality and usability. Furthermore, we collected an additional set of data from 300 patients (visiting in March 2022) as a test set to evaluate the performance of the optimized model.</p></sec><sec id="s2-3"><title>Data Preprocessing</title><p>Before model training, we performed systematic preprocessing on the collected patient data, including data cleaning, standardization, and anonymization. Data cleaning involved removing missing values, outliers, and duplicate records to ensure the integrity and reliability of the model inputs. Standardization ensured consistency in format, units, and value ranges across different data types. For text data, including chief concerns, present illness history, examinations, and laboratory test results, we applied cleaning, terminology unification, and segmenting of key information while removing sensitive information to protect patient privacy. 
The processed data were then organized into a question-answer format (instruction-input-output) suitable for LLM fine-tuning: patient basic information and clinical text (eg, department, gender, age, diagnosis, chief concern, and present illness history) were used as the input, and corresponding medical orders, examinations, recommended laboratory tests, and treatment recommendations were integrated as the output (shown in <xref ref-type="other" rid="box1">Textbox 1</xref>). This approach ensured that both structured and unstructured information was effectively used for model training. The dataset was then split into training and test sets in an 8:2 ratio.</p><boxed-text id="box1"><title> Data example for large language model fine-tuning in clinical diabetes recommendations.</title><p><bold>Instruction</bold></p><list list-type="bullet"><list-item><p>&#x201C;You are a diabetes clinical recommendation model. You will receive the patient&#x2019;s basic information and laboratory results, and you are expected to provide tailored clinical recommendations for the patient.&#x201D;</p></list-item></list><p><bold>Input</bold></p><list list-type="bullet"><list-item><p>Department: endocrinology ward</p></list-item><list-item><p>Sex: male</p></list-item><list-item><p>Age: 55 years</p></list-item><list-item><p>Diagnosis: type 2 diabetes mellitus with hypertension</p></list-item><list-item><p>Chief concern: polydipsia; polyuria for 6 months</p></list-item><list-item><p>Present illness history: &#x201C;The patient reports excessive thirst and frequent urination over the past six months, with a weight loss of approximately 3 kg. History of hypertension for 5 years, blood pressure poorly controlled. Recent fasting blood glucose ranges from 8&#x2013;10 mmol/L, postprandial blood glucose up to 12 mmol/L. 
The patient is not on regular insulin or oral hypoglycemic therapy.&#x201D;</p></list-item></list><p><bold>Output</bold></p><list list-type="bullet"><list-item><p>Provision of a blood glucose monitoring device and test strips</p></list-item><list-item><p>Laboratory tests: fasting plasma glucose, 2-hour postprandial glucose, glycated hemoglobin, liver and renal function, and lipid panel</p></list-item><list-item><p>Examinations: fundoscopy, foot examination, and electrocardiogram</p></list-item><list-item><p>Treatment recommendations: adjust insulin or oral hypoglycemic agents according to blood glucose levels, monitor blood pressure, and implement dietary and exercise interventions</p></list-item></list></boxed-text></sec><sec id="s2-4"><title>Model Selection</title><p>In this study, we compared 3 LLMs for clinical text prediction tasks: Llama 3.1-8B (Meta AI) [<xref ref-type="bibr" rid="ref11">11</xref>], Qwen3-8B (Alibaba Cloud) [<xref ref-type="bibr" rid="ref12">12</xref>], and GLM4-9B (THUDM/Z.ai) [<xref ref-type="bibr" rid="ref13">13</xref>]. Llama 3.1-8B is an open-source model optimized for efficient generalization and complex text understanding. Qwen3-8B is a multilingual LLM with strong capabilities in both structured and unstructured medical text processing. GLM4-9B, developed based on the general language model architecture, is designed for bilingual question-answering tasks and supports local deployment through model quantization. All models were fine-tuned and evaluated using a single H100 graphics processing unit (NVIDIA). 
Smaller model sizes were chosen to facilitate clinical deployment and wider adoption in practical settings while still maintaining competitive performance for downstream tasks.</p></sec><sec id="s2-5"><title>Model Fine-Tuning</title><p>To adapt the selected LLMs for clinical treatment recommendation tasks, we used a parameter-efficient fine-tuning strategy that combines instruction-based prompting with low-rank adaptation (LoRA) [<xref ref-type="bibr" rid="ref14">14</xref>]. Instruction templates were designed to explicitly guide the model in interpreting patient information and generating clinically appropriate treatment suggestions. Building on this, LoRA was applied to the attention layers to enable efficient task-specific adaptation while updating only a small number of parameters. During fine-tuning, we explored multiple hyperparameter configurations, including learning rate, batch size, and LoRA-specific parameters such as rank and scaling factor. Model performance was assessed on a held-out validation set using both automatic text generation metrics (eg, Bilingual Evaluation Understudy [<xref ref-type="bibr" rid="ref15">15</xref>] and Recall-Oriented Understudy for Gisting Evaluation [<xref ref-type="bibr" rid="ref16">16</xref>]) and clinically oriented evaluation criteria, including the correctness and appropriateness of the recommended treatments. This combined approach provided a balanced trade-off between computational efficiency and clinical relevance, offering practical guidance for deploying LLM-based treatment recommendation systems in real-world clinical settings.</p></sec><sec id="s2-6"><title>Physician Assessment of Recommendations</title><p>To ensure a rigorous and clinically meaningful evaluation of our LLM-generated diabetes treatment recommendations, we implemented a structured, multidimensional physician assessment protocol. 
Six dimensions were assessed: treatment appropriateness, medication accuracy, relevance of suggested examinations, safety, logical reasoning, and overall clinical usefulness. Each dimension was rated using a standardized 5-point Likert scale [<xref ref-type="bibr" rid="ref17">17</xref>] ranging from 1 (&#x201C;completely unreasonable or not useful&#x201D;) to 5 (&#x201C;fully reasonable and clinically valuable&#x201D;). Five board-certified endocrinologists (each with more than 5 years of independent clinical practice) served as expert raters. All assessments were performed independently and in a fully blinded manner: raters were unaware of which LLM produced each recommendation and were prohibited from discussing cases. Each physician reviewed the same 300 treatment recommendations generated by the 3 LLMs, including both base and fine-tuned versions. Recommendations included medication plans, dosage adjustments, and suggested laboratory tests. This protocol ensured a systematic, reproducible, and clinician-centered evaluation of model outputs, enabling identification of potential risks and areas requiring refinement.</p></sec><sec id="s2-7"><title>Retrieval-Augmented Generation and Agent-Assisted Clinical Data Processing</title><p>We implemented a retrieval-augmented generation (RAG) [<xref ref-type="bibr" rid="ref18">18</xref>] framework to integrate hospital knowledge resources, including the medical order database, diabetes treatment guidelines, and clinical protocols, enabling the model to dynamically retrieve relevant knowledge during treatment recommendations and provide context-specific, up-to-date clinical information. An agent system was developed to interface with the hospital information system. It handles data cleaning, integration, and monitoring by extracting and standardizing patient data, consolidating heterogeneous sources into a structured format suitable for RAG, and ensuring data integrity and consistency. 
By combining RAG with agent-driven data management, the system efficiently leverages internal knowledge bases to support clinically grounded, accurate, and interpretable treatment recommendations. The overall workflow of this clinical application system is illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Schematic workflow of the clinical application system integrating retrieval-augmented generation (RAG)&#x2013; and agent-driven data management. AI: artificial intelligence; LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e71541_fig01.png"/></fig></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>As shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>, the final dataset comprised 20,619 patients, with 80% allocated for model training and 20% held out as a test set for performance evaluation. <xref ref-type="fig" rid="figure2">Figure 2</xref> illustrates the data collection process.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Flowchart of patient inclusion and exclusion criteria.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e71541_fig02.png"/></fig><p>All 3 LLMs&#x2014;Llama 3.1-8B, Qwen3-8B, and GLM4-9B&#x2014;were evaluated before and after LoRA fine-tuning. 
Their training and test loss curves are shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>, and model performance was assessed using both automatic text generation metrics and clinical physician evaluations.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Training and test loss curves for Llama 3.1-8B, Qwen3-8B, and GLM4-9B.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e71541_fig03.png"/></fig><p><xref ref-type="table" rid="table1">Table 1</xref> summarizes the comparative performance of the baseline and fine-tuned models measured using Bilingual Evaluation Understudy for 4-grams (BLEU-4), Recall-Oriented Understudy for Gisting Evaluation for overlap of unigrams (ROUGE-1), Recall-Oriented Understudy for Gisting Evaluation for overlap of bigrams (ROUGE-2), and Recall-Oriented Understudy for Gisting Evaluation&#x2013;Longest Common Subsequence (ROUGE-L). Before fine-tuning, the 3 base models demonstrated moderate capability in generating clinically relevant recommendations, with mean BLEU-4 scores ranging from 45.13 (SD 1.98) to 50.84 (SD 1.87) and mean ROUGE-L scores ranging from 9.83 (SD 0.54) to 13.90 (SD 0.74). Among the base models, GLM4-9B achieved the highest performance across all metrics. After fine-tuning, all models showed significant gains in both lexical similarity and content relevance. The BLEU-4 mean score increased by 14.48 points for Llama 3.1-8B, 15.95 points for Qwen3-8B, and 17.09 points for GLM4-9B. ROUGE-1 and ROUGE-L exhibited similar patterns, with improvements exceeding 20 points for all 3 models. 
The fine-tuned GLM4-9B outperformed all models, achieving the highest BLEU-4 (mean 67.93, SD 2.74), ROUGE-1 (mean 44.30, SD 3.91), ROUGE-2 (mean 27.34, SD 1.85), and ROUGE-L (mean 37.67, SD 2.88) scores.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Performance analysis based on Bilingual Evaluation Understudy for 4-grams (BLEU-4) and Recall-Oriented Understudy for Gisting Evaluation scores.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">BLEU-4, mean (SD)</td><td align="left" valign="bottom">ROUGE-1<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>, mean (SD)</td><td align="left" valign="bottom">ROUGE-2<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>, mean (SD)</td><td align="left" valign="bottom">ROUGE-L<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup>, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Base Llama 3.1-8B</td><td align="char" char="plusmn" valign="top">45.13 (1.98)</td><td align="char" char="plusmn" valign="top">12.55 (0.68)</td><td align="left" valign="top">3.12 (0.21)</td><td align="left" valign="top">9.83 (0.54)</td></tr><tr><td align="left" valign="top">Base Qwen3-8B</td><td align="char" char="plusmn" valign="top">47.92 (2.11)</td><td align="char" char="plusmn" valign="top">15.33 (0.75)</td><td align="left" valign="top">4.46 (0.26)</td><td align="left" valign="top">11.77 (0.63)</td></tr><tr><td align="left" valign="top">Base GLM4-9B</td><td align="char" char="plusmn" valign="top">50.84 (1.87)</td><td align="char" char="plusmn" valign="top">17.92 (0.82)</td><td align="left" valign="top">5.71 (0.31)</td><td align="left" valign="top">13.90 (0.74)</td></tr><tr><td align="left" valign="top">Fine-tuned Llama 3.1-8B</td><td align="char" char="plusmn" valign="top">59.61 (3.21)</td><td align="char" char="plusmn" valign="top">36.12 (3.44)</td><td align="left" 
valign="top">22.48 (1.78)</td><td align="left" valign="top">31.44 (2.93)</td></tr><tr><td align="left" valign="top">Fine-tuned Qwen3-8B</td><td align="char" char="plusmn" valign="top">63.87 (3.18)</td><td align="char" char="plusmn" valign="top">40.58 (3.72)</td><td align="left" valign="top">25.01 (1.92)</td><td align="left" valign="top">34.80 (3.12)</td></tr><tr><td align="left" valign="top">Fine-tuned GLM4-9B</td><td align="char" char="plusmn" valign="top">67.93 (2.74)</td><td align="char" char="plusmn" valign="top">44.30 (3.91)</td><td align="left" valign="top">27.34 (1.85)</td><td align="left" valign="top">37.67 (2.88)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>ROUGE-1: Recall-Oriented Understudy for Gisting Evaluation for overlap of unigrams. </p></fn><fn id="table1fn2"><p><sup>b</sup>ROUGE-2: Recall-Oriented Understudy for Gisting Evaluation for overlap of bigrams. </p></fn><fn id="table1fn3"><p><sup>c</sup>ROUGE-L: Recall-Oriented Understudy for Gisting Evaluation&#x2013;Longest Common Subsequence.</p></fn></table-wrap-foot></table-wrap><p>Regarding the clinical evaluation, the performance of the original models and their LoRA fine-tuned versions is summarized in <xref ref-type="other" rid="box2">Textbox 2</xref>.</p><boxed-text id="box2"><title> Performance of the original models and their low-rank adaptation fine-tuned versions.</title><p><bold>Input</bold></p><list list-type="bullet"><list-item><p>Department: endocrinology and metabolism medical unit</p></list-item><list-item><p>Gender: male</p></list-item><list-item><p>Age: 68 years</p></list-item><list-item><p>Diagnosis: type 2 diabetes; hyperlipidemia</p></list-item><list-item><p>Chief concern: diabetes follow-up visit</p></list-item><list-item><p>Medical history: diabetes for 2 or more years</p></list-item></list><p><bold>Real clinical data output</bold></p><list list-type="bullet"><list-item><p>Laboratory tests: liver and kidney function, glucose and fat-a, and 
glycated hemoglobin (HbA<sub>1c</sub>; biochemistry 2)</p></list-item><list-item><p>Medication: 11 units once a day of Ganjing insulin injection (3 mL: 300 prefilled units), 5 mg of OuTangning once a day (5 mg &#x00D7; 7 tablets), and 10 mg of atorvastatin calcium tablets once every night (10 mg &#x00D7; 28 tablets)</p></list-item></list><p><bold>Base GLM4-9B output</bold></p><list list-type="bullet"><list-item><p>&#x201C;A patient with diabetes attending a follow-up visit requires a comprehensive medical history review and physical examination to better assess their condition and guide treatment planning. According to your description, the patient has type 2 diabetes and hyperlipidemia. Type 2 diabetes is a chronic condition that necessitates long-term management, while hyperlipidemia is a common metabolic disorder that can negatively impact cardiovascular health and increase the risk of cardiovascular disease. During follow-up, the patient should monitor blood glucose levels and adjust management strategies&#x2014;including medication, diet, and exercise&#x2014;accordingly. Regular monitoring of blood lipid levels is also necessary, with treatment plans tailored based on the results. Common treatment strategies for diabetes include oral medications, insulin therapy, blood glucose monitoring, and dietary control. 
Successful management requires active patient engagement, adherence to lifestyle recommendations, and consistent follow-up to maintain stable and healthy blood glucose levels.&#x201D;</p></list-item></list><p><bold>Fine-tuned GLM4-9B output</bold></p><list list-type="bullet"><list-item><p>Laboratory tests: HbA<sub>1c</sub> and liver and kidney function tests, lipase A, fasting insulin, 2-hour postprandial insulin, fasting plasma glucose, 2-hour postprandial C-peptide, and 2-hour postprandial plasma glucose (biochemistry 1)</p></list-item><list-item><p>Medication: 11 units once daily of Ganjing insulin injection (3 mL: 300 units/prefilled) and 5 mg of OuTangning once daily (5 mg &#x00D7; 7 tablets)</p></list-item><list-item><p>Physician&#x2019;s recommendation: regular follow-up visits are advised to monitor treatment response and adjust therapy as needed</p></list-item></list></boxed-text></sec><sec id="s3-2"><title>Clinical Evaluation</title><p><xref ref-type="table" rid="table2">Table 2</xref> shows the results of 5 endocrinologists&#x2019; evaluation of the 300 treatment recommendations generated by the 3 fine-tuned LLMs. Overall, the mean scores indicated that most recommendations were clinically relevant to diabetes management (mean usefulness scores above 4 for all fine-tuned models), whereas the base models received substantially lower ratings. 
Approximately 10% of the recommendations (32/300, 10.7%) were judged by at least one rater as potentially risky (eg, inappropriate escalation or overtreatment), underscoring the need for human oversight.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Physician assessment of recommendations.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Treatment appropriateness (1-5), mean (SD)</td><td align="left" valign="bottom">Medication accuracy (1-5), mean (SD)</td><td align="left" valign="bottom">Relevance (1-5), mean (SD)</td><td align="left" valign="bottom">Safety (1-5), mean (SD)</td><td align="left" valign="bottom">Logical reasoning (1-5), mean (SD)</td><td align="left" valign="bottom">Usefulness (1-5), mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Base GLM4-9B</td><td align="left" valign="top">3.28 (0.38)</td><td align="left" valign="top">2.96 (0.37)</td><td align="left" valign="top">3.52 (0.34)</td><td align="left" valign="top">3.60 (0.53)</td><td align="left" valign="top">3.60 (0.50)</td><td align="left" valign="top">3.44 (0.30)</td></tr><tr><td align="left" valign="top">Fine-tuned GLM4-9B</td><td align="left" valign="top">4.72 (0.31)</td><td align="left" valign="top">4.48 (0.43)</td><td align="left" valign="top">4.76 (0.36)</td><td align="left" valign="top">4.66 (0.47)</td><td align="left" valign="top">4.78 (0.20)</td><td align="left" valign="top">4.78 (0.16)</td></tr><tr><td align="left" valign="top">Base Llama 3.1-8B</td><td align="left" valign="top">2.50 (0.53)</td><td align="left" valign="top">2.80 (0.70)</td><td align="left" valign="top">2.94 (0.18)</td><td align="left" valign="top">2.86 (0.29)</td><td align="left" valign="top">2.54 (0.36)</td><td align="left" valign="top">2.60 (0.54)</td></tr><tr><td align="left" valign="top">Fine-tuned Llama 3.1-8B</td><td align="left" valign="top">4.58 (0.47)</td><td 
align="left" valign="top">4.68 (0.43)</td><td align="left" valign="top">4.70 (0.42)</td><td align="left" valign="top">4.40 (0.49)</td><td align="left" valign="top">4.36 (0.57)</td><td align="left" valign="top">4.60 (0.39)</td></tr><tr><td align="left" valign="top">Base Qwen3-8B</td><td align="left" valign="top">3.42 (0.42)</td><td align="left" valign="top">2.98 (0.41)</td><td align="left" valign="top">3.00 (0.85)</td><td align="left" valign="top">3.14 (0.62)</td><td align="left" valign="top">3.32 (0.48)</td><td align="left" valign="top">3.24 (0.51)</td></tr><tr><td align="left" valign="top">Fine-tuned Qwen3-8B</td><td align="left" valign="top">4.50 (0.71)</td><td align="left" valign="top">4.26 (0.50)</td><td align="left" valign="top">4.16 (0.78)</td><td align="left" valign="top">3.80 (0.37)</td><td align="left" valign="top">4.44 (0.55)</td><td align="left" valign="top">4.44 (0.38)</td></tr></tbody></table></table-wrap><p>As shown in <xref ref-type="fig" rid="figure4">Figure 4</xref>, the radar chart highlights the multidimensional improvements achieved through LoRA fine-tuning across all 6 evaluation metrics. For all 3 models, the fine-tuned versions consistently exhibited higher scores than their base counterparts. 
Among the models, GLM4-9B (fine-tuned) achieved the highest performance across nearly all dimensions, particularly in treatment appropriateness, relevance, and overall usefulness.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Radar chart illustrating the multidimensional physician assessment scores of treatment recommendations generated by Llama 3.1-8B, Qwen3-8B, and GLM4-9B before and after low-rank adaptation fine-tuning.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e71541_fig04.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study systematically evaluated the performance of 3 LLMs (Llama 3.1-8B, Qwen3-8B, and GLM4-9B) on a clinical treatment recommendation task for diabetes and explored a parameter-efficient optimization strategy combining instruction-based prompting and LoRA fine-tuning. It should be noted that all 3 models are general-purpose open-source LLMs and were not specifically fine-tuned on medical data, serving as preliminary baselines for clinical recommendation tasks. The untuned base models achieved moderate performance in generating clinically relevant recommendations, with mean BLEU-4 scores ranging from 45.13 to 50.84 and mean ROUGE-L scores ranging from 9.83 to 13.90, among which the base GLM4-9B model performed best across metrics. These findings indicate that even relatively small models possess nontrivial text generation capabilities but exhibit limitations in content relevance and accuracy for specialized clinical tasks. After LoRA fine-tuning, all models showed substantial improvements on both automatic evaluation metrics and clinician assessments. 
BLEU-4 scores increased by approximately 14 to 17 absolute points on average, whereas ROUGE-1 and ROUGE-L scores improved by more than 20 points, suggesting that fine-tuning effectively enhanced lexical similarity and information completeness in the generated texts. In clinician evaluations, scores increased markedly across the 6 dimensions&#x2014;treatment appropriateness, medication accuracy, examination relevance, safety, logical reasoning, and overall usefulness. Among them, fine-tuned GLM4-9B achieved the best performance on key dimensions such as treatment appropriateness, relevance, and overall usefulness, whereas fine-tuned Llama 3.1-8B and Qwen3-8B also demonstrated varying degrees of improvement (<xref ref-type="fig" rid="figure4">Figure 4</xref>). The radar chart clearly illustrates these multidimensional gains, underscoring the effectiveness of combining LoRA with instruction-based prompting to enhance clinical applicability. Furthermore, by leveraging an RAG framework and an agent-based data processing and knowledge retrieval pipeline (<xref ref-type="fig" rid="figure1">Figure 1</xref>), the models can dynamically access up-to-date institutional clinical guidelines, medication order databases, and diabetes treatment standards, thereby enabling individualized, context-aware clinical recommendations. This approach not only improves the accuracy and safety of the generated suggestions but also strengthens their interpretability and practical usability in clinical settings. After repeated debugging and testing of the model, we found that, for a subset of patients with complex medical records, the model output can be potentially harmful and fail to assist health care professionals in their treatment.
This situation can occur for the following reasons:</p><list list-type="bullet"><list-item><p>Data imbalance or sample bias [<xref ref-type="bibr" rid="ref19">19</xref>]: the model may have been trained on an overrepresentation of certain types of patient records, leading to an inadequate understanding of other types of patients. This bias can lead to inaccurate or harmful treatment recommendations for certain patients.</p></list-item><list-item><p>Unknown or rare scenarios [<xref ref-type="bibr" rid="ref20">20</xref>]: if the model encounters unfamiliar or infrequent situations during training, it may struggle to make accurate predictions or appropriate recommendations. Complex patient records often contain such unknown scenarios, rendering the model&#x2019;s output ineffective.</p></list-item><list-item><p>Limitations of the model [<xref ref-type="bibr" rid="ref21">21</xref>]: the model may have inherent design or training limitations that prevent it from adequately accounting for factors specific to complex medical records. As a result, the model&#x2019;s outputs may lack accuracy or reliability in these cases. Additionally, the fact that the models are not medically fine-tuned may contribute to these limitations, highlighting the importance of exploring medically fine-tuned LLMs and larger, multi-institutional datasets in future research.</p></list-item><list-item><p>The size of the training data has a direct impact on the performance of large models. Therefore, it is crucial for us to explore methods to increase the scale of our dataset (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></list-item></list></sec><sec id="s4-2"><title>Limitations</title><p>This study has several limitations. First, the proposed framework has been evaluated only in a controlled experimental setting and has not yet been prospectively validated in real-world clinical workflows. 
As such, its clinical effectiveness and operational impact remain to be systematically assessed. Second, we only used relatively small-scale models, which have substantially fewer parameters than larger frontier models such as DeepSeek R1 [<xref ref-type="bibr" rid="ref22">22</xref>], Qwen3-32B, or Llama 3-65B. While the smaller models reduce computational requirements and facilitate low-cost intranet deployment, they may limit performance and generalization compared with larger models. Third, the training data for this study were collected from a single medical institution and are relatively limited in size, which may affect the robustness and external validity of the results. It is important to understand that, while LLMs often perform well in many scenarios, they may have limitations when dealing with complex medical records. Therefore, model outputs should not be the sole basis for decision-making. Health care professionals should rely on their expertise and clinical judgment and integrate model outputs with comprehensive assessments to make informed decisions [<xref ref-type="bibr" rid="ref23">23</xref>]. Additionally, we plan to fine-tune domain-specific medical LLMs (such as MedGemma 27B [<xref ref-type="bibr" rid="ref24">24</xref>]) and explore the integration of medical knowledge graphs to incorporate structured clinical knowledge, enhance model reasoning [<xref ref-type="bibr" rid="ref25">25</xref>], and further improve the consistency and reliability of the generated recommendations. Future research will also focus on leveraging larger models and multi-institutional datasets to strengthen model performance, generalizability, and robustness.</p></sec><sec id="s4-3"><title>Conclusions</title><p>Overall, this study validates the feasibility and effectiveness of combining small-scale LLMs with LoRA fine-tuning and an RAG- and agent-assisted data processing and knowledge retrieval strategy for clinical treatment recommendations in diabetes. 
The fine-tuned models not only achieved superior performance on automated text generation metrics but also generated treatment recommendations deemed safe, clinically appropriate, and of substantial reference value in structured clinician evaluations. These findings provide a viable pathway for the responsible deployment of LLMs in real-world medical applications. Future research should scale up training datasets; extend validation to a broader range of disease entities; and incorporate longitudinal real-world evidence to further assess long-term clinical effectiveness, safety, and generalizability.</p></sec></sec></body><back><ack><p>The authors sincerely thank Drs Jing Zhao, Yuhang Zhang, Yan Tang, Zengzhen Lai, and Benyong Mi for their valuable contributions in evaluating the model outputs and providing insightful feedback that enhanced the quality and rigor of this study. The authors used ChatGPT (OpenAI) only for English-language polishing during the preparation of this manuscript. The authors reviewed and edited all content and take full responsibility for the final version of the manuscript.</p></ack><notes><sec><title>Funding</title><p>This work was supported by the Chengdu Medical Research Project (grant 2022387).</p></sec><sec><title>Data Availability</title><p>Considering data security, the data will not be publicly available on the website for now. However, the data and code used in this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>JL and SH conceptualized and designed the study, performed the experiments, conducted model analyses, analyzed the data, and drafted the manuscript. JL provided clinical guidance, oversaw data processing, and contributed to manuscript revisions. 
All authors contributed to manuscript revisions, approved the final version, and accept accountability for all aspects of the work.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BLEU-4</term><def><p>Bilingual Evaluation Understudy for 4-grams</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">LoRA</term><def><p>low-rank adaptation</p></def></def-item><def-item><term id="abb4">RAG</term><def><p>retrieval-augmented generation</p></def></def-item><def-item><term id="abb5">ROUGE-1</term><def><p>Recall-Oriented Understudy for Gisting Evaluation for overlap of unigrams</p></def></def-item><def-item><term id="abb6">ROUGE-2</term><def><p>Recall-Oriented Understudy for Gisting Evaluation for overlap of bigrams</p></def></def-item><def-item><term id="abb7">ROUGE-L</term><def><p>Recall-Oriented Understudy for Gisting Evaluation&#x2013;Longest Common Subsequence</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Artasensi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pedretti</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vistoli</surname><given-names>G</given-names> </name><name name-style="western"><surname>Fumagalli</surname><given-names>L</given-names> </name></person-group><article-title>Type 2 diabetes mellitus: a review of multi-target drugs</article-title><source>Molecules</source><year>2020</year><month>04</month><day>23</day><volume>25</volume><issue>8</issue><fpage>1987</fpage><pub-id pub-id-type="doi">10.3390/molecules25081987</pub-id><pub-id pub-id-type="medline">32340373</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation 
citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Tayyab</surname><given-names>SL</given-names> </name><name name-style="western"><surname>Seher</surname><given-names>W</given-names> </name><name name-style="western"><surname>Hussain</surname><given-names>K</given-names> </name><name name-style="western"><surname>Murtaza</surname><given-names>I</given-names> </name></person-group><article-title>Diabetes: a global health concern and potential strategies to reduce its prevalence</article-title><source>Integrated Science for Sustainable Development Goal 3</source><year>2024</year><publisher-name>Springer</publisher-name><fpage>329</fpage><lpage>348</lpage><pub-id pub-id-type="doi">10.1007/978-3-031-64288-3_14</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landgraf</surname><given-names>R</given-names> </name><name name-style="western"><surname>Aberle</surname><given-names>J</given-names> </name><name name-style="western"><surname>Birkenfeld</surname><given-names>AL</given-names> </name><etal/></person-group><article-title>Therapy of type 2 diabetes</article-title><source>Exp Clin Endocrinol Diabetes</source><year>2019</year><month>12</month><volume>127</volume><issue>S 01</issue><fpage>S73</fpage><lpage>S92</lpage><pub-id pub-id-type="doi">10.1055/a-1018-9106</pub-id><pub-id pub-id-type="medline">31860927</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eghbali-Zarch</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tavakkoli-Moghaddam</surname><given-names>R</given-names> </name><name name-style="western"><surname>Esfahanian</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Azaron</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sepehri</surname><given-names>MM</given-names> </name></person-group><article-title>A Markov decision process for modeling adverse drug reactions in medication treatment of type 2 diabetes</article-title><source>Proc Inst Mech Eng H</source><year>2019</year><month>08</month><volume>233</volume><issue>8</issue><fpage>793</fpage><lpage>811</lpage><pub-id pub-id-type="doi">10.1177/0954411919853394</pub-id><pub-id pub-id-type="medline">31177917</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Eloundou</surname><given-names>T</given-names> </name><name name-style="western"><surname>Manning</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mishkin</surname><given-names>P</given-names> </name><name name-style="western"><surname>Rock</surname><given-names>D</given-names> </name></person-group><article-title>GPTs are GPTs: an early look at the labor market impact potential of large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 17, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.10130</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>X</given-names> </name><etal/></person-group><article-title>DeID-GPT: zero-shot medical text de-identification by GPT-4</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 20, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.11032</pub-id></nlm-citation></ref><ref 
id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Patterson</surname><given-names>BL</given-names> </name><etal/></person-group><article-title>Using AI-generated suggestions from ChatGPT to optimize clinical decision support</article-title><source>J Am Med Inform Assoc</source><year>2023</year><month>06</month><day>20</day><volume>30</volume><issue>7</issue><fpage>1237</fpage><lpage>1245</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad072</pub-id><pub-id pub-id-type="medline">37087108</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Nori</surname><given-names>H</given-names> </name><name name-style="western"><surname>King</surname><given-names>N</given-names> </name><name name-style="western"><surname>McKinney</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Carignan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Horvitz</surname><given-names>E</given-names> </name></person-group><article-title>Capabilities of GPT-4 on medical challenge problems</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 20, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.13375</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> 
</name><etal/></person-group><article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? The implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Li</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Big data health care platform with multisource heterogeneous data integration and massive high-dimensional data governance for large hospitals: design, development, and application</article-title><source>JMIR Med Inform</source><year>2022</year><month>04</month><day>13</day><volume>10</volume><issue>4</issue><fpage>e36481</fpage><pub-id pub-id-type="doi">10.2196/36481</pub-id><pub-id pub-id-type="medline">35416792</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Shu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Ge</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Llama scope: extracting millions of features from Llama-3.1-8B with sparse autoencoders</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 27, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2410.20526</pub-id></nlm-citation></ref><ref 
id="ref12"><label>12</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Li</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Qwen3 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  May 14, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2505.09388</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>Team GLM</collab></person-group><article-title>ChatGLM: a family of large language models from GLM-130B to GLM-4 all tools</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 18, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.12793</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wallis</surname><given-names>P</given-names> </name><etal/></person-group><article-title>LoRA: low-rank adaptation of large language models</article-title><access-date>2026-01-18</access-date><conf-name>Proceedings of the International Conference on Learning Representations</conf-name><conf-date>Apr 25-29, 2022</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://openreview.net/pdf?id=nZeVKeeFYf9">https://openreview.net/pdf?id=nZeVKeeFYf9</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name
name-style="western"><surname>Post</surname><given-names>M</given-names> </name></person-group><article-title>A call for clarity in reporting BLEU scores</article-title><conf-name>Proceedings of the Third Conference on Machine Translation</conf-name><conf-date>Oct 31 to Nov 1, 2018</conf-date><pub-id pub-id-type="doi">10.18653/v1/W18-6319</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>CY</given-names> </name></person-group><article-title>ROUGE: a package for automatic evaluation of summaries</article-title><access-date>2026-01-27</access-date><conf-name>Text Summarization Branches Out</conf-name><conf-date>Jul 25-26, 2004</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/W04-1013/">https://aclanthology.org/W04-1013/</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Joshi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kale</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chandel</surname><given-names>S</given-names> </name><name name-style="western"><surname>Pal</surname><given-names>DK</given-names> </name></person-group><article-title>Likert scale: explored and explained</article-title><source>Br J Appl Sci Technol</source><year>2015</year><volume>7</volume><issue>4</issue><fpage>396</fpage><lpage>403</lpage><pub-id pub-id-type="doi">10.9734/BJAST/2015/14975</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lewis</surname><given-names>P</given-names> </name><name name-style="western"><surname>Perez</surname><given-names>E</given-names> 
</name><name name-style="western"><surname>Piktus</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Retrieval-augmented generation for knowledge-intensive NLP tasks</article-title><conf-name>Proceedings of the 34th International Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 6-12, 2020</conf-date><pub-id pub-id-type="doi">10.5555/3495724.3496517</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Suenghataiphorn</surname><given-names>T</given-names> </name><name name-style="western"><surname>Tribuddharat</surname><given-names>N</given-names> </name><name name-style="western"><surname>Danpanichkul</surname><given-names>P</given-names> </name><name name-style="western"><surname>Kulthamrongsri</surname><given-names>N</given-names> </name></person-group><article-title>Bias in large language models across clinical applications: a systematic review</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 3, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2504.02917</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>L</given-names> </name><name name-style="western"><surname>Di Eugenio</surname><given-names>B</given-names> </name></person-group><article-title>Unveiling performance challenges of large language models in low-resource healthcare: a demographic fairness perspective</article-title><access-date>2026-01-18</access-date><conf-name>Proceedings of the 31st International Conference on Computational Linguistics</conf-name><conf-date>Jan 19-24, 2025</conf-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://aclanthology.org/2025.coling-main.485/">https://aclanthology.org/2025.coling-main.485/</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maity</surname><given-names>S</given-names> </name><name name-style="western"><surname>Saikia</surname><given-names>MJ</given-names> </name></person-group><article-title>Large language models in healthcare and medical applications: a review</article-title><source>Bioengineering (Basel)</source><year>2025</year><month>06</month><day>10</day><volume>12</volume><issue>6</issue><fpage>631</fpage><pub-id pub-id-type="doi">10.3390/bioengineering12060631</pub-id><pub-id pub-id-type="medline">40564447</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><etal/></person-group><article-title>DeepSeek-R1: incentivizing reasoning capability in LLMs via reinforcement learning</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 22, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2501.12948</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Antaki</surname><given-names>F</given-names> </name><name name-style="western"><surname>Touma</surname><given-names>S</given-names> </name><name name-style="western"><surname>Milad</surname><given-names>D</given-names> </name><name name-style="western"><surname>El-Khoury</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Duval</surname><given-names>R</given-names> </name></person-group><article-title>Evaluating the performance of ChatGPT in ophthalmology: an analysis of its successes and shortcomings</article-title><source>Ophthalmol Sci</source><year>2023</year><month>05</month><day>5</day><volume>3</volume><issue>4</issue><fpage>100324</fpage><pub-id pub-id-type="doi">10.1016/j.xops.2023.100324</pub-id><pub-id pub-id-type="medline">37334036</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sellergren</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kazemzadeh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jaroensri</surname><given-names>T</given-names> </name><etal/></person-group><article-title>MedGemma technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 7, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2507.05201</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Sierra</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>B</given-names> </name></person-group><article-title>Large language model-driven knowledge graph construction in sepsis care using multicenter clinical databases: development and usability study</article-title><source>J Med Internet Res</source><year>2025</year><month>03</month><day>27</day><volume>27</volume><fpage>e65537</fpage><pub-id 
pub-id-type="doi">10.2196/65537</pub-id><pub-id pub-id-type="medline">40146985</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary material demonstrating the outputs of the large language models before and after fine-tuning. Screenshots illustrate how different fine-tuning strategies (P-tuning and low-rank adaptation) improve clinical response accuracy; provide structured treatment suggestions; and reduce harmful or inaccurate recommendations in typical, complex, and rare patient scenarios.</p><media xlink:href="formative_v10i1e71541_app1.docx" xlink:title="DOCX File, 691 KB"/></supplementary-material></app-group></back></article>