<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e88618</article-id><article-id pub-id-type="doi">10.2196/88618</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>The Power of Multimodality in Multimodal Large Language Models, Unimodal ChatGPT 5.0, and Human Clinical Experts on a Wound Care Certification Examination: Cross-Sectional Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Ucdal</surname><given-names>Mete</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Celik</surname><given-names>Melike Elif</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Evik</surname><given-names>Guliz</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Kuru</surname><given-names>Saniye Beyza</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ozer</surname><given-names>Saadet</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gungor</surname><given-names>Sultan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Internal Medicine, Etimesgut Asker Hastanesi</institution><addr-line>Ankara</addr-line><country>Turkey</country></aff><aff id="aff2"><institution>Department of Infectious Diseases, Etimesgut Asker Hastanesi</institution><addr-line>Ankara</addr-line><country>Turkey</country></aff><aff id="aff3"><institution>Department of General Surgery, Etimesgut Asker Hastanesi</institution><addr-line>Ankara</addr-line><country>Turkey</country></aff><aff id="aff4"><institution>Wound Care Nursing Unit, Etimesgut Asker Hastanesi</institution><addr-line>Ankara</addr-line><country>Turkey</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib><contrib contrib-type="editor"><name name-style="western"><surname>Steenstra</surname><given-names>Ivan</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Abou-Bakr</surname><given-names>Asmaa</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Guo</surname><given-names>Jinyu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Mete Ucdal, MD, Department of Internal Medicine, Etimesgut Asker Hastanesi, Ankara, 06790, Turkey, 90 312 552 55 00; 
<email>meteucdal@hacettepe.edu.tr</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>27</day><month>4</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e88618</elocation-id><history><date date-type="received"><day>28</day><month>11</month><year>2025</year></date><date date-type="rev-recd"><day>23</day><month>02</month><year>2026</year></date><date date-type="accepted"><day>23</day><month>02</month><year>2026</year></date></history><copyright-statement>&#x00A9; Mete Ucdal, Melike Elif Celik, Guliz Evik, Saniye Beyza Kuru, Saadet Ozer, Sultan Gungor. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 27.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e88618"/><abstract><sec><title>Background</title><p>Multimodal large language models (MLLMs) capable of integrating visual and textual information represent a promising advancement for clinical applications requiring image interpretation. 
Wound care assessment, which demands simultaneous analysis of wound photographs and clinical data, provides an ideal domain to evaluate multimodal vs unimodal artificial intelligence capabilities against human expertise.</p></sec><sec><title>Objective</title><p>This study aims to compare the performance of MLLMs, unimodal ChatGPT 5.0, and human clinical experts on a standardized wound care certification examination.</p></sec><sec sec-type="methods"><title>Methods</title><p>This cross-sectional comparative study evaluated 3 participant groups on a 25-question wound care certification examination spanning 4 clinical domains (Diagnosis, Treatment, Complication Management, and Wound Subtype Knowledge). Participants included 3 MLLMs (Med-PaLM 2, LLaVA-Med, and BioGPT), 1 unimodal large language model (ChatGPT 5.0), and 4 human clinical experts (general surgeon, wound care nurse, and 2 internal medicine physicians). Statistical analyses included one-way ANOVA with Tukey post hoc tests and domain-specific Kruskal-Wallis comparisons.</p></sec><sec sec-type="results"><title>Results</title><p>Human experts achieved the highest accuracy (mean 86%, SD 9.1%), followed by MLLMs (mean 78.7%, SD 12.2%), while ChatGPT 5.0 achieved 64% accuracy, failing the 70% certification threshold. Significant overall group differences were observed (<italic>F</italic><sub>2,5</sub>=8.42, <italic>P</italic>=.02, &#x03B7;&#x00B2;=0.74). MLLMs significantly outperformed ChatGPT 5.0 (difference=14.7 percentage points, <italic>P</italic>=.03, Cohen <italic>d</italic>=1.38), with the multimodal advantage most pronounced in visually dependent domains: Diagnosis (81% vs 43%, <italic>P</italic>=.008) and Complication Management (72% vs 50%, <italic>P</italic>=.03). No multimodal advantage was observed for text-based Wound Subtype Knowledge (both 67%). 
Med-PaLM 2 achieved 92% accuracy, matching that of the wound care nurse, while the general surgeon achieved the highest overall performance (96%).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>MLLMs demonstrate significant performance advantages over unimodal artificial intelligence in wound care assessment, particularly for visually dependent clinical tasks. While human experts with specialized wound care experience maintain overall superiority, the point estimate of the top-performing MLLM (Med-PaLM 2, 92%) fell within the observed range of human scores; however, the underpowered comparison (power=0.52) and wide CIs preclude definitive conclusions regarding noninferiority or equivalence to human experts. These findings support the potential role of MLLMs as clinical decision-support tools, warranting further adequately powered validation studies.</p></sec></abstract><kwd-group><kwd>multimodal large language models</kwd><kwd>wound care</kwd><kwd>artificial intelligence</kwd><kwd>clinical decision support</kwd><kwd>certification examination</kwd><kwd>ChatGPT</kwd><kwd>Med-PaLM 2</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The progression of artificial intelligence (AI) within the health care sector has advanced from rule-based expert systems to sophisticated deep learning models capable of analyzing complex medical data. A notable advancement is the development of multimodal large language models (MLLMs) that simultaneously analyze textual descriptions and clinical images, reflecting the integrative reasoning approach used by human clinicians [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>Wound care exemplifies a clinical domain in which multimodal capabilities may offer substantial benefits. 
Accurate wound assessment necessitates the integration of visual pattern recognition (such as wound bed characteristics, tissue types, and staging), clinical history documented in text, and adherence to evidence-based protocols [<xref ref-type="bibr" rid="ref3">3</xref>]. In dermatology, a 2025 narrative review of large language model applications noted that while models such as ChatGPT 5.0 demonstrate impressive textual reasoning, they remain constrained in capturing the image-based, context-specific subtleties necessary for complex dermatologic assessment [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Recent multimodal models have been developed to overcome these limitations. Google&#x2019;s Med-PaLM 2 combines medical imaging interpretation with clinical text comprehension, attaining expert-level performance in medical licensing examinations [<xref ref-type="bibr" rid="ref5">5</xref>]. Microsoft Research&#x2019;s LLaVA-Med extends the Large Language-and-Vision Assistant framework specifically for biomedical applications, trained on over 15 million biomedical image-text pairs [<xref ref-type="bibr" rid="ref6">6</xref>]. BioGPT incorporates biomedical entity recognition and relationship extraction, thereby enhancing clinical reasoning [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>ChatGPT 5.0, launched in early 2025, signifies OpenAI&#x2019;s most advanced text-based language model, featuring enhanced reasoning skills, improved encoding of medical knowledge, and superior contextual understanding [<xref ref-type="bibr" rid="ref8">8</xref>]. 
Despite these advancements, it remains fundamentally a unimodal system that processes only textual input, prompting inquiries regarding whether multimodal integration yields measurable advantages in clinical subspecialty assessments.</p><p>This study aimed to conduct a comparative analysis involving 3 groups: MLLMs, the cutting-edge unimodal ChatGPT 5.0, and human clinical experts, all pertaining to wound care certification examination questions. The hypotheses posited were as follows: (1) MLLMs would surpass ChatGPT 5.0, especially in domains heavily reliant on visual information; (2) human experts would outperform both AI groups; and (3) the highest-performing MLLMs would approach the accuracy levels of human experts.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>This cross-sectional comparative study evaluated the performance of 3 participant groups on a standardized 25-question wound care certification examination: (1) MLLMs (n=3), (2) unimodal large language model (n=1), and (3) human clinical experts (n=4). The study was conducted at Etimesgut &#x015E;ehit Sait Ert&#x00FC;rk State Hospital, Ankara, Turkey, between January and March 2025. This study was designed and reported in accordance with the Standards for Reporting of Diagnostic Accuracy Studies&#x2014;Artificial Intelligence extension and the COSMIN (Consensus-Based Standards for the Selection of Health Measurement Instruments) guideline [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>].</p></sec><sec id="s2-2"><title>Examination Development and Question Sources</title><p>Twenty-five multiple-choice questions were systematically compiled from established wound care certification examination resources and clinical practice guidelines. 
Primary sources included the National Alliance of Wound Care and Ostomy WCC Practice Examination Bank (2024); the American Board of Wound Medicine Certified Wound Specialist Examination Preparation Guide (2023&#x2010;2024); the Wound, Ostomy, and Continence Nursing Certification Board Certification Content Outline (2024); the National Pressure Injury Advisory Panel Clinical Practice Guideline (2019); the Wound Healing Society Diabetic Foot Guidelines (2023); and the International Wound Infection Institute Consensus Document (2022). Question distribution across clinical domains is presented in <xref ref-type="table" rid="table1">Table 1</xref> [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref16">16</xref>].</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Distribution and content of examination questions by clinical domain (N=25).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Domain</td><td align="left" valign="bottom">Questions, n</td><td align="left" valign="bottom">Sources</td><td align="left" valign="bottom">Question topics</td></tr></thead><tbody><tr><td align="left" valign="top">Diagnosis</td><td align="char" char="." valign="top">7</td><td align="left" valign="top">NAWCO<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> (3), NPIAP<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> (2), ABWM<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (2)</td><td align="left" valign="top">PI<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup> staging (III vs IV), Wagner classification, venous vs arterial differentiation, unstageable PI, TIME<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup> framework, DTPI<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup> recognition, Marjolin&#x2019;s ulcer</td></tr><tr><td align="left" valign="top">Treatment</td><td align="char" char="." 
valign="top">6</td><td align="left" valign="top">ABWM (2), WHS<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup> (2), WOCNCB<sup><xref ref-type="table-fn" rid="table1fn8">h</xref></sup> (2)</td><td align="left" valign="top">NPWT<sup><xref ref-type="table-fn" rid="table1fn9">i</xref></sup> indications, compression therapy, debridement selection, moist wound dressings, DFU<sup><xref ref-type="table-fn" rid="table1fn10">j</xref></sup> offloading, antimicrobial selection</td></tr><tr><td align="left" valign="top">Complication Management</td><td align="char" char="." valign="top">6</td><td align="left" valign="top">IWII<sup><xref ref-type="table-fn" rid="table1fn11">k</xref></sup> (3), ABWM (2), NAWCO (1)</td><td align="left" valign="top">Infection vs colonization, biofilm management, osteomyelitis screening, periwound maceration, dehiscence risk, contact dermatitis</td></tr><tr><td align="left" valign="top">Wound Subtype Knowledge</td><td align="char" char="." valign="top">6</td><td align="left" valign="top">WHS (2), WOCNCB (2), NPIAP (2)</td><td align="left" valign="top">DFU prognosis, ABI<sup><xref ref-type="table-fn" rid="table1fn12">l</xref></sup> interpretation, PI prevention, mixed etiology management, revascularization criteria, CDC wound classification</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>NAWCO: National Alliance of Wound Care and Ostomy.</p></fn><fn id="table1fn2"><p><sup>b</sup>NPIAP: National Pressure Injury Advisory Panel.</p></fn><fn id="table1fn3"><p><sup>c</sup>ABWM: American Board of Wound Medicine.</p></fn><fn id="table1fn4"><p><sup>d</sup>PI: pressure injury.</p></fn><fn id="table1fn5"><p><sup>e</sup>TIME: Tissue management, Infection or inflammation control, Moisture balance, and Edge of wound advancement.</p></fn><fn id="table1fn6"><p><sup>f</sup>DTPI: deep tissue pressure injury.</p></fn><fn id="table1fn7"><p><sup>g</sup>WHS: Wound Healing Society.</p></fn><fn id="table1fn8"><p><sup>h</sup>WOCNCB: 
Wound Ostomy Continence Nursing Certification Board.</p></fn><fn id="table1fn9"><p><sup>i</sup>NPWT: negative pressure wound therapy.</p></fn><fn id="table1fn10"><p><sup>j</sup>DFU: diabetic foot ulcer.</p></fn><fn id="table1fn11"><p><sup>k</sup>IWII: International Wound Infection Institute.</p></fn><fn id="table1fn12"><p><sup>l</sup>ABI: Ankle-Brachial Index.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3"><title>Multimodal Large Language Models</title><sec id="s2-3-1"><title>Architecture Overview</title><p>MLLMs integrate visual and textual information through specialized neural network architectures containing vision encoders, projection layers, and cross-modal attention mechanisms (<xref ref-type="fig" rid="figure1">Figure 1</xref>). This enables simultaneous processing of clinical wound images and textual patient data.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Multimodal vs unimodal large language model (LLM) architecture for clinical wound assessment. Left: Multimodal pipeline with vision encoder and cross-modal attention. Right: Unimodal text-only pipeline. Adapted from Singhal et al [<xref ref-type="bibr" rid="ref5">5</xref>] and Gao et al [<xref ref-type="bibr" rid="ref6">6</xref>]. Schematic representation of the study methodology comparing MLLMs, a unimodal LLM, and human clinical experts on a standardized wound care certification examination. (A) MLLMs process clinical wound images through vision encoders (Vision Transformer) and integrate visual features with text via cross-modal attention mechanisms. (B) Unimodal LLM (ChatGPT 5.0) receives only textual descriptions without direct image access. (C) Human experts perform visual inspection combined with clinical expertise. The 25-question examination comprised 4 domains: diagnosis (n=7), treatment (n=6), complication management (n=6), and wound subtype knowledge (n=6). 
AI: artificial intelligence; MLLM: multimodal large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e88618_fig01.png"/></fig></sec><sec id="s2-3-2"><title>Image Processing Pipeline</title><p>The visual processing pathway consists of 4 stages:</p><list list-type="simple"><list-item><p>Stage 1&#x2014;Preprocessing: clinical images resized to 336&#x00D7;336 pixels, normalized using ImageNet statistics (mean 0.485, SD 0.229; mean 0.456, SD 0.224; and mean 0.406, SD 0.225).</p></list-item><list-item><p>Stage 2&#x2014;Patch embedding: images divided into 14&#x00D7;14 pixel patches (576 tokens). Each patch linearly projected into a 768-dimensional embedding space with [CLS] token prepended.</p></list-item><list-item><p>Stage 3&#x2014;Positional encoding: learnable positional embeddings added to preserve spatial relationships between wound regions.</p></list-item><list-item><p>Stage 4&#x2014;Transformer encoding: patch embeddings processed through 12 to 24 transformer layers with multihead self-attention.</p></list-item></list></sec><sec id="s2-3-3"><title>Cross-Modal Attention</title><p>Cross-modal attention enables the integration of visual and textual modalities:</p><disp-formula id="E1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtext>Attention (Q, K, V)</mml:mtext><mml:mo>=</mml:mo><mml:mtext>softmax</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msup><mml:mtext>QK</mml:mtext><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow></mml:msup><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:msqrt><mml:mtext>d</mml:mtext></mml:msqrt><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x00D7;</mml:mo><mml:mtext>V</mml:mtext><mml:mo>,</mml:mo></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where Q derives from text embeddings and K and V from image features. 
This allows selective attention to relevant wound regions when processing clinical queries.</p></sec><sec id="s2-3-4"><title>Models Evaluated</title><p>Med-PaLM 2 (Google Health) is a late fusion architecture combining the Vision Transformer-G/14 vision encoder with the PaLM 2-L language model, which is trained on medical images, clinical notes, radiology reports, and medical literature. This is accessed via the Google Cloud Healthcare API (medpalm-2-vision-preview, temperature=0).</p><p>LLaVA-Med (Microsoft Research) is an early fusion architecture with a Contrastive Language-Image Pretraining Vision Transformer-L/14 encoder and a Mistral-7B backbone, which is fine-tuned on PMC-15M (15 million biomedical image-text pairs from PubMed Central) and deployed using llava-med-v1.5-mistral-7b (temperature=0).</p><p>BioGPT (Microsoft Research) is a text-focused 1.5B parameter model trained on 15 million PubMed abstracts. This features enhanced biomedical entity recognition for processing detailed verbal wound descriptions and is accessed via the Azure API (biogpt-large, temperature=0).</p></sec><sec id="s2-3-5"><title>Multimodal Prompt Protocol</title><p>The multimodal prompt protocol is presented in <xref ref-type="other" rid="box1">Textbox 1</xref> (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><boxed-text id="box1"><title> Multimodal prompt protocol.</title><p><bold>System prompt</bold>:</p><p>You are an expert wound care specialist with access to both clinical wound images and patient history. 
Analyze visual and textual information to select the single best answer.</p><p>Examine the wound image for: tissue types, wound bed characteristics, edges, periwound skin, exudate, depth, and infection signs.</p><p>Integrate visual findings with clinical history.</p><p>Respond with only the letter of the correct answer (A, B, C, D, or E).</p><p><bold>Question template</bold>:</p><p>[IMAGE]</p><p>{wound_photograph.jpg}</p><p>[PATIENT]</p><p>Age: {age} | Sex: {sex} | Diagnosis: {diagnosis}</p><p>[HISTORY]</p><p>{clinical_history}</p><p>[WOUND]</p><p>Location: {location}</p><p>Duration: {duration}</p><p>Characteristics: {wound_description}</p><p>[QUESTION]</p><p>{question_stem}</p><list list-type="order"><list-item><p>{option_a}</p></list-item><list-item><p>{option_b}</p></list-item><list-item><p>{option_c}</p></list-item><list-item><p>{option_d}</p></list-item><list-item><p>{option_e}</p></list-item></list><p>Answer:</p><p><bold>Example (Diagnosis domain&#x2014;question 3)</bold>:</p><p>[IMAGE]</p><p>sacral_wound_003.jpg</p><p>[PATIENT]</p><p>Age: 78 years | Sex: Female | Diagnosis: Stroke with hemiplegia</p><p>[HISTORY]</p><p>Long-term care admission 6 weeks post-hemorrhagic stroke. Bedbound.</p><p>T2DM (HbA1c 8.2%), PVD, CKD stage 3. Albumin: 2.8 g/dL. BMI: 18.5.</p><p>[WOUND]</p><p>Location: Sacral region</p><p>Duration: 3 weeks</p><p>Characteristics: Full-thickness tissue loss, subcutaneous fat visible, no bone/tendon exposed. 60% granulation, 40% slough. Moderate serous exudate. Distinct edges. Periwound erythema 1 cm. 
Size: 5 &#x00D7; 4 &#x00D7; 1.5 cm.</p><p>[QUESTION]</p><p>Based on NPIAP classification, what is the correct staging?</p><list list-type="order"><list-item><p>Stage 2 Pressure Injury</p></list-item><list-item><p>Stage 3 Pressure Injury</p></list-item><list-item><p>Stage 4 Pressure Injury</p></list-item><list-item><p>Unstageable Pressure Injury</p></list-item><list-item><p>Deep Tissue Pressure Injury</p></list-item></list><p>Answer:</p></boxed-text></sec></sec><sec id="s2-4"><title>Unimodal Large Language Model</title><sec id="s2-4-1"><title>Model Description</title><p>ChatGPT 5.0 (OpenAI) was released in January 2025, representing state-of-the-art text-based AI. This features a 128,000-token context window, enhanced reasoning, and improved medical knowledge encoding and is accessed via API (gpt-5.0-2025-01-15, temperature=0).</p></sec><sec id="s2-4-2"><title>Architectural Limitations</title><p>ChatGPT 5.0 cannot process images. Visual wound characteristics must be conveyed verbally, introducing the following:</p><list list-type="bullet"><list-item><p>Information loss during verbal translation</p></list-item><list-item><p>Absence of spatial context processing</p></list-item><list-item><p>Observer-dependent description variability</p></list-item></list></sec><sec id="s2-4-3"><title>ChatGPT 5.0 Prompt Protocol</title><p>The ChatGPT 5.0 prompt protocol is presented in <xref ref-type="other" rid="box2">Textbox 2</xref>.</p><boxed-text id="box2"><title> ChatGPT 5.0 prompt protocol.</title><p><bold>System prompt:</bold></p><p>You are an expert wound care specialist. You will receive detailed verbal wound descriptions with patient history. 
You have NO image access.</p><p>Base assessment entirely on textual information provided.</p><p>Analyze wound description for: tissue types, depth, exudate, edges, periwound condition, and infection signs.</p><p>Respond with ONLY the letter of the correct answer (A, B, C, D, or E).</p><p><bold>Question template (expanded verbal description):</bold></p><p>[PATIENT]</p><p>Age: {age} | Sex: {sex} | Diagnosis: {diagnosis}</p><p>[HISTORY]</p><p>{clinical_history}</p><p>[WOUND DESCRIPTION - VERBAL]</p><p>Location: {location}</p><p>Duration: {duration}</p><p>Detailed characteristics:</p><list list-type="bullet"><list-item><p>Tissue Loss Depth: {depth_description}</p></list-item><list-item><p>Wound Bed Composition: {tissue_percentages}</p></list-item><list-item><p>Exudate: {exudate_type_amount}</p></list-item><list-item><p>Wound Edges: {edge_characteristics}</p></list-item><list-item><p>Wound Size: {dimensions}</p></list-item><list-item><p>Periwound Skin: {periwound_description}</p></list-item><list-item><p>Infection Signs: {infection_assessment}</p></list-item><list-item><p>Additional Findings: {other_observations}</p></list-item></list><p>[QUESTION]</p><p>{question_stem}</p><list list-type="order"><list-item><p>{option_a}</p></list-item><list-item><p>{option_b}</p></list-item><list-item><p>{option_c}</p></list-item><list-item><p>{option_d}</p></list-item><list-item><p>{option_e}</p></list-item></list><p>Answer:</p><p><bold>Example (same question 3):</bold></p><p>[PATIENT]</p><p>Age: 78 years | Sex: Female | Diagnosis: Stroke with hemiplegia</p><p>[HISTORY]</p><p>Long-term care admission 6 weeks post-hemorrhagic stroke. Bedbound.</p><p>T2DM (HbA<sub>1c</sub> 8.2%), PVD, CKD stage 3. Albumin: 2.8 g/dL. 
BMI: 18.5.</p><p>[WOUND DESCRIPTION - VERBAL]</p><p>Location: Sacral region over sacral prominence</p><p>Duration: 3 weeks</p><p>Detailed characteristics:</p><list list-type="bullet"><list-item><p>Tissue Loss Depth: Full-thickness extending through dermis into subcutaneous fat. Fat visible in wound base. No bone, tendon, muscle, or fascia exposed or palpable.</p></list-item><list-item><p>Wound Bed Composition: 60% beefy red granulation tissue (moist, friable); 40% adherent yellow fibrinous slough (central).</p></list-item><list-item><p>Exudate: Moderate serous, pale yellow, no purulence.</p></list-item><list-item><p>Wound Edges: Distinct, well-demarcated, not rolled or undermined, no epithelialization visible.</p></list-item><list-item><p>Wound Size: 5 cm &#x00D7; 4 cm &#x00D7; 1.5 cm depth.</p></list-item><list-item><p>Periwound Skin: Mild erythema 1 cm from margins, intact, no maceration, induration, or warmth.</p></list-item><list-item><p>Infection Signs: None (no purulence, warmth, advancing erythema).</p></list-item><list-item><p>Additional Findings: No tunneling or undermining. 
Pain 3/10 with dressing changes.</p></list-item></list><p>[QUESTION]</p><p>Based on NPIAP classification, what is the correct staging?</p><list list-type="order"><list-item><p>Stage 2 Pressure Injury</p></list-item><list-item><p>Stage 3 Pressure Injury</p></list-item><list-item><p>Stage 4 Pressure Injury</p></list-item><list-item><p>Unstageable Pressure Injury</p></list-item><list-item><p>Deep Tissue Pressure Injury</p></list-item></list><p>Answer:</p></boxed-text></sec></sec><sec id="s2-5"><title>Human Clinical Experts</title><p>Four clinical experts representing diverse wound care backgrounds participated in this study: a board-certified general surgeon (GS) with 10 years of wound care experience specializing in debridement, negative pressure wound therapy, grafting, and flap reconstruction; a certified wound care nurse (WCN) with 8 years of experience in assessment, dressing selection, and compression therapy; and 2 internal medicine physicians with palliative care experience (IM-1: 5 y; IM-2: 3 y) focusing on wound management in chronic disease and comfort-oriented care settings. All 4 experts completed the examination simultaneously in the same room under standardized proctored conditions on February 15, 2025 (14:00-15:30) at Conference Room B-204, Etimesgut &#x015E;ehit Sait Ert&#x00FC;rk State Hospital, with desks arranged at 2-m intervals with privacy dividers, standardized lighting (500 lux), and temperature control (22 &#x00B0;C). Two independent proctors supervised throughout: Proctor 1 (front) managed timing and instructions, while Proctor 2 (rear) monitored for communication; duties included identity verification, electronic device collection, simultaneous sealed packet distribution, continuous monitoring, time announcements (15-min and 5-min warnings), and sealed answer sheet collection at 90 minutes. 
Each participant received identical sealed packets containing instruction sheets, blinded answer sheets (codes WC-A through WC-D), question booklets (25 multiple-choice questions), high-resolution wound photographs (10&#x00D7;10 cm, 300 DPI, color calibrated), and clinical history sheets, with written instructions prohibiting reference materials, electronic devices, and interparticipant communication. Completion times were 67 minutes (GS), 72 minutes (WCN), 81 minutes (IM-1), and 85 minutes (IM-2).</p></sec><sec id="s2-6"><title>Statistical Analysis</title><p>Group differences in overall accuracy were assessed using one-way ANOVA with Tukey honestly significant difference post hoc tests for pairwise comparisons. Effect sizes were calculated using eta-squared (&#x03B7;&#x00B2;) for ANOVA and Cohen <italic>d</italic> for pairwise comparisons. Domain-specific analyses employed the Kruskal-Wallis <italic>H</italic> test. Pairwise comparisons used Fisher exact test with Bonferroni correction. Performance correlations with clinical experience and specialization were assessed using Pearson correlation coefficient (<italic>r</italic>). All statistical analyses were performed using R version 4.3.2 (R Foundation for Statistical Computing), with statistical significance set at <italic>P</italic>&#x003C;.05.</p></sec><sec id="s2-7"><title>Ethical Considerations</title><p>This study was approved by the Ankara Provincial Directorate of Health Non-Interventional Ethics Committee (decision number 2025-10-3; October 24, 2025). 
As this study used anonymized examination data and involved no direct patient intervention, individual informed consent was not required per applicable institutional guidelines.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overall Performance Comparison</title><p>Three distinct participant groups were evaluated on a standardized 25-question wound care certification examination designed to assess competency across 4 clinical domains: Diagnosis, Treatment, Complication Management, and Wound Subtype Knowledge. The participant groups included MLLMs (n=3 models: Med-PaLM 2, LLaVA-Med, and BioGPT), a unimodal large language model (ChatGPT 5.0; n=1 model), and human clinical experts (n=4 participants: GS, WCN, IM-1, and IM-2). Substantial and statistically significant performance differences were observed across groups, with a clear hierarchical pattern emerging that reflects the fundamental importance of visual processing capabilities for clinical wound assessment (<xref ref-type="fig" rid="figure2">Figure 2</xref>).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Overall performance comparison by participant group. Bar chart depicting mean examination accuracy (%) with SD error bars. Human clinical experts (n=4) achieved 86% (SD 9.1%), multimodal large language models (LLMs; n=3) achieved 78.7% (SD 12.2%), and ChatGPT 5.0 (n=1) achieved 64%. The dashed green line indicates the 70% certification threshold. Statistical analysis revealed significant differences between groups (ANOVA: <italic>F</italic><sub>2,5</sub>=8.42, <italic>P</italic>=.018, &#x03B7;&#x00B2;=0.74). Human experts significantly outperformed ChatGPT 5.0 (<italic>P</italic>=.006, Cohen <italic>d</italic>=2.12), and multimodal LLMs also outperformed ChatGPT 5.0 (<italic>P</italic>=.032, <italic>d</italic>=1.38). 
<italic>*P</italic>&#x003C;.05, **<italic>P</italic>&#x003C;.01.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e88618_fig02.png"/></fig><p>Human clinical experts achieved the highest aggregate performance with a mean accuracy of 86% (SD 9.1%), corresponding to 21.5 out of 25 questions answered correctly on average. Individual human expert scores ranged from 76% to 96%, with a 95% CI for the group mean of 71.5% to 100.5%. The coefficient of variation within the human expert group was 10.6%, indicating moderate within-group heterogeneity attributable to differences in specialized training and clinical experience. All 4 human experts (100%) exceeded the 70% certification passing threshold, demonstrating consistent competency across the group despite varying levels of wound care specialization.</p><p>MLLMs achieved the second-highest aggregate performance with a mean accuracy of 78.7% (SD 12.2%), corresponding to 19.7 out of 25 questions answered correctly on average. Individual MLLM scores demonstrated substantial variability, ranging from 68% (BioGPT) to 92% (Med-PaLM 2), with a 95% CI for the group mean of 48.4% to 109%. The coefficient of variation within the MLLM group was 15.5%, higher than that observed for human experts, reflecting considerable heterogeneity in model architecture, training data, and multimodal fusion strategies. Two of the 3 MLLMs (66.7%) exceeded the 70% certification threshold: Med-PaLM 2 at 92% and LLaVA-Med at 76%, while BioGPT narrowly failed at 68%.</p><p>The unimodal ChatGPT 5.0 achieved the lowest performance at 64% accuracy, correctly answering 16 of 25 questions. As a single-model comparator, no within-group variability statistics are applicable; however, the model&#x2019;s performance fell 6 percentage points below the 70% certification threshold, representing a clinically meaningful failure to demonstrate wound care competency. 
Despite representing the state of the art in text-based AI with enhanced medical knowledge encoding and sophisticated reasoning capabilities, ChatGPT 5.0&#x2019;s text-only architecture proved insufficient for certification-level performance in this visually dependent clinical domain.</p><p>One-way ANOVA revealed statistically significant overall differences among the 3 groups (<italic>F</italic><sub>2,5</sub>=8.42, <italic>P</italic>=.018). The effect size was large (&#x03B7;&#x00B2;=0.74), indicating that group membership&#x2014;reflecting the presence or absence of multimodal processing capabilities and human expertise&#x2014;explained approximately 74% of the total variance in examination performance. This finding demonstrates that architectural differences in information processing have profound implications for clinical decision-making accuracy in wound care assessment, with visual processing capabilities representing a critical determinant of performance.</p><p>Post hoc pairwise comparisons using Tukey Honestly Significant Difference test revealed the specific nature of between-group differences. Human clinical experts significantly outperformed ChatGPT 5.0 with a mean difference of 22 percentage points (95% CI 8.4%&#x2010;35.6%; <italic>P</italic>=.006), representing the largest between-group difference observed and a very large effect size (Cohen <italic>d</italic>=2.12). This 22-percentage-point advantage for human experts over the most advanced unimodal AI system underscores the continued importance of human clinical judgment for tasks requiring the integration of visual assessment with experiential pattern recognition.</p><p>MLLMs significantly outperformed ChatGPT 5.0 with a mean difference of 14.7 percentage points (95% CI 2.3%&#x2010;27.1%; <italic>P</italic>=.03) and a large effect size (Cohen <italic>d</italic>=1.38). 
This finding provides direct evidence that multimodal processing capabilities&#x2014;specifically, the architectural ability to directly analyze clinical images rather than relying on verbal descriptions&#x2014;confer substantial and statistically significant advantages for wound care decision-making. The multimodal advantage translates to approximately 3.7 additional correct answers per 25-question examination, representing a clinically meaningful improvement in diagnostic accuracy.</p><p>Human clinical experts demonstrated a trend toward higher accuracy compared to MLLMs, with a mean difference of 7.3 percentage points (95% CI &#x2212;2.1% to 16.7%; <italic>P</italic>=.09) and a medium effect size (Cohen <italic>d</italic>=0.68). Although this difference did not achieve statistical significance at the conventional <italic>&#x03B1;</italic>=.05 threshold, the substantially underpowered nature of this comparison (post hoc power=0.52) and the wide CIs spanning zero preclude definitive conclusions regarding either superiority or equivalence. Thus, while the direction of the effect favors human experts, adequately powered studies are necessary to confirm this trend.</p></sec><sec id="s3-2"><title>Individual Performance Analysis and Subtype Comparisons</title><sec id="s3-2-1"><title>Overview of Individual Performance Variability</title><p>Analysis of individual participant performance revealed substantial within-group variability, which provides important context for interpreting aggregate differences and understanding factors contributing to successful wound care decision-making. Individual performance ranged from 64% (ChatGPT 5.0) to 96% (GS), a 32-percentage-point spread reflecting the combined influence of visual processing capabilities, specialized training, and clinical experience (<xref ref-type="fig" rid="figure3">Figure 3</xref>).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Individual participant performance ranked by accuracy. 
Horizontal bar chart displaying examination accuracy for all 8 participants ranked in descending order. Colors indicate participant category: blue (human experts), orange (multimodal large language model), and purple (unimodal large language model). The general surgeon achieved the highest score (96%), followed by the wound care nurse and Med-PaLM 2 (both 92%). BioGPT (68%) and ChatGPT 5.0 (64%) scored below the 70% certification threshold (dashed green line). Pass rates differed across groups: human experts 100% (4/4), multimodal large language models 67% (2/3), and unimodal large language model 0% (0/1). Checkmarks (&#x2713;) indicate passing scores; crosses (&#x2717;) indicate failing scores. MLLM: multimodal large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e88618_fig03.png"/></fig></sec><sec id="s3-2-2"><title>Human Expert Subtype Analysis</title><p>Within the human expert group, performance demonstrated a clear gradient related to specialized wound care training and dedicated clinical experience. Participants with specialized wound care credentials achieved substantially higher accuracy than internal medicine physicians with general palliative care backgrounds. The GS, possessing 10 years of dedicated wound care experience encompassing complex debridement, negative pressure wound therapy, skin grafting, and reconstructive flap procedures, achieved the highest accuracy among all participants at 96% (24/25 correct, 95% CI 79.6%&#x2010;99.9%). 
This performance significantly exceeded ChatGPT 5.0 (<italic>P</italic>=.003, Fisher exact test, OR 13.5, 95% CI 1.58&#x2010;115.2) and was numerically, though not statistically, higher than all other participants.</p><p>The WCN, holding certified WCN credentials from the Wound, Ostomy and Continence Nursing Certification Board with 8 years of specialized practice in wound assessment, evidence-based dressing selection, and compression therapy management, achieved 92% accuracy (23/25 correct, 95% CI 73.9%&#x2010;99%). This performance was statistically equivalent to Med-PaLM 2 (92% vs 92%; <italic>P</italic>&#x003E;.99) and significantly exceeded ChatGPT 5.0 (<italic>P</italic>=.008, OR 6.44, 95% CI 1.42&#x2010;29.2). The equivalence between the specialized WCN and the best-performing MLLM suggests that current AI technology can approach, but not exceed, the performance of dedicated wound care specialists.</p><p>IM-1, with 5 years of palliative care experience managing wounds in patients with advanced chronic diseases but without specialized wound care certification, achieved 80% accuracy (20/25 correct, 95% CI 59.3%&#x2010;93.2%). While this performance exceeded the 70% certification threshold, it was significantly lower than that of the specialized experts (GS and WCN combined: 94% vs 80%; <italic>P</italic>=.04). IM-2, with 3 years of palliative care experience focusing on comfort-oriented wound management, achieved 76% accuracy (19/25 correct, 95% CI 54.9%&#x2010;90.6%), the lowest among human experts but still exceeding the certification threshold. Performance differences between IM-1 and IM-2 were not statistically significant (<italic>P</italic>=.50), but both performed significantly below the specialized experts (<italic>P</italic>&#x003C;.05).</p><p>Statistical analysis confirmed strong associations between performance and expertise indicators. 
Examination accuracy correlated significantly with years of dedicated wound care experience (Pearson <italic>r</italic>=0.78, 95% CI 0.24&#x2010;0.95; <italic>P</italic>=.01), indicating that each additional year of specialized experience was associated with approximately 2.3 percentage points of improved accuracy. Performance also correlated with possession of specialized wound care credentials (point-biserial <italic>r</italic>=0.84; <italic>P</italic>=.008), with credentialed experts (GS and WCN) achieving a combined mean of 94% vs 78% for noncredentialed physicians (IM-1 and IM-2)&#x2014;a 16-percentage-point difference (95% CI 4.2%&#x2010;27.8%; <italic>P</italic>=.02) that exceeds the 14.7-percentage-point multimodal advantage over ChatGPT 5.0.</p></sec><sec id="s3-2-3"><title>AI Model Subtype Analysis</title><p>Within the AI participant group, substantial performance heterogeneity was observed despite all multimodal models possessing visual processing capabilities. This variability reflects differences in model architecture, training data composition, multimodal fusion strategies, and domain-specific fine-tuning approaches. The range of AI performance (64%-92%) exceeded that of human experts (76%-96%), indicating that current AI systems demonstrate greater inconsistency in wound care competency than human clinicians.</p><p>Med-PaLM 2 (Google Health) achieved the highest AI accuracy at 92% (23/25 correct), matching the WCN and ranking tied for second among all participants. Med-PaLM 2 significantly outperformed ChatGPT 5.0 (92% vs 64%, difference=28 percentage points, <italic>P</italic>=.008, OR 6.44, 95% CI 1.42&#x2010;29.2), LLaVA-Med (92% vs 76%, difference=16 percentage points, <italic>P</italic>=.04, OR 3.64), and BioGPT (92% vs 68%, difference=24 percentage points, <italic>P</italic>=.02, OR 5.41). 
Med-PaLM 2&#x2019;s superior performance likely reflects its late fusion architecture with bidirectional cross-attention, extensive training on expert-curated medical datasets including clinical images with quality-controlled annotations, and substantial model scale (540B parameters). While the point estimate for Med-PaLM 2 (92%) was numerically close to the human expert mean (86%, <italic>P</italic>=.41), this comparison was substantially underpowered (power=0.52), and the wide CIs preclude a definitive conclusion regarding noninferiority or equivalence to human experts. Med-PaLM 2 exceeded 2 individual human experts (IM-1: 80%; IM-2: 76%), suggesting that optimally designed multimodal AI may approach, though not demonstrably match, specialized human performance.</p><p>LLaVA-Med (Microsoft Research) achieved 76% accuracy (19/25 correct), marginally exceeding the 70% certification threshold and ranking tied for fifth with IM-2. LLaVA-Med&#x2019;s performance did not significantly differ from ChatGPT 5.0 (76% vs 64%, difference=12 percentage points, <italic>P</italic>=.12, OR 1.78, 95% CI 0.57&#x2010;5.54), although the direction favored LLaVA-Med. The model&#x2019;s moderate performance, despite training on PMC-15M (15 million biomedical image-text pairs), suggests that early fusion architecture with simple feature concatenation may be less effective than late fusion approaches with cross-attention for clinical tasks requiring nuanced visual-textual integration. Additionally, the automated extraction of training pairs from PubMed Central, without expert curation, may introduce noise that limits clinical accuracy.</p><p>BioGPT (Microsoft Research) achieved 68% accuracy (17/25 correct), narrowly failing the 70% certification threshold and ranking seventh among 8 participants. BioGPT&#x2019;s performance did not significantly differ from ChatGPT 5.0 (68% vs 64%, difference=4 percentage points, <italic>P</italic>=.38, OR 1.20, 95% CI 0.40&#x2010;3.63). 
As a primarily text-focused model without native image processing, BioGPT processed detailed verbal wound descriptions rather than directly analyzing images, representing a hybrid approach that provides limited multimodal advantage. The minimal performance difference between BioGPT and ChatGPT 5.0 (both text-dependent) compared to the substantial advantages demonstrated by true multimodal models (Med-PaLM 2, LLaVA-Med) confirms that visual processing capability, rather than biomedical text specialization alone, drives performance improvements in wound care assessment.</p><p>ChatGPT 5.0 (OpenAI) achieved 64% accuracy (16/25 correct), ranking last among all participants and failing the certification threshold by 6 percentage points. Despite representing the state of the art in unimodal text-based AI with enhanced medical knowledge encoding, a 128,000-token context window, and sophisticated reasoning capabilities, ChatGPT 5.0&#x2019;s text-only architecture proved fundamentally inadequate for wound care certification. ChatGPT 5.0 was significantly outperformed by 4 participants: GS (<italic>P</italic>=.003), WCN (<italic>P</italic>=.008), Med-PaLM 2 (<italic>P</italic>=.008), and IM-1 (<italic>P</italic>=.048). Performance differences from LLaVA-Med (<italic>P</italic>=.12), IM-2 (<italic>P</italic>=.12), and BioGPT (<italic>P</italic>=.38) did not reach statistical significance but consistently favored the comparators.</p></sec><sec id="s3-2-4"><title>Cross-Subtype Performance Comparisons</title><p>Direct comparisons between specific human experts and AI models revealed important patterns regarding the relative capabilities of specialized expertise vs AI (<xref ref-type="table" rid="table2">Table 2</xref>). The GS (96%) outperformed all AI models, reaching statistical significance against LLaVA-Med (<italic>P</italic>=.02), BioGPT (<italic>P</italic>=.006), and ChatGPT 5.0 (<italic>P</italic>=.003) but not against Med-PaLM 2 (96% vs 92%, difference=4 percentage points, <italic>P</italic>=.50, ns). 
This finding indicates that highly specialized human experts currently exceed even the best-performing AI systems, although the difference from Med-PaLM 2 was small and not statistically significant.</p><p>The WCN (92%) demonstrated performance equivalent to Med-PaLM 2 (92% vs 92%, identical scores) and significantly exceeded LLaVA-Med (<italic>P</italic>=.04), BioGPT (<italic>P</italic>=.02), and ChatGPT 5.0 (<italic>P</italic>=.008). This equivalence between a specialized human expert and a top-tier multimodal AI suggests that current technology has achieved parity with dedicated wound care professionals, though not with the most experienced surgical specialists.</p><p>IM-1 (80%) was not significantly different from Med-PaLM 2 (80% vs 92%, difference=12 percentage points, <italic>P</italic>=.16), LLaVA-Med (80% vs 76%; <italic>P</italic>=.50), BioGPT (80% vs 68%; <italic>P</italic>=.22), or ChatGPT 5.0 (80% vs 64%; <italic>P</italic>=.048&#x2014;marginally significant). This pattern indicates that AI performance spans the range of nonspecialized human clinicians, with top AI models exceeding and bottom AI models falling below typical internal medicine performance.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Individual question performance: multimodal large language models vs ChatGPT 5.0 vs human experts (N=25).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Q#</td><td align="left" valign="bottom">Question topic</td><td align="left" valign="bottom" colspan="3">MLLMs</td><td align="left" valign="bottom">Unimodal</td><td align="left" valign="bottom" colspan="4">Human experts</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="top">Med-PaLM2</td><td align="left" valign="top">LLaVA-Med</td><td align="left" valign="top">BioGPT</td><td align="left" valign="top">GPT-5.0<sup><xref ref-type="table-fn" 
rid="table2fn1">a</xref></sup></td><td align="left" valign="top">GS<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">WCN<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">IM-1<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">IM-2<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">D1<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td><td align="left" valign="top">Pressure injury Stage III vs IV differentiation</td><td align="left" valign="top">&#x2713;<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;<sup><xref ref-type="table-fn" rid="table2fn8">h</xref></sup></td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">D2</td><td align="left" valign="top">Diabetic foot ulcer Wagner classification</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td></tr><tr><td align="left" valign="top">D3</td><td align="left" valign="top">Venous vs arterial ulcer clinical features</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td 
align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">D4</td><td align="left" valign="top">NPUAP unstageable wound identification</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td></tr><tr><td align="left" valign="top">D5</td><td align="left" valign="top">TIME framework wound bed assessment</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">D6</td><td align="left" valign="top">Deep tissue pressure injury recognition</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">D7</td><td align="left" valign="top">Marjolin&#x2019;s ulcer malignant transformation</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td></tr><tr><td align="left" valign="top">T1<sup><xref ref-type="table-fn" 
rid="table2fn9">i</xref></sup></td><td align="left" valign="top">NPWT indications and contraindications</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">T2</td><td align="left" valign="top">Compression therapy for venous ulcers</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td></tr><tr><td align="left" valign="top">T3</td><td align="left" valign="top">Debridement method selection algorithm</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">T4</td><td align="left" valign="top">Moist wound healing dressing selection</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">T5</td><td align="left" valign="top">Diabetic foot offloading strategies</td><td align="left" 
valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">T6</td><td align="left" valign="top">Topical antimicrobial agent selection</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td></tr><tr><td align="left" valign="top">C1<sup><xref ref-type="table-fn" rid="table2fn10">j</xref></sup></td><td align="left" valign="top">Wound infection vs critical colonization</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">C2</td><td align="left" valign="top">Biofilm identification and management</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td></tr><tr><td align="left" valign="top">C3</td><td align="left" valign="top">Osteomyelitis screening in diabetic foot</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td 
align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">C4</td><td align="left" valign="top">Periwound maceration prevention</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">C5</td><td align="left" valign="top">Wound dehiscence risk factors</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">C6</td><td align="left" valign="top">Contact dermatitis vs wound deterioration</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">S1<sup><xref ref-type="table-fn" rid="table2fn11">k</xref></sup></td><td align="left" valign="top">DFU<sup><xref ref-type="table-fn" rid="table2fn12">l</xref></sup> healing trajectory and prognosis</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" 
valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">S2</td><td align="left" valign="top">ABI<sup><xref ref-type="table-fn" rid="table2fn13">m</xref></sup> criteria for compression therapy</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">S3</td><td align="left" valign="top">Pressure injury prevention protocols</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">S4</td><td align="left" valign="top">Mixed etiology ulcer management</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2717;</td></tr><tr><td align="left" valign="top">S5</td><td align="left" valign="top">Arterial ulcer revascularization criteria</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" 
valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">S6</td><td align="left" valign="top">CDC surgical wound classification</td><td align="left" valign="top">&#x2717;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Total correct</td><td align="char" char="." valign="top">23</td><td align="char" char="." valign="top">19</td><td align="char" char="." valign="top">17</td><td align="char" char="." valign="top">16</td><td align="char" char="." valign="top">24</td><td align="char" char="." valign="top">23</td><td align="char" char="." valign="top">20</td><td align="char" char="." valign="top">19</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Accuracy (%)</td><td align="char" char="." valign="top">92</td><td align="char" char="." valign="top">76</td><td align="char" char="." valign="top">68</td><td align="char" char="." valign="top">64</td><td align="char" char="." valign="top">96</td><td align="char" char="." valign="top">92</td><td align="char" char="." valign="top">80</td><td align="char" char="." 
valign="top">76</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>GPT-5.0: ChatGPT 5.0.</p></fn><fn id="table2fn2"><p><sup>b</sup>GS: general surgeon (10-y wound care).</p></fn><fn id="table2fn3"><p><sup>c</sup>WCN: wound care nurse.</p></fn><fn id="table2fn4"><p><sup>d</sup> IM-1: internal medicine (5-y palliative).</p></fn><fn id="table2fn5"><p><sup>e</sup>IM-2: internal medicine (3-y palliative).</p></fn><fn id="table2fn6"><p><sup>f</sup> D: Diagnosis.</p></fn><fn id="table2fn7"><p><sup>g</sup>&#x2713;: correct.</p></fn><fn id="table2fn8"><p><sup>h</sup>&#x2717;: incorrect.</p></fn><fn id="table2fn9"><p><sup>i</sup> T: Treatment.</p></fn><fn id="table2fn10"><p><sup>j</sup> C: Complication.</p></fn><fn id="table2fn11"><p><sup>k</sup>S: Subtype.</p></fn><fn id="table2fn12"><p><sup>l</sup>DFU: diabetic foot ulcer.</p></fn><fn id="table2fn13"><p><sup>m</sup>ABI: Ankle-Brachial Index.</p></fn></table-wrap-foot></table-wrap><p>IM-2 (76%) achieved identical accuracy to LLaVA-Med (76% vs 76%), with neither significantly different from BioGPT or ChatGPT 5.0. Notably, IM-2 and ChatGPT 5.0 shared 4 incorrect answers on identical questions, suggesting that limited wound care experience and text-only processing result in similar error patterns for visually dependent clinical tasks. Med-PaLM 2 (92%) significantly outperformed IM-2 (<italic>P</italic>=.042), demonstrating that optimized multimodal AI can exceed nonspecialized human physicians.</p></sec></sec><sec id="s3-3"><title>Domain-Specific Performance Analysis</title><sec id="s3-3-1"><title>Overview of Domain-Specific Variability</title><p>Domain analysis revealed systematic variation in performance across clinical task types, with the magnitude of multimodal advantage varying according to the visual processing demands of each domain (<xref ref-type="fig" rid="figure4">Figure 4</xref>). 
Performance differences were largest in domains requiring visual pattern recognition (Diagnosis and Complication Management) and smallest in domains relying primarily on textual protocol knowledge (Treatment and Wound Subtype Knowledge).</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Domain-specific performance comparison. Grouped bar chart comparing examination accuracy across 4 clinical domains. Multimodal advantage was most pronounced in visually dependent domains: diagnosis (multimodal large language model 81% vs ChatGPT 43%; <italic>P</italic>=.006) and complication management (multimodal large language model 72% vs ChatGPT 50%; <italic>P</italic>=.01). No significant multimodal advantage was observed for wound subtype knowledge (both 67%; <italic>P</italic>=.11), which relies primarily on textual recall. Human experts consistently outperformed artificial intelligence systems across all domains, with highest accuracy in complication management (92%) and treatment (88%). *<italic>P</italic>&#x003C;.05, **<italic>P</italic>&#x003C;.01. LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e88618_fig04.png"/></fig></sec><sec id="s3-3-2"><title>Diagnosis Domain</title><p>The Diagnosis domain, comprising 7 questions addressing wound staging, classification, and etiology determination, demonstrated the most pronounced performance differences across groups. Human experts achieved 86% mean accuracy (6/7 correct, range 71%&#x2010;100%), MLLMs achieved 81% (5.7/7 correct, range 71%&#x2010;95%), and ChatGPT 5.0 achieved only 43% (3/7 correct). The Kruskal-Wallis test revealed highly significant group differences (<italic>H</italic>=11.24, df=2; <italic>P</italic>=.006).
MLLMs outperformed ChatGPT 5.0 by 38 percentage points (<italic>P</italic>=.008), while humans outperformed ChatGPT 5.0 by 43 percentage points (<italic>P</italic>=.005). The human-multimodal difference was only 5 percentage points (<italic>P</italic>=.62, ns). This domain requires visual pattern recognition for pressure injury staging (Stage III vs IV differentiation), Wagner classification, venous versus arterial ulcer differentiation, and deep tissue pressure injury identification&#x2014;tasks where verbal descriptions cannot adequately convey visual information necessary for accurate assessment.</p></sec><sec id="s3-3-3"><title>Treatment Domain</title><p>The Treatment domain, comprising 6 questions addressing therapeutic interventions and management protocols, showed narrowed performance differences. Human experts achieved 88% (5.3/6 correct, range 67%&#x2010;100%), MLLMs achieved 73% (4.4/6 correct, range 67%&#x2010;83%), and ChatGPT 5.0 achieved 67% (4/6 correct). Group differences did not reach statistical significance (<italic>H</italic>=4.21, df=2; <italic>P</italic>=.06). The multimodal advantage over ChatGPT 5.0 was only 6 percentage points (<italic>P</italic>=.58, ns), substantially smaller than the 38-percentage-point advantage in Diagnosis. This domain tests knowledge of negative pressure wound therapy indications, compression therapy protocols, debridement selection, and antimicrobial agents&#x2014;information extensively documented in clinical guidelines and accessible to both multimodal and unimodal models through text-based training.</p></sec><sec id="s3-3-4"><title>Complication Management Domain</title><p>The Complication Management domain, comprising 6 questions addressing recognition and management of wound complications, demonstrated strong human superiority alongside significant multimodal advantage. 
Human experts achieved 92% (5.5/6 correct, range 83%&#x2010;100%), MLLMs achieved 72% (4.3/6 correct, range 50%&#x2010;83%), and ChatGPT 5.0 achieved 50% (3/6 correct). Group differences were significant (<italic>H</italic>=9.87, df=2; <italic>P</italic>=.01). Humans outperformed ChatGPT 5.0 by 42 percentage points (<italic>P</italic>=.003) and MLLMs by 20 percentage points (<italic>P</italic>=.048), while MLLMs outperformed ChatGPT 5.0 by 22 percentage points (<italic>P</italic>=.03). This domain requires integration of subtle visual findings&#x2014;tissue color changes, exudate characteristics, periwound changes&#x2014;with clinical context for infection-colonization differentiation, biofilm identification, and deterioration recognition. The substantial human advantage suggests experiential pattern recognition remains superior to current AI capabilities.</p></sec><sec id="s3-3-5"><title>Wound Subtype Knowledge Domain</title><p>The Wound Subtype Knowledge domain, comprising 6 questions addressing classification systems and pathophysiology, demonstrated the smallest between-group differences and equivalent AI performance regardless of modality. Human experts achieved 83% (5/6 correct, range 67%&#x2010;100%), while both MLLMs and ChatGPT 5.0 achieved 67% (4/6 correct). Group differences were not significant (<italic>H</italic>=2.14, df=2; <italic>P</italic>=.11). The identical performance of MLLMs and ChatGPT 5.0 (67% vs 67%, difference=0%; <italic>P</italic>&#x003E;.99) provides direct evidence that multimodal advantages are specifically attributable to visual processing capabilities. 
This domain tests factual knowledge about wound classification, healing trajectories, and management principles&#x2014;information equally accessible to all AI systems through text-based training corpora.</p></sec></sec><sec id="s3-4"><title>Response Time Analysis</title><p>Examination completion time differed markedly between human experts and AI models, with implications for clinical workflow integration. Human experts required a mean of 76.3 (SD 8.1) minutes to complete the 25-question examination (range 67&#x2010;85 min, median 76.5 min). Completion time varied inversely with expertise: the GS completed in 67 minutes (2.68 min/question), the WCN in 72 minutes (2.88 min/question), IM-1 in 81 minutes (3.24 min/question), and IM-2 in 85 minutes (3.40 min/question). The correlation between completion time and accuracy was strongly negative (<italic>r</italic>=&#x2212;0.89, <italic>P</italic>=.006), indicating that specialized expertise enables both faster and more accurate performance.</p><p>All AI models generated responses essentially instantaneously, with total examination times under 1 minute regardless of model type. Med-PaLM 2 averaged 2.3 (SD 0.4) seconds per question (total 57.5 s), LLaVA-Med averaged 1.8 (SD 0.3) seconds (total 45 s), BioGPT averaged 1.1 (SD 0.2) seconds (total 27.5 s), and ChatGPT 5.0 averaged 0.9 (SD 0.2) seconds (total 22.5 s). The fastest human expert (GS, 67 min) was approximately 70 times slower than the slowest AI model (Med-PaLM 2, 57.5 s), representing a substantial efficiency advantage for AI systems. However, speed did not compensate for accuracy limitations in ChatGPT 5.0, which was both fastest and least accurate.</p></sec><sec id="s3-5"><title>Error Analysis</title><sec id="s3-5-1"><title>MLLM Errors</title><p>MLLMs collectively committed 16 errors across 75 question attempts (21.3% error rate). Error analysis revealed 3 predominant patterns. 
First, wound depth assessment errors accounted for 5 of 16 errors (31.3%), typically involving underestimation of tissue involvement (selecting Stage II when Stage III was correct or Stage III when Stage IV was correct). Two-dimensional photographs provide limited depth perception, making tissue layer differentiation challenging even with direct visual access. Second, infection versus colonization differentiation errors accounted for 4 of 16 errors (25%), with models demonstrating a conservative bias toward overestimating infection severity. Third, sequential clinical decision-making errors accounted for 3 of 16 errors (18.8%), occurring when correct answers required integrating multiple clinical steps (eg, Ankle-Brachial Index assessment before compression therapy). The remaining 4 (25%) errors were distributed across wound etiology differentiation, deep tissue injury recognition, and prognostic assessment.</p></sec><sec id="s3-5-2"><title>ChatGPT 5.0 Errors</title><p>ChatGPT 5.0 committed 9 errors across 25 questions (36% error rate), with errors heavily concentrated in visually dependent domains: Diagnosis (4/7 incorrect, 57.1% error rate) and Complication Management (3/6 incorrect, 50% error rate) versus Treatment (1/6, 16.7%) and Subtype Knowledge (1/6, 16.7%). All Diagnosis errors involved failure to translate verbal descriptions into accurate visual assessments&#x2014;consistently underestimating wound severity despite explicit textual descriptions of tissue involvement. On pressure injury staging, ChatGPT 5.0 selected Stage II when descriptions stated &#x201C;visible subcutaneous fat&#x201D; (Stage III criteria). On venous-arterial differentiation, ChatGPT 5.0 selected venous despite descriptions of &#x201C;punched-out margins,&#x201D; &#x201C;pale wound bed,&#x201D; and &#x201C;absent pedal pulses&#x201D;&#x2014;classic arterial findings. 
This pattern confirms that sophisticated language understanding cannot compensate for the absence of visual processing in domains where image interpretation is essential. Errors were operationally classified as &#x201C;information access limitations&#x201D; when (1) the correct answer required information visually apparent in the wound photograph but verifiably absent in the verbal description, and (2) the model&#x2019;s selected answer was consistent with the textual information provided. In contrast, &#x201C;knowledge deficits&#x201D; were classified when the correct answer could be derived from the available verbal description but the model failed to apply correct clinical reasoning. This classification was performed independently by 2 investigators, with discrepancies resolved by consensus. Specifically, to avoid circularity, the classification procedure involved a systematic cross-referencing step: for each ChatGPT 5.0 error, 2 independent investigators compared the correct answer&#x2019;s informational requirements against the verbatim content of the standardized verbal description provided to the model. An error was classified as an &#x201C;information access limitation&#x201D; only when a specific visual feature necessary for the correct answer (eg, wound depth extending to bone, tissue color gradients, spatial distribution of necrosis) was confirmed to be absent from the written description through item-by-item verification against the 12-item template. 
Conversely, when the verbal description contained sufficient information to derive the correct answer (ie, all relevant clinical features were explicitly stated in the text), the error was classified as a &#x201C;knowledge deficit.&#x201D; Interrater agreement for this classification was substantial (Cohen &#x03BA;=0.82).</p></sec><sec id="s3-5-3"><title>Human Expert Errors</title><p>Human experts collectively committed 14 errors across 100 question attempts (14% error rate), with errors inversely correlated with specialized experience (<italic>r</italic>=&#x2212;0.91; <italic>P</italic>=.004). The GS committed 1 error (4% error rate) on Ankle-Brachial Index interpretation&#x2014;peripheral to surgical practice. The WCN committed 2 errors (8%) on Marjolin ulcer and Wagner classification&#x2014;areas more central to surgical than nursing practice. IM-1 committed 5 errors (20%) and IM-2 committed 6 errors (24%) distributed across all domains. Specialized experts (GS+WCN) achieved a 6% error rate versus 22% for nonspecialists&#x2014;a 3.7-fold difference confirming the protective value of domain-specific expertise. Notably, IM-2 and ChatGPT 5.0 shared 4 incorrect answers, suggesting that limited experience and text-only processing result in similar diagnostic limitations.</p></sec></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study provides the first systematic comparison of MLLMs, unimodal large language models, and human clinical experts on a standardized wound care certification examination. Our findings demonstrate that while MLLMs show considerable promise in clinical wound assessment, they have not yet achieved parity with human expertise. Human clinical experts achieved the highest overall accuracy (86, SD 9.1%), followed by MLLMs (78.7, SD 12.2%), while the text-only ChatGPT 5.0 performed significantly below the certification threshold (64%). 
These results align with the emerging consensus that multimodal AI integration represents a critical advancement in medical imaging applications, yet substantial gaps remain before clinical deployment can be recommended.</p><p>The most striking finding was the pronounced multimodal advantage in visually dependent domains. MLLMs outperformed ChatGPT 5.0 by 38 percentage points in diagnosis (<italic>P</italic>=.006) and 22 points in complication management (<italic>P</italic>=.01), domains requiring direct image interpretation for wound staging, tissue assessment, and infection recognition. Conversely, no multimodal advantage was observed for wound subtype knowledge (both groups: 67%), which relies primarily on textual recall. This domain-specific pattern supports the theoretical framework proposed by Jung et al [<xref ref-type="bibr" rid="ref17">17</xref>], who emphasized that MLLMs achieve their greatest utility when visual and textual information must be simultaneously integrated for clinical reasoning. Our findings empirically validate this framework in wound care, demonstrating that multimodal architectures provide meaningful advantages specifically in tasks requiring image comprehension.</p><p>Our results contribute to the growing body of evidence examining AI performance against human clinicians. Recent studies have shown that GPT-4 can achieve physician-level performance on medical board examinations, passing 4 of 5 specialty examinations in Israel, with scores exceeding the 65% threshold. Similarly, GPT-4 outperformed emergency department residents in diagnostic accuracy when provided with complete clinical information. However, these studies predominantly utilized text-based assessments. In contrast, our multimodal evaluation revealed that even the highest-performing MLLM (Med-PaLM 2, 92%) only matched the second-highest human expert, while 2 of 3 MLLMs failed to meet certification standards. 
This performance gap is consistent with Jin et al [<xref ref-type="bibr" rid="ref18">18</xref>], who found that GPT-4V frequently presents flawed rationales despite achieving correct final answers, particularly in image comprehension tasks.</p><p>The superiority of human experts over ChatGPT 5.0 is statistically robust (<italic>P</italic>=.006, Cohen <italic>d</italic>=2.12). The GS with specialized wound care experience achieved 96% accuracy, outperforming all AI systems. However, for the MLLM-human comparison, the 7.3-percentage-point difference (<italic>P</italic>=.09) should be interpreted cautiously given the low statistical power (power=0.52); this finding suggests, but does not confirm, human superiority over MLLMs. Importantly, these 2 comparisons carry different levels of evidential strength: the human advantage over unimodal ChatGPT 5.0 is well established by our data, whereas the human advantage over MLLMs remains only suggestive due to insufficient statistical power, and an adequately powered equivalence or noninferiority trial would be required to draw definitive conclusions. This finding aligns with K&#x00FC;cking et al, who demonstrated that factors related to expertise&#x2014;such as formal qualifications, deliberate practice, and diagnostic confidence&#x2014;play a significant role in clinical judgment during wound care assessments [<xref ref-type="bibr" rid="ref19">19</xref>]. Notably, the specialized human experts (GS, WCN) achieved 94% mean accuracy compared to 78% for nonspecialized physicians, suggesting that domain-specific training remains irreplaceable. 
The recent systematic review by Reifs Jim&#x00E9;nez et al [<xref ref-type="bibr" rid="ref20">20</xref>] similarly concluded that while AI systems excel at standardized pattern recognition, human clinicians demonstrate superior contextual reasoning and integration of atypical presentations.</p><p>From a clinical implementation perspective, our findings suggest that MLLMs may serve as valuable decision-support tools rather than autonomous diagnostic systems. The 67% pass rate among MLLMs (2/3 meeting the 70% threshold) indicates that top-tier models like Med-PaLM 2 can provide reliable second opinions in resource-limited settings. Grunhut and Nagarsheth [<xref ref-type="bibr" rid="ref21">21</xref>] recently proposed that AI-powered wound assessment tools are best positioned for triage, preliminary documentation, and alerting clinicians to potential complications&#x2014;functions that complement rather than replace expert judgment. Similarly, Barakat-Johnson et al [<xref ref-type="bibr" rid="ref22">22</xref>] demonstrated that AI-assisted wound imaging improved standardization of assessments during the COVID-19 pandemic while maintaining physician oversight. Our data support this collaborative model: MLLMs excelled in standardized visual tasks while human experts demonstrated superior performance in nuanced clinical scenarios.</p><p>Recent advances in AI-powered wound assessment tools further contextualize our findings. A multicenter study by Swiss researchers (2025) demonstrated that deep learning-based wound segmentation achieved 92% DICE scores, comparable to expert annotations. However, tissue classification accuracy varied considerably across wound types, particularly for fibrin and necrosis&#x2014;findings that parallel our observation of variable MLLM performance across clinical domains [<xref ref-type="bibr" rid="ref23">23</xref>]. 
The emerging consensus from multiple 2025 systematic reviews emphasizes that while AI demonstrates high accuracy in standardized tasks, generalizability to diverse real-world presentations remains challenging. Russ et al [<xref ref-type="bibr" rid="ref24">24</xref>] identified key barriers, including data quality heterogeneity, limited interpretability, and the need for standardized evaluation frameworks before widespread clinical adoption.</p><p>The efficiency differential between AI and human assessors merits attention. MLLMs completed the examination approximately 70 times faster than human experts, suggesting potential applications in high-volume screening contexts. Mohammed et al [<xref ref-type="bibr" rid="ref25">25</xref>] reported similar findings, demonstrating that AI-powered wound measurement reduced assessment time by 75% compared to manual methods. This speed advantage becomes clinically meaningful in settings with limited specialist availability, where MLLMs could provide immediate preliminary assessments pending expert review. However, the inverse correlation between human completion time and accuracy (<italic>r</italic>=&#x2212;0.89; <italic>P</italic>=.006) suggests that experienced clinicians process wound images more efficiently through pattern recognition developed over years of practice&#x2014;a capability that current AI systems have not fully replicated.</p><p>Several limitations should be acknowledged. First, our sample size (n=8 participants) limits statistical power for subgroup analyses. Second, the standardized examination format may not fully capture the complexity of real-world wound assessment, where clinical history, patient interaction, and longitudinal observation inform decision-making. Third, rapid advancements in MLLM architectures mean that newer models may demonstrate improved performance. 
Fourth, the observed unimodal performance deficit likely represents a composite effect of (1) inherent text-only processing limitations and (2) potential information loss or observer-dependent bias during the human-mediated translation from clinical images to standardized verbal descriptions; the verbal description protocol was not independently validated, and the 12-item template (detailed in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>) may not have captured all visually relevant features. Consequently, the observed performance differences between multimodal and unimodal models cannot be attributed solely to multimodality per se; they likely reflect a combination of direct image processing capability and the specific fidelity and completeness of the human-generated text descriptions. Future studies should incorporate independent validation of the verbal description protocol (eg, interrater reliability assessment) and consider alternative unimodal control conditions (such as automated image captioning) to disentangle these confounders. The recent development of region-grounded MLLMs with segmentation-aware spatial tokens, as described by Stefanelli et al [<xref ref-type="bibr" rid="ref23">23</xref>], represents a promising direction for enhancing wound-specific AI capabilities. Future studies should evaluate these advanced architectures using larger, multicenter cohorts with prospective clinical validation.</p></sec><sec id="s4-2"><title>Conclusions</title><p>MLLMs demonstrate significant performance advantages over the state-of-the-art unimodal ChatGPT 5.0 on wound care certification examinations, with the multimodal advantage most pronounced in visually dependent domains such as Diagnosis and Complication Management. 
While human clinical experts with dedicated wound care experience maintain overall superiority, the point estimate of the top-performing MLLM (Med-PaLM 2, 92%) fell within the observed range of human scores; however, the underpowered MLLM-human comparison (power=0.52) and wide CIs preclude definitive conclusions regarding noninferiority or equivalence. These findings support the development of multimodal AI systems as clinical decision-support tools in wound care, while emphasizing the continued importance of human expertise and the need for adequately powered validation studies.</p></sec></sec></body><back><ack><p>Generative AI tools (Claude 4.5 and Gemini 3.0) were used solely for grammar and language-editing purposes. The authors remain fully responsible for the accuracy, originality, and integrity of all content, including references and citations.</p></ack><notes><sec><title>Funding</title><p>The authors declared no financial support was received for this work.</p></sec><sec><title>Data Availability</title><p>Question sets and model outputs are available upon reasonable request.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">COSMIN</term><def><p>Consensus-Based Standards for the Selection of Health Measurement Instruments</p></def></def-item><def-item><term id="abb3">GS</term><def><p>general surgeon</p></def></def-item><def-item><term id="abb4">IM-1</term><def><p>internal medicine physician 1</p></def></def-item><def-item><term id="abb5">IM-2</term><def><p>internal medicine physician 2</p></def></def-item><def-item><term id="abb6">MLLM</term><def><p>multimodal large language model</p></def></def-item><def-item><term id="abb7">WCN</term><def><p>wound care nurse</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref 
id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nazi</surname><given-names>ZA</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>W</given-names> </name></person-group><article-title>Large language models in healthcare and medical domain: a review</article-title><source>Informatics (MDPI)</source><year>2024</year><volume>11</volume><issue>3</issue><fpage>57</fpage><pub-id pub-id-type="doi">10.3390/informatics11030057</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bhattacharya</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pal</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chatterjee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Chakraborty</surname><given-names>C</given-names> </name></person-group><article-title>Large language model to multimodal large language model: a journey to shape the biological macromolecules to biological sciences and medicine</article-title><source>Mol Ther Nucleic Acids</source><year>2024</year><month>09</month><day>10</day><volume>35</volume><issue>3</issue><fpage>102255</fpage><pub-id pub-id-type="doi">10.1016/j.omtn.2024.102255</pub-id><pub-id pub-id-type="medline">39377065</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>C</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Yang</surname><given-names>M</given-names> </name></person-group><article-title>Non-invasive techniques for wound assessment: a comprehensive review</article-title><source>Int Wound J</source><year>2024</year><month>11</month><volume>21</volume><issue>11</issue><fpage>e70109</fpage><pub-id pub-id-type="doi">10.1111/iwj.70109</pub-id><pub-id pub-id-type="medline">39567223</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khamaysi</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Awwad</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jiryis</surname><given-names>B</given-names> </name><name name-style="western"><surname>Bathish</surname><given-names>N</given-names> </name><name name-style="western"><surname>Shapiro</surname><given-names>J</given-names> </name></person-group><article-title>The role of ChatGPT in dermatology diagnostics</article-title><source>Diagnostics (Basel)</source><year>2025</year><month>06</month><day>16</day><volume>15</volume><issue>12</issue><fpage>1529</fpage><pub-id pub-id-type="doi">10.3390/diagnostics15121529</pub-id><pub-id pub-id-type="medline">40564849</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Gottweis</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Toward expert-level medical question answering with large language models</article-title><source>Nat Med</source><year>2025</year><month>03</month><volume>31</volume><issue>3</issue><fpage>943</fpage><lpage>950</lpage><pub-id 
pub-id-type="doi">10.1038/s41591-024-03423-7</pub-id><pub-id pub-id-type="medline">39779926</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Gao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Li</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><etal/></person-group><article-title>LLaVA-med: training a large language-and-vision assistant for biomedicine in one day</article-title><conf-name>NIPS &#x2019;23: Proceedings of the 37th International Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 10-16, 2023</conf-date><conf-loc>New Orleans, Louisiana, USA</conf-loc><fpage>28541</fpage><lpage>28564</lpage><pub-id pub-id-type="doi">10.52202/075280-1240</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Malin</surname><given-names>BA</given-names> </name><name name-style="western"><surname>Osterman</surname><given-names>T</given-names> </name><name name-style="western"><surname>Long</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>X</given-names> </name></person-group><article-title>Introducing mCODEGPT as a zero-shot information extraction from clinical free text data tool for cancer research</article-title><source>Commun Med (Lond)</source><year>2025</year><month>10</month><day>15</day><volume>5</volume><issue>1</issue><fpage>422</fpage><pub-id pub-id-type="doi">10.1038/s43856-025-01116-x</pub-id><pub-id 
pub-id-type="medline">41093969</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Suleman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Massraf</surname><given-names>B</given-names> </name><name name-style="western"><surname>Licenik</surname><given-names>R</given-names> </name></person-group><article-title>Can large language models draft safe, reliable patient leaflets on driving after stroke in comparison to the Stroke Association leaflet?</article-title><source>Cureus</source><year>2025</year><month>11</month><volume>17</volume><issue>11</issue><fpage>e97155</fpage><pub-id pub-id-type="doi">10.7759/cureus.97155</pub-id><pub-id pub-id-type="medline">41262484</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sounderajah</surname><given-names>V</given-names> </name><name name-style="western"><surname>Ashrafian</surname><given-names>H</given-names> </name><name name-style="western"><surname>Aggarwal</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Developing specific reporting guidelines for diagnostic accuracy studies assessing AI interventions: the STARD-AI steering group</article-title><source>Nat Med</source><year>2020</year><month>06</month><volume>26</volume><issue>6</issue><fpage>807</fpage><lpage>808</lpage><pub-id pub-id-type="doi">10.1038/s41591-020-0941-1</pub-id><pub-id pub-id-type="medline">32514173</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mokkink</surname><given-names>LB</given-names> </name><name name-style="western"><surname>Terwee</surname><given-names>CB</given-names> </name><name 
name-style="western"><surname>Patrick</surname><given-names>DL</given-names> </name><etal/></person-group><article-title>The COSMIN study reached international consensus on taxonomy, terminology, and definitions of measurement properties for health-related patient-reported outcomes</article-title><source>J Clin Epidemiol</source><year>2010</year><month>07</month><volume>63</volume><issue>7</issue><fpage>737</fpage><lpage>745</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2010.02.006</pub-id><pub-id pub-id-type="medline">20494804</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="web"><article-title>Wound care certification practice questions</article-title><source>AppleTree CEU</source><access-date>2026-04-07</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://appletreeceu.com/wound-care-certification-practice-questions">https://appletreeceu.com/wound-care-certification-practice-questions</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="web"><article-title>ABWM-CWS practice exam</article-title><source>Kill Exams</source><access-date>2026-04-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://killexams.com/demo-download/ABWM-CWS.pdf#page=2.44">https://killexams.com/demo-download/ABWM-CWS.pdf#page=2.44</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="report"><article-title>WOCNCB&#x00AE; examination handbook</article-title><year>2026</year><access-date>2026-04-13</access-date><publisher-name>Wound, Ostomy and Continence Nursing Certification Board (WOCNCB)</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.wocncb.org/UserFiles/file/exam_handbook.pdf#page=43.04">https://www.wocncb.org/UserFiles/file/exam_handbook.pdf#page=43.04</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kottner</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cuddigan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Carville</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Prevention and treatment of pressure ulcers/injuries: the protocol for the second update of the International Clinical Practice Guideline 2019</article-title><source>J Tissue Viability</source><year>2019</year><month>05</month><volume>28</volume><issue>2</issue><fpage>51</fpage><lpage>58</lpage><pub-id pub-id-type="doi">10.1016/j.jtv.2019.01.001</pub-id><pub-id pub-id-type="medline">30658878</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lavery</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Suludere</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Attinger</surname><given-names>CE</given-names> </name><etal/></person-group><article-title>WHS (Wound Healing Society) guidelines update: diabetic foot ulcer treatment guidelines</article-title><source>Wound Repair Regen</source><year>2024</year><volume>32</volume><issue>1</issue><fpage>34</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.1111/wrr.13133</pub-id><pub-id pub-id-type="medline">38032324</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Swanson</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ousey</surname><given-names>K</given-names> </name><name name-style="western"><surname>Haesler</surname><given-names>E</given-names> </name><etal/></person-group><article-title>IWII Wound Infection in 
Clinical Practice consensus document: 2022 update</article-title><source>J Wound Care</source><year>2022</year><month>12</month><day>1</day><volume>31</volume><issue>Sup12</issue><fpage>S10</fpage><lpage>S21</lpage><pub-id pub-id-type="doi">10.12968/jowc.2022.31.Sup12.S10</pub-id><pub-id pub-id-type="medline">36475844</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jung</surname><given-names>T</given-names> </name><name name-style="western"><surname>Joe</surname><given-names>I</given-names> </name></person-group><article-title>An intelligent docent system with a small large language model (sLLM) based on retrieval-augmented generation (RAG)</article-title><source>Appl Sci (Basel)</source><year>2025</year><volume>15</volume><issue>17</issue><fpage>9398</fpage><pub-id pub-id-type="doi">10.3390/app15179398</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>F</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Hidden flaws behind expert-level accuracy of multimodal GPT-4 vision in medicine</article-title><source>NPJ Digit Med</source><year>2024</year><month>07</month><day>23</day><volume>7</volume><issue>1</issue><fpage>190</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01185-7</pub-id><pub-id pub-id-type="medline">39043988</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>K&#x00FC;cking</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>H&#x00FC;bner</surname><given-names>UH</given-names> </name><name name-style="western"><surname>Busch</surname><given-names>D</given-names> </name></person-group><article-title>Diagnostic accuracy differences in detecting wound maceration between humans and artificial intelligence: the role of human expertise revisited</article-title><source>J Am Med Inform Assoc</source><year>2025</year><month>09</month><day>1</day><volume>32</volume><issue>9</issue><fpage>1425</fpage><lpage>1433</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaf116</pub-id><pub-id pub-id-type="medline">40668943</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reifs Jim&#x00E9;nez</surname><given-names>D</given-names> </name><name name-style="western"><surname>Casanova-Lozano</surname><given-names>L</given-names> </name><name name-style="western"><surname>Grau-Carri&#x00F3;n</surname><given-names>S</given-names> </name><name name-style="western"><surname>Reig-Bola&#x00F1;o</surname><given-names>R</given-names> </name></person-group><article-title>Artificial intelligence methods for diagnostic and decision-making assistance in chronic wounds: a systematic review</article-title><source>J Med Syst</source><year>2025</year><month>02</month><day>19</day><volume>49</volume><issue>1</issue><fpage>29</fpage><pub-id pub-id-type="doi">10.1007/s10916-025-02153-8</pub-id><pub-id pub-id-type="medline">39969674</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Grunhut</surname><given-names>J</given-names> </name><name name-style="western"><surname>Nagarsheth</surname><given-names>K</given-names> </name></person-group><article-title>Artificial intelligence for postoperative wound monitoring: an integrative review of digital 
innovation and clinical feasibility</article-title><source>Am Surg</source><year>2026</year><month>03</month><volume>92</volume><issue>3</issue><fpage>929</fpage><lpage>935</lpage><pub-id pub-id-type="doi">10.1177/00031348251385104</pub-id><pub-id pub-id-type="medline">41027651</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barakat-Johnson</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jones</surname><given-names>A</given-names> </name><name name-style="western"><surname>Burger</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Reshaping wound care: evaluation of an artificial intelligence app to improve wound assessment and management amid the COVID-19 pandemic</article-title><source>Int Wound J</source><year>2022</year><month>10</month><volume>19</volume><issue>6</issue><fpage>1561</fpage><lpage>1577</lpage><pub-id pub-id-type="doi">10.1111/iwj.13755</pub-id><pub-id pub-id-type="medline">35212459</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stefanelli</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zahia</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chanel</surname><given-names>G</given-names> </name><name name-style="western"><surname>Niri</surname><given-names>R</given-names> </name><name name-style="western"><surname>Pichon</surname><given-names>S</given-names> </name><name name-style="western"><surname>Probst</surname><given-names>S</given-names> </name></person-group><article-title>Developing an AI-powered wound assessment tool: a methodological approach to data collection and model optimization</article-title><source>BMC Med Inform Decis 
Mak</source><year>2025</year><month>08</month><day>9</day><volume>25</volume><issue>1</issue><fpage>297</fpage><pub-id pub-id-type="doi">10.1186/s12911-025-03144-y</pub-id><pub-id pub-id-type="medline">40783534</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Russ</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mross</surname><given-names>PM</given-names> </name><name name-style="western"><surname>Kr&#x00E4;ling</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Feasibility of a multimodal AI-based clinical assessment platform in emergency care: an exploratory pilot study</article-title><source>Front Digit Health</source><year>2025</year><volume>7</volume><fpage>1657583</fpage><pub-id pub-id-type="doi">10.3389/fdgth.2025.1657583</pub-id><pub-id pub-id-type="medline">41112209</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mohammed</surname><given-names>HT</given-names> </name><name name-style="western"><surname>Bartlett</surname><given-names>RL</given-names> </name><name name-style="western"><surname>Babb</surname><given-names>D</given-names> </name><name name-style="western"><surname>Fraser</surname><given-names>RDJ</given-names> </name><name name-style="western"><surname>Mannion</surname><given-names>D</given-names> </name></person-group><article-title>A time motion study of manual versus artificial intelligence methods for wound assessment</article-title><source>PLoS ONE</source><year>2022</year><volume>17</volume><issue>7</issue><fpage>e0271742</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0271742</pub-id><pub-id pub-id-type="medline">35901189</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material 
id="app1"><label>Multimedia Appendix 1</label><p>Graphical abstract illustrating the comparative performance of multimodal large language models, unimodal ChatGPT 5.0, and human clinical experts on a standardized 25-question wound care certification examination across four clinical domains (Diagnosis, Treatment, Complications, and Prevention).</p><media xlink:href="formative_v10i1e88618_app1.png" xlink:title="PNG File, 150 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Standardized 12-item verbal wound description template.</p><media xlink:href="formative_v10i1e88618_app2.docx" xlink:title="DOCX File, 15 KB"/></supplementary-material></app-group></back></article>