<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e88407</article-id><article-id pub-id-type="doi">10.2196/88407</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Evaluation of GPT-5 in Periodontitis Staging and Grading: Retrospective Observational Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Amugo</surname><given-names>Ihunna</given-names></name><degrees>DDS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Frederickson</surname><given-names>Katie Lee</given-names></name><degrees>MSN</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Rajakaruna</surname><given-names>Harshana</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Xie</surname><given-names>Hua</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gangula</surname><given-names>Pandu</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Shanker</surname><given-names>Anil</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Wang</surname><given-names>Qingguo</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Oral Diagnostic Sciences (ODS) &#x0026; Research, School of Dentistry, Meharry Medical College</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Biochemistry, Cancer Biology, Neurosciences and Pharmacology, School of Medicine, Meharry Medical College</institution><addr-line>1023 21st Ave N</addr-line><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff3"><institution>The Office for Research and Innovation, Meharry Medical College</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Shah</surname><given-names>Drashti</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Dhawan</surname><given-names>Pankaj</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Qingguo Wang, PhD, Department of Biochemistry, Cancer Biology, Neurosciences and Pharmacology, School of Medicine, Meharry Medical College, 1023 21st Ave N, Nashville, TN, 37208, United States; <email>qiwang@mmc.edu</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>6</day><month>4</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e88407</elocation-id><history><date date-type="received"><day>25</day><month>11</month><year>2025</year></date><date date-type="accepted"><day>04</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Ihunna Amugo, Katie Lee Frederickson, Harshana Rajakaruna, Hua Xie, Pandu Gangula, Anil Shanker, Qingguo Wang. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 6.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e88407"/><abstract><sec><title>Background</title><p>Periodontitis is a chronic gum disease affecting approximately 42% of adults aged 30 years and older in the United States. Training dental students to accurately diagnose and manage periodontitis is a critical component of dental education and clinical care. Recent advances in large language models offer new opportunities to support both domains, yet their performance in periodontal diagnosis remains largely unexplored, particularly for newer models such as GPT-5.</p></sec><sec><title>Objective</title><p>This study conducted an exploratory evaluation of GPT-5&#x2019;s ability to stage and grade periodontitis.</p></sec><sec sec-type="methods"><title>Methods</title><p>A total of 25 publicly available clinical cases explicitly reporting periodontitis stage and grade were identified through Google and PubMed searches. Each case description was entered into GPT-5 using a zero-shot prompting approach to assess guideline-based reasoning without exemplar conditioning. The model&#x2019;s predictions were compared with the published reference diagnoses. Performance was measured using accuracy, 95% CI, unweighted Cohen &#x03BA;, and weighted Cohen &#x03BA;.</p></sec><sec sec-type="results"><title>Results</title><p>Across these cases, GPT-5 showed marked class-dependent performance and a tendency to overestimate disease severity. Grading performance was notably imbalanced, with high recall for grade C but substantially lower discrimination for grade B. 
GPT-5 achieved a staging accuracy of 68% (95% CI 48.4%-82.8%) and a grading accuracy of 77.3% (95% CI 56.6%-89.9%), with corresponding Cohen &#x03BA; values of 0.454 (95% CI 0.110-0.756) and 0.179 (95% CI &#x2212;0.158 to 0.638), respectively. While staging performance showed fair agreement beyond chance, the low &#x03BA; for grading indicates poor agreement and limited reliability in distinguishing periodontal disease severity.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>These findings suggest that although GPT-5 demonstrates potential for guideline-based periodontitis staging and grading, its current diagnostic performance, particularly for periodontitis grading, limits its use in clinical assessment and educational training. Meaningful application in periodontal diagnosis and training will require substantial improvements in reliability and rigorous validation in larger, more diverse, and prospectively collected datasets.</p></sec></abstract><kwd-group><kwd>large language model</kwd><kwd>ChatGPT</kwd><kwd>GPT-5</kwd><kwd>dental care</kwd><kwd>dental education</kwd><kwd>periodontitis</kwd><kwd>periodontitis staging and grading</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>As the demand for accessible, accurate, and cost-effective resources to support dental care and education continues to grow, large language model (LLM)&#x2013;based chatbots, such as ChatGPT [<xref ref-type="bibr" rid="ref1">1</xref>], have emerged as potentially useful tools. 
Although not originally developed for dental and health care applications, these systems can generate humanlike responses with remarkable accuracy on many health-related topics [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref8">8</xref>], offering new opportunities for disease surveillance, biomedical research, and education.</p><p>Compared with traditional resources, LLMs offer distinct advantages for education and diagnostic support, including lower cost, continuous availability without the need for appointments, good accuracy for many diseases and conditions, customizable interactions, and user-friendly interfaces. As a result, people increasingly turn to them for medication information, self-diagnosis, and disease prevention guidance [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. Clinicians, dental students, and medical students also use them to acquire knowledge and support clinical decision-making [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>].</p><p>A growing body of research has examined LLMs&#x2019; use in dental care and education [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref18">18</xref>]. Several benchmark studies have demonstrated LLMs&#x2019; competitive performance on the American Academy of Periodontology in-service examination [<xref ref-type="bibr" rid="ref6">6</xref>], the United States Medical Licensing Examination [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref7">7</xref>], and other major assessments [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. 
In an educational study involving 77 second-year dental students, those who used LLMs for learning assignments were found to perform better on knowledge examinations than peers relying on traditional methods [<xref ref-type="bibr" rid="ref19">19</xref>]. Furthermore, Rahad et al [<xref ref-type="bibr" rid="ref5">5</xref>] showed that ChatGPT excelled in recognizing and correcting specialized dental terminology and achieved 66.7% accuracy in extracting and synthesizing information from documents. In the clinical context, Tastan Eroglu et al [<xref ref-type="bibr" rid="ref4">4</xref>] evaluated ChatGPT-3.5 on 200 untreated periodontitis cases and reported moderate performance for staging and grading.</p><p>Despite these advances, critical gaps remain. Most prior studies used nonpublic datasets, limiting the reproducibility of their findings. Moreover, given the continual iteration and rapid improvement of LLMs, earlier assessments may not accurately reflect the capabilities of newer models such as GPT-5, released in August 2025, whose performance in dentistry has not yet been systematically evaluated. Evaluation of LLMs in high-stakes dental clinical contexts is essential to establish quality control mechanisms, mitigate risks of inaccurate or biased outputs, and guide their safe adoption into dental education and care.</p><p>An important component of dental education and care is training students to diagnose and manage periodontitis, a chronic gum disease affecting approximately 42% of adults aged 30 years and older in the United States [<xref ref-type="bibr" rid="ref20">20</xref>]. Periodontitis staging (I-IV) reflects disease severity and extent based on levels of destroyed tissues, including gingival attachment and alveolar bone, while grading (A-C) estimates the rate of progression and future risk [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. 
Even with explicit and standardized criteria for staging and grading [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>], clinical diagnosis of periodontitis remains challenging and context dependent, requiring careful integration of radiographic evidence, periodontal charting, and patient-specific risk factors. An LLM capable of accurately analyzing, staging, and grading clinical periodontitis cases could serve as both a valuable diagnostic aid for clinicians and a useful educational resource for students. To address this need, we conducted an exploratory evaluation of the performance of the newly released GPT-5 in staging and grading periodontitis cases.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Case Identification and Data Collection</title><p>This study was conducted as an exploratory, case-based evaluation of GPT-5&#x2019;s diagnostic performance in periodontitis staging and grading. The unit of analysis was publicly available published clinical case descriptions rather than individual patients; thus, no clinical intervention or participant recruitment was involved. Standardized textual prompts were submitted to GPT-5, and the resulting categorical outputs were compared against the reference diagnoses reported in the source publications. The scope was strictly limited to assessing model-level diagnostic agreement and did not include evaluation of uncertainty calibration, triage behavior, clinical implementation, feasibility, or user acceptability.</p><p>We identified dental clinical cases by searching Google and PubMed in August and September 2025 using the keywords &#x201C;periodontitis,&#x201D; &#x201C;staging,&#x201D; and &#x201C;grading.&#x201D; PubMed processes such queries through its automatic term mapping system, which searches across Medical Subject Headings (MeSH) terms and text fields. 
In contrast, Google search results are dynamic, not field restricted, and were accordingly used to supplement PubMed retrieval. These searches targeted peer-reviewed articles, case reports, and publicly available teaching materials that explicitly described periodontitis staging and grading according to established clinical criteria.</p><p>All records retrieved from the searches were screened manually. Review articles; duplicate records; and reports lacking complete staging or grading diagnoses, sufficient periodontal clinical descriptions, or adequate medical and dental histories were excluded. Records that did not provide sufficient quantitative or descriptive information to independently determine stage and grade under the 2017 World Workshop criteria were also excluded. After screening, 25 (48%) cases were retained from a total of 52 identified records. The workflow for data collection and case selection is illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><p>Most of the public cases collected included panoramic and periapical radiographs, which, together with patients&#x2019; dental histories and periodontal charting, play a central role in diagnosis. In these cases, the radiographs had already been systematically evaluated by the original authors, and numerical measures of the bone loss were extracted and reported in their publications. We manually extracted these clinically meaningful data from the publications and used them directly in our assessments. 
Radiographic images themselves were not used in our analyses.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Data collection and case selection workflow.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e88407_fig01.png"/></fig><p>It is important to note that publicly available and teaching-oriented cases, which are often unusually well documented and may not fully reflect the spectrum of routine clinical presentations, may introduce selection bias and limit generalizability. As a result, the GPT-5 performance observed in this exploratory study should be interpreted cautiously and may not reflect real-world diagnostic performance. These limitations are discussed in greater detail in the Discussion section.</p></sec><sec id="s2-2"><title>GPT-5 Prompting and Evaluation</title><p>While GPT-5 can process multimodal data, we used only its interactive textual interface. All analyses were conducted between September 1 and 12, 2025, using the free GPT-5 version, without any paid enhancements. GPT-5 was accessed through its publicly available interactive interface, which does not permit user control over system-level parameters such as temperature and system prompts. Therefore, all interactions were conducted using the default system configuration (temperature=1.0).</p><p>Given GPT-5&#x2019;s demonstrated accuracy and reasoning improvements over earlier models [<xref ref-type="bibr" rid="ref24">24</xref>], and because the criteria for periodontitis staging and grading outlined in the 2017 World Workshop are strictly guideline based [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>], we adopted a zero-shot prompting strategy. This approach evaluates GPT-5&#x2019;s ability to apply explicit clinical thresholds and decision logic without exemplar conditioning. 
In contrast to few-shot prompting or fine-tuning [<xref ref-type="bibr" rid="ref25">25</xref>-<xref ref-type="bibr" rid="ref27">27</xref>], which may introduce anchoring effects or label leakage, zero-shot prompting provides a more conservative and transparent assessment of model reasoning. This design was intentionally selected to evaluate GPT-5&#x2019;s intrinsic application of guideline-based criteria without exemplar-based calibration.</p><p>Using zero-shot prompting, case descriptions were submitted directly to the model without examples or prior instructions, and GPT-5 was asked to return predictions of periodontitis stage and grade. Categorical outputs were recorded exactly as generated, and diagnostic reasoning text was not scored. Each case was processed independently, with no feedback incorporated across cases. Before submission, case descriptions were lightly reformatted to correct line breaks and formatting artifacts introduced during PDF extraction to improve readability. No clinical content was added, removed, reworded, or reorganized. An example prompt and model response are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>To assess response stability, each case was tested in multiple sessions using slight variations in prompt phrasing (eg, &#x201C;Can you help determine the periodontitis stage and grade of this patient?&#x201D; or &#x201C;Please stage and grade the periodontitis of this patient&#x201D;). The model consistently generated identical predictions, regardless of prompt phrasing or session timing, indicating stable behavior at the decision level under these conditions. For the final analyses reported in this study, we used a standardized prompt, &#x201C;Please stage and grade the periodontitis of this patient,&#x201D; followed by the corresponding clinical case description.</p><p>Data collection and analysis were conducted by 7 domain experts, all of whom are coauthors of this study. 
The team included a dental student, a medical student, a mathematician, 2 professors from the School of Dentistry, and 2 professors from the School of Medicine at Meharry Medical College. Their responsibilities included extracting clinical cases from publications, developing prompts for GPT-5, and verifying the model&#x2019;s responses.</p></sec><sec id="s2-3"><title>Performance Metrics</title><p>GPT-5 predictions were compared against published ground-truth diagnoses reported in the original case sources. The primary outcomes were staging accuracy, grading accuracy, and agreement beyond chance, quantified using unweighted Cohen &#x03BA;. Because staging (I-IV) and grading (A-C) represent ordinal categories, weighted Cohen &#x03BA; was additionally calculated using the R package irr (version 0.84.1) to account for partial agreement across adjacent categories.</p><p>We calculated 95% CIs for accuracy (Wilson method) and &#x03BA; statistics (bootstrap resampling, 2000 iterations). To address potential class imbalance, macro-averaged <italic>F</italic><sub>1</sub>-scores and balanced accuracy were computed. In addition, confusion matrices were analyzed to characterize error patterns, including adjacent-category misclassification and directional bias (overestimation vs underestimation). All data analyses were conducted in R (version 4.3.3), and visualizations were generated using the R package ggplot2 (version 3.5.1).</p></sec><sec id="s2-4"><title>Ethical Considerations</title><p>This study used publicly available, published clinical cases and did not involve human participants or access to protected health information. Only deidentified textual case descriptions were submitted to GPT-5; no patient images, radiographs, protected health information, or identifiable data were entered into or transmitted through the external LLM platform. Case materials were reviewed before submission to confirm the absence of identifiable information. 
Because this study involved only publicly available, deidentified materials and did not involve human participants, it does not constitute human subjects research and, therefore, does not require institutional review board approval. Accordingly, informed consent was not required and participant compensation is not applicable. Prompts and outputs were stored on secure, password-protected institutional computers accessible only to the research team. This study was designed and reported according to the Standards for Reporting Diagnostic Accuracy (STARD) and STARD-AI principles [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>], and a completed checklist is provided in <xref ref-type="supplementary-material" rid="app2">Checklist 1</xref>.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Description of Periodontitis Cases</title><p>Of the 25 periodontitis cases collected for evaluating GPT-5, 2 (8%) were borderline cases with staging ambiguities between stage III and stage IV, and 3 (12%) provided only staging information in the original publications. Full case descriptions and corresponding sources are provided in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The median age of these patients was 45 (IQR 35-56) years, with most cases (17/25, 68%) occurring between 35 and 64 years. There were 3 participants aged 5-19 years, 4 aged 20-34 years, 9 aged 35-49 years, 8 aged 50-64 years, and 1 aged 65-80 years. Female participants comprised the majority (n=18, 72%), while male participants accounted for 28% (n=7) of the cohort. In terms of disease severity, stage III periodontitis was most common (n=14, 56%), followed by stage IV (n=9, 36%), stage II (n=1, 4%), and stage I (n=1, 4%). 
For grading, most cases were classified as grade C (n=17, 77%), with smaller proportions in grade B (n=5, 23%).</p></sec><sec id="s3-2"><title>Workflow for Evaluating GPT-5</title><p><xref ref-type="fig" rid="figure2">Figure 2</xref> illustrates the workflow used to assess GPT-5. For each dental clinical case, we applied a zero-shot prompting approach (<xref ref-type="fig" rid="figure2">Figure 2A</xref> [<xref ref-type="bibr" rid="ref30">30</xref>]), in which patient information and clinical details were extracted from publications and lightly reformatted to correct formatting artifacts introduced during PDF extraction for GPT-5 input. An example prompt for a male patient aged 56 years is provided in <xref ref-type="fig" rid="figure2">Figure 2A</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Framework for evaluating GPT-5 performance in periodontitis staging and grading. (A) A prompt was constructed from a publicly available case [<xref ref-type="bibr" rid="ref30">30</xref>], with minor formatting adjustments (eg, line breaks and spacing) to improve readability for the model while preserving the original content. (B) GPT-5 generated categorical outputs for periodontitis stage (I-IV) and grade (A-C). (C) Model predictions were compared with the clinical reference diagnoses reported in the original publication to assess accuracy.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e88407_fig02.png"/></fig><p>The prompt was then submitted to GPT-5, which was asked to determine the stage and grade of periodontitis. The model&#x2019;s response was collected (<xref ref-type="fig" rid="figure2">Figure 2B</xref>), recording only the predicted stage and grade while disregarding the diagnostic reasoning. 
The output was compared against the clinical diagnosis to evaluate accuracy (<xref ref-type="fig" rid="figure2">Figure 2C</xref>).</p><p>After all cases were processed, GPT-5&#x2019;s predictions were aggregated, and performance metrics were calculated to summarize its diagnostic accuracy and Cohen &#x03BA; and assess its potential use in real-world clinical settings.</p></sec><sec id="s3-3"><title>GPT-5 Performance in Periodontitis Staging and Grading</title><p>Across the 25 periodontitis cases (including the 2 borderline cases), GPT-5 achieved a staging accuracy of 68% (17/25; 95% CI 48.4%-82.8%) and a grading accuracy of 77.3% (17/22; 95% CI 56.6%-89.9%), excluding 3 (12%) cases without specified grades. The corresponding Cohen &#x03BA; values were 0.454 for staging (95% CI 0.110-0.756) and 0.179 for grading (95% CI &#x2212;0.158 to 0.638). These findings indicate fair agreement beyond chance for staging and poor agreement for grading. The relatively wide CIs for both outcomes reflect substantial uncertainty attributable to the small sample size and class imbalance. Additional performance metrics for GPT-5, including weighted Cohen &#x03BA;, balanced and stratified accuracy, and macro <italic>F</italic><sub>1</sub>-score, are provided in Tables S2 and S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>The confusion matrix in <xref ref-type="fig" rid="figure3">Figure 3A</xref> shows that all stage I and II cases were classified correctly (recall=100%), whereas recall was substantially lower for stage III (57%) and stage IV (78%). For grading, performance was markedly imbalanced, with high recall for grade C (94%) but very low recall for grade B (20%; <xref ref-type="fig" rid="figure3">Figure 3B</xref>). 
This class-dependent performance indicates that GPT-5 performs well for early-stage periodontitis and advanced disease detection but struggles to reliably distinguish intermediate disease categories.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Confusion matrices of GPT-5 predictions for periodontitis staging and grading.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e88407_fig03.png"/></fig><p>Importantly, misclassifications exhibited nonrandom patterns: errors were confined to adjacent categories, with no stage I or II cases misclassified as stage III or IV, no stage IV cases misclassified as stage I or II (<xref ref-type="fig" rid="figure3">Figure 3A</xref>), and no &#x201C;skipping&#x201D; across nonadjacent categories for grading (<xref ref-type="fig" rid="figure3">Figure 3B</xref>). Because all errors were limited to adjacent stages and grades and no catastrophic misclassifications were observed, these patterns are clinically and educationally relevant.</p><p>Among the misclassified cases, 43% (6/14) of stage III cases were predicted as stage IV, while 22% (2/9) of stage IV cases were predicted as stage III (<xref ref-type="fig" rid="figure3">Figure 3A</xref>). For grading (<xref ref-type="fig" rid="figure3">Figure 3B</xref>), GPT-5 correctly classified 94% (16/17) of grade C cases, with most (4/5) grade B cases misclassified as grade C. These results suggest that GPT-5 tends to assign higher severity for both periodontitis stage and grade. GPT-5&#x2019;s predictions for the 25 cases are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p><xref ref-type="table" rid="table1">Table 1</xref> and <xref ref-type="fig" rid="figure4">Figure 4</xref> present contextual comparisons with 2 previous assessments. 
Tastan Eroglu et al [<xref ref-type="bibr" rid="ref4">4</xref>] evaluated GPT-3.5 on 200 untreated patients and reported 59.5% accuracy in staging and 50.5% accuracy in grading&#x2014;both notably lower than the performance achieved by GPT-5 on our dataset. However, this comparison should be interpreted cautiously because the datasets and evaluation protocols differ across studies. A direct head-to-head comparison was not feasible due to the unavailability of the datasets used in the previous study. The low &#x03BA; values observed for both models (0.284 for GPT-3.5 and 0.179 for GPT-5) in <xref ref-type="table" rid="table1">Table 1</xref> indicate that current LLMs have limited discriminatory ability in grading periodontitis beyond chance agreement.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Recent studies of periodontitis staging and grading using textual input.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Study</td><td align="left" valign="bottom">Dataset<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom">Model</td><td align="left" valign="bottom" colspan="2">Accuracy (%)</td><td align="left" valign="bottom" colspan="2">Cohen &#x03BA;</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom">Stage</td><td align="left" valign="bottom">Grade</td><td align="left" valign="bottom">Stage</td><td align="left" valign="bottom">Grade</td></tr></thead><tbody><tr><td align="left" valign="top">Ameli et al [<xref ref-type="bibr" rid="ref31">31</xref>]</td><td align="left" valign="top">309 periodontal charts and clinical notes</td><td align="left" valign="top">Bidirectional Encoder Representations from Transformers (BERT)</td><td align="left" valign="top">77</td><td align="left" valign="top">75</td><td align="left" valign="top">N/A<sup><xref 
ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top">Tastan Eroglu et al [<xref ref-type="bibr" rid="ref4">4</xref>]</td><td align="left" valign="top">200 patients with untreated periodontitis</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">59.5</td><td align="left" valign="top">50.5</td><td align="left" valign="top">0.447</td><td align="left" valign="top">0.284</td></tr><tr><td align="left" valign="top">Our result</td><td align="left" valign="top">25 dental clinical cases from public sources</td><td align="left" valign="top">GPT-5</td><td align="left" valign="top">68</td><td align="left" valign="top">77.3</td><td align="left" valign="top">0.454</td><td align="left" valign="top">0.179</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>The findings from the first 2 studies were extracted from published papers, while the details of our own assessment results are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></fn><fn id="table1fn2"><p><sup>b</sup>N/A: not available.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Comparison of Bidirectional Encoder Representations from Transformers (BERT), GPT-3.5, and GPT-5 performance in periodontitis staging and grading. Data are derived from <xref ref-type="table" rid="table1">Table 1</xref>. 
Error bars indicate 95% CIs for GPT-5, calculated from case-level data from Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>; CIs were not reported for BERT and GPT-3.5 in the original studies.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e88407_fig04.png"/></fig><p>Another study, summarized in <xref ref-type="table" rid="table1">Table 1</xref> and <xref ref-type="fig" rid="figure4">Figure 4</xref>, conducted by Ameli et al [<xref ref-type="bibr" rid="ref31">31</xref>] fine-tuned a Bidirectional Encoder Representations from Transformers (BERT) model using 309 anonymized periodontal charts and corresponding clinician notes. The model was trained on 70% of the data and tested on 32 holdout cases. The fine-tuned BERT model achieved 77% accuracy in staging and 75% in grading [<xref ref-type="bibr" rid="ref31">31</xref>]. Although these results fall within a similar range and are presented alongside GPT-5 for contextual reference, they should not be interpreted as evidence of relative model superiority, as BERT was trained and evaluated on periodontal charts and clinician notes, whereas GPT-5 and GPT-3.5 were assessed using standardized textual case descriptions without task-specific optimization.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This exploratory study evaluated GPT-5&#x2019;s ability to perform periodontitis staging and grading using publicly available clinical case descriptions based on the 2017 World Workshop classification framework. Overall, GPT-5 demonstrated moderate diagnostic performance and fair agreement beyond chance for staging (accuracy 68%; &#x03BA;=0.454), but substantially lower reliability for grading (&#x03BA;=0.179). In addition, the model exhibited a tendency to overestimate disease severity. 
These findings suggest that while GPT-5 is capable of applying guideline-based diagnostic criteria, important limitations remain in its ability to reliably distinguish disease grades.</p></sec><sec id="s4-2"><title>Interpretation and Implications of Findings</title><p>Although this study focused narrowly on GPT-5&#x2019;s performance in periodontitis staging and grading, the potential applications of LLMs extend more broadly to both clinical diagnostics and dental education. In clinical practice, LLMs could assist practitioners by consistently applying standardized staging and grading criteria, integrating charting and radiographic data, and generating preliminary assessments under appropriate clinician oversight. In education, LLMs can function as personalized learning assistants, offering structured feedback on case analyses and helping students navigate diagnostic complexity. Beyond these applications, LLM-based chatbots hold potential for reducing gaps in dental educational resources, particularly in underresourced institutions, thereby strengthening their capacity to deliver high-quality dental education and care.</p><p>However, the observed diagnostic agreement, particularly for periodontitis grading, highlights an important limitation. The low Cohen &#x03BA; (&#x03BA;=0.179) for grading indicates poor agreement beyond chance, suggesting that GPT-5 currently lacks sufficient reliability to accurately distinguish between periodontitis grades. Consequently, GPT-5 is not yet suitable for independent clinical grading or treatment decision-making, and its outputs should be interpreted with caution. 
In addition, in the absence of established minimum clinically important difference thresholds for artificial intelligence (AI)&#x2013;assisted periodontitis staging and grading, &#x03BA;=0.454 for staging should be interpreted cautiously and not as evidence of clinical readiness.</p><p>Looking forward, GPT-5 and other LLMs are likely to continue to improve in diagnostic performance. Nevertheless, their meaningful clinical translation will hinge on overcoming current deficiencies in reliability and consistency. Beyond advancements in model development, a potential pathway toward high-stakes clinical and educational applications likely lies in the integration of LLMs with validated AI tools optimized specifically for clinical use. Such hybrid systems, which combine the precision of specialized diagnostic models with the reasoning, interpretability, and interactivity of LLMs, may provide more robust support for complex, multimodal clinical decision-making in dental care and education.</p></sec><sec id="s4-3"><title>Limitations</title><p>Our exploratory evaluation of GPT-5 relied solely on published cases, which represent a relatively limited sample. Moreover, given the small sample size and the gender imbalance within the data, the included cases may disproportionately reflect more severe or well-documented presentations, potentially inflating GPT-5&#x2019;s performance. In addition, publicly available case reports are often curated to highlight clear diagnostic features and may not accurately reflect the full clinical heterogeneity or noise encountered in real-world practice. As a result, such cases may underrepresent diverse patient populations and disease presentations commonly seen in routine clinical settings. Furthermore, the dataset was not constructed through a systematic review process and may, therefore, be subject to selection bias. 
Therefore, expanding the dataset in future studies to include larger, more diverse, and unpublished clinical data will be essential to broaden the scope of evaluation, strengthen generalizability, and support meaningful clinical translation.</p><p>Additional methodological constraints include our use of GPT-5&#x2019;s interactive interface, which does not allow modification of underlying system parameters such as temperature and may therefore restrict reproducibility at the system level. In addition, we did not evaluate model stability across alternative prompting strategies, which are known to influence LLM behavior. We restricted model inputs to textual case descriptions and did not evaluate GPT-5&#x2019;s multimodal capabilities for direct radiographic interpretation. While this design was intended to isolate guideline-based reasoning, it does not reflect the full multimodal nature of periodontal diagnosis and, therefore, may not represent real-world diagnostic performance using raw clinical data. Furthermore, GPT-5 was required to provide categorical staging and grading outputs, and we did not assess its ability to recognize diagnostic uncertainty or defer ambiguous cases. Only 2 borderline or equivocal cases were included, which is insufficient to evaluate GPT-5&#x2019;s performance in diagnostically challenging scenarios where clinician disagreement is common and clinical judgment plays a critical role. Moreover, performance was not tested on incomplete charts or unstructured clinical notes, which may better reflect real-world variability. 
Therefore, future studies should incorporate more diagnostically ambiguous cases, multimodal inputs, and heterogeneous clinical documentation and systematically compare prompting strategies to better evaluate model reliability and clinical relevance.</p><p>Finally, this exploratory study was conducted with consideration of established AI reporting guidelines, including STARD and STARD-AI [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>], DECIDE-AI [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>], and STROBE-AI [<xref ref-type="bibr" rid="ref34">34</xref>]. While the exploratory design and reliance on publicly available clinical cases, rather than real-time clinical data, precluded full adherence to all framework components, key principles such as transparency in data sources, model use, limitations, and reproducibility were followed. Future prospective studies using large-scale, real-world clinical data will be better positioned to fully implement these reporting standards.</p></sec><sec id="s4-4"><title>Conclusions</title><p>With the growing use of LLMs by dental and medical students, clinicians, and the general public, it is important to evaluate their performance in high-stakes diagnostic and educational settings to inform safety protocols and guide responsible applications. In this study, we assessed GPT-5&#x2019;s ability to stage and grade periodontitis&#x2014;tasks central to periodontal diagnosis and student training. Contextual comparisons with prior models suggested comparable or improved performance, with GPT-5 achieving 68% staging accuracy and 77.3% grading accuracy and a staging &#x03BA; of 0.454, indicating fair agreement beyond chance. 
However, these findings should be interpreted cautiously, as differences in datasets, case formats, and evaluation conditions preclude direct head-to-head comparisons with prior studies.</p><p>Despite these observed performances, the low &#x03BA; for grading (0.179) underscores the very limited discriminatory capacity in distinguishing periodontitis grades, indicating that GPT-5 is not yet suitable for independent clinical applications. Additionally, our results demonstrated a consistent tendency for GPT-5 to overestimate disease severity. Therefore, inappropriate reliance on model outputs could increase the risk of overtreatment or unnecessary escalation of care. This highlights the importance of human oversight and the need for future evaluations of uncertainty reporting, refusal behavior, and decision-level safeguards before any clinical integration is considered.</p><p>In conclusion, while GPT-5 demonstrated potential as a supportive tool for education and clinical exploration, it is not yet ready for autonomous use. Meaningful application in periodontal diagnosis and training will depend on substantial improvements in reliability and rigorous validation in larger, more diverse, and prospectively collected clinical datasets.</p></sec></sec></body><back><ack><p>The authors thank SeTonia Cook and Jacqueline Harding for their assistance with grant management. During the preparation of this manuscript, the authors used GPT-5 for minor editing and language polishing to improve clarity. 
All content was subsequently reviewed and revised by the authors, who take full responsibility for the final version of the publication.</p></ack><notes><sec><title>Funding</title><p>This research was funded by the National Institute on Minority Health and Health Disparities (grant U54MD007586), National Institute of Dental and Craniofacial Research (grant U01DE033241), National Institute of General Medical Sciences (grant R16GM149359), National Human Genome Research Institute (award UG3HG013248), National Institutes of Health (agreement 1OT2OD032581), Meharry&#x2019;s American Cancer Society (grant DICRIDG-21-071-01-DICRIDG), and Chan Zuckerberg Initiative (grant CZIF2022-007043). The views and conclusions contained in this paper are those of the authors and should not be interpreted as representing the official policies, either expressed or implied, of the NIH, CZI, or Meharry Medical College.</p></sec><sec><title>Data Availability</title><p>Data are supplied in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, available for download along with the published manuscript.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">BERT</term><def><p>Bidirectional Encoder Representations from Transformers</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">MeSH</term><def><p>Medical Subject Headings</p></def></def-item><def-item><term id="abb5">STARD</term><def><p>Standards for Reporting Diagnostic Accuracy</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>GPT-5 system card</article-title><source>OpenAI</source><year>2025</year><access-date>2026-03-21</access-date><comment><ext-link 
ext-link-type="uri" xlink:href="https://cdn.openai.com/gpt-5-system-card.pdf">https://cdn.openai.com/gpt-5-system-card.pdf</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Vaid</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Comparing ChatGPT and GPT-4 performance in USMLE soft skill assessments</article-title><source>Sci Rep</source><year>2023</year><month>10</month><day>1</day><volume>13</volume><issue>1</issue><fpage>16492</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-43436-9</pub-id><pub-id pub-id-type="medline">37779171</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tastan Eroglu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Babayigit</surname><given-names>O</given-names> </name><name name-style="western"><surname>Ozkan 
Sen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ucan Yarkac</surname><given-names>F</given-names> </name></person-group><article-title>Performance of ChatGPT in classifying periodontitis according to the 2018 classification of periodontal diseases</article-title><source>Clin Oral Investig</source><year>2024</year><month>06</month><day>29</day><volume>28</volume><issue>7</issue><fpage>407</fpage><pub-id pub-id-type="doi">10.1007/s00784-024-05799-9</pub-id><pub-id pub-id-type="medline">38951256</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rahad</surname><given-names>K</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>K</given-names> </name><name name-style="western"><surname>Amugo</surname><given-names>I</given-names> </name><etal/></person-group><article-title>ChatGPT to enhance learning in dental education at a historically black medical college</article-title><source>Dent Res Oral Health</source><year>2024</year><volume>7</volume><issue>1</issue><fpage>8</fpage><lpage>14</lpage><pub-id pub-id-type="doi">10.26502/droh.0069</pub-id><pub-id pub-id-type="medline">38404561</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Ahmad</surname><given-names>B</given-names> </name><name name-style="western"><surname>Saleh</surname><given-names>K</given-names> </name><name name-style="western"><surname>Alharbi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Alqaderi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Jeong</surname><given-names>YN</given-names> </name></person-group><article-title>Artificial intelligence in periodontology: performance evaluation of ChatGPT, Claude, and Gemini on 
the in-service examination</article-title><source>medRxiv</source><comment>Preprint posted online on  May 31, 2024</comment><pub-id pub-id-type="doi">10.1101/2024.05.29.24308155</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Katz</surname><given-names>U</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>E</given-names> </name><name name-style="western"><surname>Shachar</surname><given-names>E</given-names> </name><etal/></person-group><article-title>GPT versus resident physicians &#x2014; a benchmark based on official board scores</article-title><source>NEJM AI</source><year>2024</year><month>04</month><day>12</day><volume>1</volume><issue>5</issue><pub-id pub-id-type="doi">10.1056/AIdbp2300192</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shahsavar</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Choudhury</surname><given-names>A</given-names> 
</name></person-group><article-title>User intentions to use ChatGPT for self-diagnosis and health-related purposes: cross-sectional survey study</article-title><source>JMIR Hum Factors</source><year>2023</year><month>05</month><day>17</day><volume>10</volume><fpage>e47564</fpage><pub-id pub-id-type="doi">10.2196/47564</pub-id><pub-id pub-id-type="medline">37195756</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Presiado</surname><given-names>M</given-names> </name><name name-style="western"><surname>Montero</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lopes</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hamel</surname><given-names>L</given-names> </name></person-group><article-title>KFF health misinformation tracking poll: artificial intelligence and health information</article-title><source>Kaiser Family Foundation</source><year>2024</year><access-date>2026-03-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.kff.org/public-opinion/kff-health-misinformation-tracking-poll-artificial-intelligence-and-health-information/">https://www.kff.org/public-opinion/kff-health-misinformation-tracking-poll-artificial-intelligence-and-health-information/</ext-link></comment></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kuroiwa</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sarcon</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ibara</surname><given-names>T</given-names> </name><etal/></person-group><article-title>The potential of ChatGPT as a self-diagnostic tool in common orthopedic diseases: exploratory study</article-title><source>J Med Internet 
Res</source><year>2023</year><month>09</month><day>15</day><volume>25</volume><fpage>e47621</fpage><pub-id pub-id-type="doi">10.2196/47621</pub-id><pub-id pub-id-type="medline">37713254</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Du</surname><given-names>D</given-names> </name><name name-style="western"><surname>Paluch</surname><given-names>R</given-names> </name><name name-style="western"><surname>Stevens</surname><given-names>G</given-names> </name><name name-style="western"><surname>M&#x00FC;ller</surname><given-names>C</given-names> </name></person-group><article-title>Exploring patient trust in clinical advice from AI-driven LLMs like ChatGPT for self-diagnosis</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 2, 2024</comment><pub-id pub-id-type="doi">10.48550/arxiv.2402.07920</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kisvarday</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yarahuan</surname><given-names>J</given-names> </name><etal/></person-group><article-title>ChatGPT use among pediatric health care providers: cross-sectional survey study</article-title><source>JMIR Form Res</source><year>2024</year><month>09</month><day>12</day><volume>8</volume><fpage>e56797</fpage><pub-id pub-id-type="doi">10.2196/56797</pub-id><pub-id pub-id-type="medline">39265163</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ozkan</surname><given-names>E</given-names> </name><name 
name-style="western"><surname>Tekin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ozkan</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Cabrera</surname><given-names>D</given-names> </name><name name-style="western"><surname>Niven</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dong</surname><given-names>Y</given-names> </name></person-group><article-title>Global health care professionals&#x2019; perceptions of large language model use in practice: cross-sectional survey study</article-title><source>JMIR Med Educ</source><year>2025</year><month>05</month><day>12</day><volume>11</volume><fpage>e58801</fpage><pub-id pub-id-type="doi">10.2196/58801</pub-id><pub-id pub-id-type="medline">40354644</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shifai</surname><given-names>N</given-names> </name><name name-style="western"><surname>van Doorn</surname><given-names>R</given-names> </name><name name-style="western"><surname>Malvehy</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sangers</surname><given-names>TE</given-names> </name></person-group><article-title>Can ChatGPT vision diagnose melanoma? 
An exploratory diagnostic accuracy study</article-title><source>J Am Acad Dermatol</source><year>2024</year><month>05</month><volume>90</volume><issue>5</issue><fpage>1057</fpage><lpage>1059</lpage><pub-id pub-id-type="doi">10.1016/j.jaad.2023.12.062</pub-id><pub-id pub-id-type="medline">38244612</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sattler</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Chetla</surname><given-names>N</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Evaluating the diagnostic accuracy of ChatGPT-4 Omni and ChatGPT-4 Turbo in identifying melanoma: comparative study</article-title><source>JMIR Dermatol</source><year>2025</year><month>03</month><day>21</day><volume>8</volume><fpage>e67551</fpage><pub-id pub-id-type="doi">10.2196/67551</pub-id><pub-id pub-id-type="medline">40117499</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cirone</surname><given-names>K</given-names> </name><name name-style="western"><surname>Akrout</surname><given-names>M</given-names> </name><name name-style="western"><surname>Abid</surname><given-names>L</given-names> </name><name name-style="western"><surname>Oakley</surname><given-names>A</given-names> </name></person-group><article-title>Assessing the utility of multimodal large language models (GPT-4 Vision and Large Language and Vision Assistant) in identifying melanoma across different skin tones</article-title><source>JMIR Dermatol</source><year>2024</year><month>03</month><day>13</day><volume>7</volume><fpage>e55508</fpage><pub-id pub-id-type="doi">10.2196/55508</pub-id><pub-id 
pub-id-type="medline">38477960</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Perlmutter</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Milkovich</surname><given-names>J</given-names> </name><name name-style="western"><surname>Fremont</surname><given-names>S</given-names> </name><name name-style="western"><surname>Datta</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mosa</surname><given-names>A</given-names> </name></person-group><article-title>Beyond the surface: assessing GPT-4&#x2019;s accuracy in detecting melanoma and suspicious skin lesions from dermoscopic images</article-title><source>Plast Surg (Oakv)</source><year>2025</year><month>02</month><day>18</day><fpage>22925503251315489</fpage><pub-id pub-id-type="doi">10.1177/22925503251315489</pub-id><pub-id pub-id-type="medline">39980664</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kavadella</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dias da Silva</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Kaklamanos</surname><given-names>EG</given-names> </name><name name-style="western"><surname>Stamatopoulos</surname><given-names>V</given-names> </name><name name-style="western"><surname>Giannakopoulos</surname><given-names>K</given-names> </name></person-group><article-title>Evaluation of ChatGPT&#x2019;s real-life implementation in undergraduate dental education: mixed methods study</article-title><source>JMIR Med Educ</source><year>2024</year><month>01</month><day>31</day><volume>10</volume><fpage>e51344</fpage><pub-id pub-id-type="doi">10.2196/51344</pub-id><pub-id 
pub-id-type="medline">38111256</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eke</surname><given-names>PI</given-names> </name><name name-style="western"><surname>Thornton-Evans</surname><given-names>GO</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>L</given-names> </name><name name-style="western"><surname>Borgnakke</surname><given-names>WS</given-names> </name><name name-style="western"><surname>Dye</surname><given-names>BA</given-names> </name><name name-style="western"><surname>Genco</surname><given-names>RJ</given-names> </name></person-group><article-title>Periodontitis in US adults: National Health and Nutrition Examination Survey 2009-2014</article-title><source>J Am Dent Assoc</source><year>2018</year><month>07</month><volume>149</volume><issue>7</issue><fpage>576</fpage><lpage>588</lpage><pub-id pub-id-type="doi">10.1016/j.adaj.2018.04.023</pub-id><pub-id pub-id-type="medline">29957185</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tonetti</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Greenwell</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kornman</surname><given-names>KS</given-names> </name></person-group><article-title>Staging and grading of periodontitis: framework and proposal of a new classification and case definition</article-title><source>J Clin Periodontol</source><year>2018</year><month>06</month><volume>45 Suppl 20</volume><fpage>S149</fpage><lpage>S161</lpage><pub-id pub-id-type="doi">10.1111/jcpe.12945</pub-id><pub-id pub-id-type="medline">29926495</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Caton</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Armitage</surname><given-names>G</given-names> </name><name name-style="western"><surname>Berglundh</surname><given-names>T</given-names> </name><etal/></person-group><article-title>A new classification scheme for periodontal and peri-implant diseases and conditions - introduction and key changes from the 1999 classification</article-title><source>J Periodontol</source><year>2018</year><month>06</month><volume>89 Suppl 1</volume><issue>S1</issue><fpage>S1</fpage><lpage>S8</lpage><pub-id pub-id-type="doi">10.1002/JPER.18-0157</pub-id><pub-id pub-id-type="medline">29926946</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chapple</surname><given-names>IL</given-names> </name><name name-style="western"><surname>Mealey</surname><given-names>BL</given-names> </name><name name-style="western"><surname>Van Dyke</surname><given-names>TE</given-names> </name><etal/></person-group><article-title>Periodontal health and gingival diseases and conditions on an intact and a reduced periodontium: consensus report of workgroup 1 of the 2017 World Workshop on the Classification of Periodontal and Peri-Implant Diseases and Conditions</article-title><source>J Periodontol</source><year>2018</year><month>06</month><volume>89 Suppl 1</volume><issue>S1</issue><fpage>S74</fpage><lpage>S84</lpage><pub-id pub-id-type="doi">10.1002/JPER.17-0719</pub-id><pub-id pub-id-type="medline">29926944</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Amugo</surname><given-names>I</given-names> </name><name 
name-style="western"><surname>Rajakaruna</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Evaluating GPT-5 for melanoma detection using dermoscopic images</article-title><source>Diagnostics (Basel)</source><year>2025</year><month>11</month><day>29</day><volume>15</volume><issue>23</issue><fpage>3052</fpage><pub-id pub-id-type="doi">10.3390/diagnostics15233052</pub-id><pub-id pub-id-type="medline">41374433</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>G</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>A hitchhiker&#x2019;s guide to jailbreaking ChatGPT via prompt engineering</article-title><conf-name>SEA4DQ &#x2019;24: 4th International Workshop on Software Engineering and AI for Data Quality in Cyber-Physical Systems/Internet of Things</conf-name><conf-date>Jul 15, 2024</conf-date><pub-id pub-id-type="doi">10.1145/3663530.3665021</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sahoo</surname><given-names>P</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Saha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jain</surname><given-names>V</given-names> </name><name name-style="western"><surname>Mondal</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chadha</surname><given-names>A</given-names> </name></person-group><article-title>A systematic survey of prompt engineering in large language models: techniques and 
applications</article-title><source>arXiv</source><comment>Preprint posted online on Feb 5, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.07927</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Qi</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zeng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Fine-tuning aligned language models compromises safety, even when users do not intend to!</article-title><source>arXiv</source><comment>Preprint posted online on Oct 5, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.03693</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bossuyt</surname><given-names>PM</given-names> </name><name name-style="western"><surname>Reitsma</surname><given-names>JB</given-names> </name><name name-style="western"><surname>Bruns</surname><given-names>DE</given-names> </name><etal/></person-group><article-title>STARD 2015: an updated list of essential items for reporting diagnostic accuracy studies</article-title><source>BMJ</source><year>2015</year><month>10</month><day>28</day><volume>351</volume><fpage>h5527</fpage><pub-id pub-id-type="doi">10.1136/bmj.h5527</pub-id><pub-id pub-id-type="medline">26511519</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sounderajah</surname><given-names>V</given-names> </name><name name-style="western"><surname>Guni</surname><given-names>A</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> 
</name><etal/></person-group><article-title>The STARD-AI reporting guideline for diagnostic accuracy studies using artificial intelligence</article-title><source>Nat Med</source><year>2025</year><month>10</month><volume>31</volume><issue>10</issue><fpage>3283</fpage><lpage>3289</lpage><pub-id pub-id-type="doi">10.1038/s41591-025-03953-8</pub-id><pub-id pub-id-type="medline">40954311</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Meyerson</surname><given-names>J</given-names> </name></person-group><article-title>Perio classification for the INBDE</article-title><source>Bootcamp.com</source><year>2025</year><access-date>2026-03-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://bootcamp.com/blog/bootcamp-coms-perio-classification-for-the-inbde">https://bootcamp.com/blog/bootcamp-coms-perio-classification-for-the-inbde</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ameli</surname><given-names>N</given-names> </name><name name-style="western"><surname>Firoozi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Gibson</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lai</surname><given-names>H</given-names> </name></person-group><article-title>Classification of periodontitis stage and grade using natural language processing techniques</article-title><source>PLOS Digit Health</source><year>2024</year><month>12</month><day>13</day><volume>3</volume><issue>12</issue><fpage>e0000692</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000692</pub-id><pub-id pub-id-type="medline">39671337</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Vasey</surname><given-names>B</given-names> </name><name name-style="western"><surname>Novak</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ather</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ibrahim</surname><given-names>M</given-names> </name><name name-style="western"><surname>McCulloch</surname><given-names>P</given-names> </name></person-group><article-title>DECIDE-AI: a new reporting guideline and its relevance to artificial intelligence studies in radiology</article-title><source>Clin Radiol</source><year>2023</year><month>02</month><volume>78</volume><issue>2</issue><fpage>130</fpage><lpage>136</lpage><pub-id pub-id-type="doi">10.1016/j.crad.2022.09.131</pub-id><pub-id pub-id-type="medline">36639172</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>DECIDE-AI Steering Group</collab></person-group><article-title>DECIDE-AI: new reporting guidelines to bridge the development-to-implementation gap in clinical artificial intelligence</article-title><source>Nat Med</source><year>2021</year><month>02</month><volume>27</volume><issue>2</issue><fpage>186</fpage><lpage>187</lpage><pub-id pub-id-type="doi">10.1038/s41591-021-01229-5</pub-id><pub-id pub-id-type="medline">33526932</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>von Elm</surname><given-names>E</given-names> </name><name name-style="western"><surname>Altman</surname><given-names>DG</given-names> </name><name name-style="western"><surname>Egger</surname><given-names>M</given-names> </name><etal/></person-group><article-title>The Strengthening the Reporting of Observational Studies in Epidemiology (STROBE) statement: guidelines for reporting observational 
studies</article-title><source>J Clin Epidemiol</source><year>2008</year><month>04</month><volume>61</volume><issue>4</issue><fpage>344</fpage><lpage>349</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2007.11.008</pub-id><pub-id pub-id-type="medline">18313558</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>GPT-5&#x2019;s performance in periodontitis staging and grading on the textual input of 25 clinical cases.</p><media xlink:href="formative_v10i1e88407_app1.docx" xlink:title="DOCX File, 60 KB"/></supplementary-material><supplementary-material id="app2"><label>Checklist 1</label><p>STARD and STARD-AI checklist for reporting GPT-5 diagnostic accuracy.</p><media xlink:href="formative_v10i1e88407_app2.pdf" xlink:title="PDF File, 324 KB"/></supplementary-material></app-group></back></article>