<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="letter"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e53335</article-id><article-id pub-id-type="doi">10.2196/53335</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Letter</subject></subj-group></article-categories><title-group><article-title>Guideline-Incorporated Large Language Model-Driven Evaluation of Medical Records Using MedCheckLLM</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Schubert</surname><given-names>Marc Cicero</given-names></name><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Soyka</surname><given-names>Stella</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wick</surname><given-names>Wolfgang</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Venkataramani</surname><given-names>Varun</given-names></name><degrees>MD, PhD</degrees></contrib></contrib-group><aff id="aff1"><institution>Department of 
Neurology, University Hospital Heidelberg</institution><addr-line>Im Neuenheimer Feld 400</addr-line><addr-line>Heidelberg</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Chrimes</surname><given-names>Dillon</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Han</surname><given-names>Peijin</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Varun Venkataramani, MD, PhD, Department of Neurology, University Hospital Heidelberg, Im Neuenheimer Feld 400, Heidelberg, 69120, Germany, 49 6221548630; <email>varun.venkataramani@med.uni-heidelberg.de</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>24</day><month>4</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e53335</elocation-id><history><date date-type="received"><day>03</day><month>10</month><year>2023</year></date><date date-type="rev-recd"><day>02</day><month>11</month><year>2024</year></date><date date-type="accepted"><day>17</day><month>11</month><year>2024</year></date></history><copyright-statement>&#x00A9; Marc Cicero Schubert, Stella Soyka, Wolfgang Wick, Varun Venkataramani. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 24.4.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e53335"/><abstract><sec><title>Abstract</title><p>The study introduces MedCheckLLM, a large language model&#x2013;driven framework that enhances medical record evaluation through a guideline-in-the-loop approach by integrating evidence-based guidelines.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>AI</kwd><kwd>electronic medical records</kwd><kwd>checklists</kwd><kwd>LLM</kwd><kwd>language model</kwd><kwd>NLP</kwd><kwd>natural language processing</kwd><kwd>records</kwd><kwd>documentation</kwd><kwd>documents</kwd><kwd>framework</kwd><kwd>conceptual</kwd><kwd>machine learning</kwd><kwd>artificial intelligence</kwd><kwd>evidence</kwd><kwd>evaluate</kwd><kwd>evaluation</kwd><kwd>guideline</kwd><kwd>health care</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Large language models (LLMs) have demonstrated enormous potential in assessing complex datasets in health care across many applications [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. 
One underexplored area is their application for the reliable evaluation of medical documents. The automated evaluation of these documents has the potential to enhance patient safety. The system&#x2019;s reasoning process must be (1) transparent and comprehensible to human evaluators and (2) guided by established medical guidelines proven to increase patient safety [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>In this study, we introduce a framework that consists of a multistep approach for medical record evaluation that incorporates guidelines into the evaluation process (ie, guideline-in-the-loop). Our proposed algorithm, MedCheckLLM, is an LLM-driven, structured reasoning mechanism designed to automate the evaluation of medical records against evidence-based guidelines. The guidelines are deterministically accessed and returned to the LLM as input without further model fine-tuning. This strict separation of LLM and guidelines is expected to increase the validity and interpretability of the evaluations. The approach&#x2019;s step-by-step structure could improve transparency in clinical applications. The primary objective of this research is to introduce the conceptual framework and assess its feasibility.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>The MedCheckLLM algorithm begins by extracting a patient&#x2019;s diagnosis from the medical report (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Based on the diagnosis, it suggests an appropriate guideline. A human medical expert makes the final guideline selection. Guidelines are then accessed independently of the LLM&#x2019;s mechanisms using programmatically built interfaces for guideline retrieval. Subsequently, guidelines are provided as input to the LLM and are either identified as already formatted in a usable checklist or converted into a checklist. The LLM then uses this diagnosis-specific checklist to assess the medical report, with a final verification by a human medical expert. 
To test this approach, we used expert-validated simulated medical reports (simulated dataset) and physician-generated medical reports (physician dataset). Performance was analyzed on the simulated dataset for patient histories with headaches, using guidelines from the International Headache Society, and on the physician dataset for four further neurological diagnoses (ie, border zone infarction, meningitis, neuromyelitis optica, and subarachnoid hemorrhage). The validity of this method was further analyzed by evaluating generated doctor&#x2019;s notes with a correct diagnosis compared to doctor&#x2019;s notes with a false diagnosis. The LLMs GPT-4 and Claude-3 were used for testing (see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Workflow of MedCheckLLM. A medical report including medical history, diagnosis, and treatment is provided as input. First, the LLM identifies the given diagnosis. Second, it suggests a medical guideline for evaluation of the medical report, with a human medical expert making the final selection. Then, independently of the LLM, the selected guideline is accessed, and diagnosis-specific text is extracted and inputted into the LLM. Subsequently, the LLM determines whether the input guideline text is in checklist format; if not, it extracts a checklist. Using this diagnosis-specific checklist, the LLM evaluates the medical report. Finally, a human expert assesses the LLM evaluation. Dashed arrow: Checklist extraction instead of detection of checklist format. Blue box: Component uses an LLM. 
Green box: Component does not use an LLM.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e53335_fig01.png"/></fig></sec><sec id="s3" sec-type="results"><title>Results</title><p>We evaluated the medical report analysis conducted by MedCheckLLM for various headache diagnoses. In the simulated dataset, MedCheckLLM (based on GPT-4 and Claude-3, <xref ref-type="table" rid="table1">Table 1</xref>) extracted the specified diagnosis correctly in 100% of cases from a list of 61 possible diagnoses from The International Classification of Headache Disorders-3 [<xref ref-type="bibr" rid="ref4">4</xref>]. The model suggested existing evidence-based guidelines in 70.6% (12/17) of medical reports and detected the format of the guidelines as checklists in 100% of the cases (N=17). MedCheckLLM accurately evaluated 87% (67/77) of checklist items. Performance on the physician dataset showed an accurate evaluation in 77.4% (24/31) of checklist items (<xref ref-type="table" rid="table2">Table 2</xref>). 
It identified incorrect diagnoses where the stated diagnosis did not align with the content of the doctor&#x2019;s letters in 94.1% (16/17) of cases, while it correctly recognized 100% (N=17) of letters with matching diagnoses.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Performance of MedCheckLLM on the simulated dataset.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Elements of algorithmic structure</td><td align="left" valign="bottom">GPT-4 performance, % (n/N)</td><td align="left" valign="bottom">Claude-3 performance, % (n/N)</td><td align="left" valign="bottom">Explanation of specific task of each element</td></tr></thead><tbody><tr><td align="left" valign="top">Extracting stated diagnosis</td><td align="left" valign="top">100 (17/17)</td><td align="left" valign="top">100 (17/17)</td><td align="left" valign="top">Extract the diagnosis that is stated in the medical report</td></tr><tr><td align="left" valign="top">Suggestion of existing guidelines</td><td align="left" valign="top">70.6 (12/17)</td><td align="left" valign="top">58.8 (10/17)</td><td align="left" valign="top">Suggest a guideline that should be used to evaluate the medical report</td></tr><tr><td align="left" valign="top">Detection of checklist</td><td align="left" valign="top">100 (17/17)</td><td align="left" valign="top">100 (17/17)</td><td align="left" valign="top">Detect whether the accessed guidelines are in a structured checklist-criteria format</td></tr><tr><td align="left" valign="top">Evaluation of diagnostic criteria (checklist items)</td><td align="left" valign="top">87 (67/77)</td><td align="left" valign="top">83.8 (62/74)</td><td align="left" valign="top">Assess whether the criteria listed in the checklist are met in the medical report</td></tr><tr><td align="left" valign="top">Evaluation of letters with correct diagnosis (clinical descriptions and diagnosis match)</td><td align="left" 
valign="top">100 (17/17)</td><td align="left" valign="top">94.1 (16/17)</td><td align="left" valign="top">Assess whether the diagnosis stated in the medical report aligns with the clinical descriptions</td></tr><tr><td align="left" valign="top">Evaluation of letters with false diagnosis (clinical descriptions and diagnosis do not match)</td><td align="left" valign="top">94.1 (16/17)</td><td align="left" valign="top">94.1 (16/17)</td><td align="left" valign="top">Evaluate whether the diagnosis that is stated in the medical report fits the clinical descriptions</td></tr></tbody></table></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Performance of MedCheckLLM on the physician dataset.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Element of algorithmic structure</td><td align="left" valign="top">Stroke</td><td align="left" valign="top">Meningitis</td><td align="left" valign="top">Neuromyelitis optica</td><td align="left" valign="top">Subarachnoid hemorrhage</td></tr></thead><tbody><tr><td align="left" valign="top">Extracting stated diagnosis</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td></tr><tr><td align="left" valign="top">Suggestion of existing guidelines<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">Yes, applicable</td><td align="left" valign="top">Yes, partially applicable</td><td align="left" valign="top">Yes, applicable</td><td align="left" valign="top">Yes, partially applicable</td></tr><tr><td align="left" valign="top">Creation of checklist, level of detail<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">Yes, moderate detail</td><td align="left" valign="top">Yes, moderate detail</td><td align="left" valign="top">Yes, thorough detail</td><td align="left" 
valign="top">Yes, minimal detail</td></tr><tr><td align="left" valign="top">Evaluation of diagnostic criteria, % (n/N)</td><td align="left" valign="top">100 (7/7)</td><td align="left" valign="top">66.7 (4/6)</td><td align="left" valign="top">87.5 (7/8)</td><td align="left" valign="top">60 (6/10)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>The responses were classified as yes and either applicable or partially applicable (suggestion of existing guidelines), or as yes with minimal, moderate, or thorough detail (creation of checklist).</p></fn></table-wrap-foot></table-wrap></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>The framework of MedCheckLLM represents a promising approach for a comprehensive, guideline-anchored review of electronic health records. It holds the potential to function as a quality assurance framework throughout patient care due to its advantages of separate partitioning of the LLM and the guidelines, rather than training guidelines into an LLM. The flexibility of this approach allows for immediate implementation of guideline updates or the option to implement customized protocols for subgroups of patients. Due to the checklist-based approach, each item can be verified individually, thus increasing the algorithm&#x2019;s interpretability, which is crucial in health care settings [<xref ref-type="bibr" rid="ref5">5</xref>]. Due to the LLM&#x2019;s subpar guideline suggestion capability, medical experts are integrated at this step to ensure that established guidelines are used. Further research is essential to advance the development of LLM-driven methods for extracting checklists from unstructured guidelines, as well-structured guidelines are crucial for detailed, high-quality checklists. Further, this framework facilitates improved data mining practices in electronic health records [<xref ref-type="bibr" rid="ref6">6</xref>]. 
In the future, it is crucial to address privacy concerns to ensure the ethical application of these powerful tools in real-world clinical settings [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>].</p></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: Potential for AI-assisted medical education using large language models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schubert</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Wick</surname><given-names>W</given-names> </name><name name-style="western"><surname>Venkataramani</surname><given-names>V</given-names> </name></person-group><article-title>Performance of Large Language Models on a Neurology Board-Style Examination</article-title><source>JAMA Netw Open</source><year>2023</year><month>12</month><day>1</day><volume>6</volume><issue>12</issue><fpage>e2346721</fpage><pub-id 
pub-id-type="doi">10.1001/jamanetworkopen.2023.46721</pub-id><pub-id pub-id-type="medline">38060223</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thomassen</surname><given-names>O</given-names> </name><etal/></person-group><article-title>The effects of safety checklists in medicine: a systematic review</article-title><source>Acta Anaesthesiol Scand</source><year>2014</year><month>01</month><volume>58</volume><issue>1</issue><fpage>5</fpage><lpage>18</lpage><pub-id pub-id-type="doi">10.1111/aas.12207</pub-id><pub-id pub-id-type="medline">24116973</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><article-title>Headache Classification Committee of the International Headache Society (IHS) The International Classification of Headache Disorders, 3rd edition</article-title><source>Cephalalgia</source><year>2018</year><month>01</month><volume>38</volume><issue>1</issue><fpage>1</fpage><lpage>211</lpage><pub-id pub-id-type="doi">10.1177/0333102417738202</pub-id><pub-id pub-id-type="medline">29368949</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Amann</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Explainability for artificial intelligence in healthcare: a multidisciplinary perspective</article-title><source>BMC Med Inform Decis Mak</source><year>2020</year><month>11</month><day>30</day><volume>20</volume><issue>1</issue><fpage>310</fpage><pub-id pub-id-type="doi">10.1186/s12911-020-01332-6</pub-id><pub-id pub-id-type="medline">33256715</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Jiang</surname><given-names>LY</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>XC</given-names> </name><name name-style="western"><surname>Nejatian</surname><given-names>NP</given-names> </name><etal/></person-group><article-title>Health system-scale language models are all-purpose prediction engines</article-title><source>Nature</source><year>2023</year><month>07</month><volume>619</volume><issue>7969</issue><fpage>357</fpage><lpage>362</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06160-y</pub-id><pub-id pub-id-type="medline">37286606</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mesk&#x00F3;</surname><given-names>B</given-names> </name></person-group><article-title>The Impact of Multimodal Large Language Models on Health Care&#x2019;s Future</article-title><source>J Med Internet Res</source><year>2023</year><month>11</month><day>2</day><volume>25</volume><fpage>e52865</fpage><pub-id pub-id-type="doi">10.2196/52865</pub-id><pub-id pub-id-type="medline">37917126</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dorr</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>L</given-names> </name><name name-style="western"><surname>Emb&#x00ED;</surname><given-names>P</given-names> </name></person-group><article-title>Harnessing the Promise of Artificial Intelligence Responsibly</article-title><source>JAMA</source><year>2023</year><month>04</month><day>25</day><volume>329</volume><issue>16</issue><fpage>1347</fpage><lpage>1348</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.2771</pub-id><pub-id pub-id-type="medline">36972068</pub-id></nlm-citation></ref><ref 
id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name></person-group><article-title>Ethical Considerations of Using ChatGPT in Health Care</article-title><source>J Med Internet Res</source><year>2023</year><month>08</month><day>11</day><volume>25</volume><fpage>e48009</fpage><pub-id pub-id-type="doi">10.2196/48009</pub-id><pub-id pub-id-type="medline">37566454</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Large language models used in this study.</p><media xlink:href="formative_v9i1e53335_app1.pdf" xlink:title="PDF File, 68 KB"/></supplementary-material></app-group></back></article>