<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e77114</article-id><article-id pub-id-type="doi">10.2196/77114</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Integrating GPT-4o Into Data Mining in Neurosurgery: Feasibility and Proof-of-Concept Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Almeida Sales</surname><given-names>Arthur Henrique</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Beck</surname><given-names>J&#x00FC;rgen</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Grauvogel</surname><given-names>J&#x00FC;rgen</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Department of Neurosurgery, Faculty of Medicine, University of Freiburg</institution><addr-line>Breisacher Str. 64</addr-line><addr-line>Freiburg</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Elbattah</surname><given-names>Mahmoud</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Wei</surname><given-names>Yujia</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Arthur Henrique Almeida Sales, MD, Department of Neurosurgery, Faculty of Medicine, University of Freiburg, Breisacher Str. 64, Freiburg, 79106, Germany, 49 761-270 ext 50010; <email>salesarthur2@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>9</day><month>3</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e77114</elocation-id><history><date date-type="received"><day>07</day><month>05</month><year>2025</year></date><date date-type="rev-recd"><day>14</day><month>11</month><year>2025</year></date><date date-type="accepted"><day>17</day><month>11</month><year>2025</year></date></history><copyright-statement>&#x00A9; Arthur Henrique Almeida Sales, J&#x00FC;rgen Beck, J&#x00FC;rgen Grauvogel. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 9.3.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e77114"/><abstract><sec><title>Background</title><p>Large language models offer new possibilities for transforming unstructured clinical text into structured datasets. However, their performance in specialized and complex documentation environments, such as neurosurgery, remains insufficiently characterized. GPT-4o is a large language model with enhanced natural language capabilities, but its accuracy in extracting structured data from neurosurgical reports has not been systematically assessed.</p></sec><sec><title>Objective</title><p>This proof-of-concept study evaluated the feasibility and accuracy of GPT-4o for extracting predefined structured variables from unstructured neurosurgical reports of patients with vestibular schwannoma. Specific aims were to measure accuracy across variable types, assess the impact of prompt refinement, and explore the model&#x2019;s potential utility for research-oriented data mining.</p></sec><sec sec-type="methods"><title>Methods</title><p>In this retrospective single-center study, 10 consecutive patients with histologically confirmed vestibular schwannoma who underwent surgery between August and December 2023 were included. Four anonymized German-language documents per patient (discharge, surgical, histopathology, and 3-month follow-up reports) were processed using GPT-4o. Seventeen variables were extracted using a standardized zero-shot prompt. Targeted prompt refinements were subsequently applied for variables with low baseline accuracy. Two board-certified neurosurgeons independently validated all outputs, with discrepancies resolved by a senior neurosurgeon. Accuracy metrics, 95% CIs (Wilson method), and descriptive comparisons between variable types were calculated.</p></sec><sec sec-type="results"><title>Results</title><p>GPT-4o achieved 100% accuracy for structured variables requiring minimal interpretation, including patient ID, date of birth, date of surgery, histopathological diagnosis, and World Health Organization grade. Several interpretative variables, such as symptoms at presentation, symptom type, symptom duration, extent of resection, and permanence of postoperative deficits, were also extracted with 100% accuracy. In contrast, intraoperative complications and new postoperative deficits were correctly identified in only 50% (5/10) of cases using the zero-shot prompt. After targeted prompt refinement, accuracy for these variables improved substantially, reaching 90% to 100% in most cases. 
The mean accuracy was highest for structured categorical variables (97.5%, SD 4.6%), intermediate for binary variables (80%, SD 27.4%), and lowest for conditional text variables (66.7%, SD 28.9%), without statistically significant differences (<italic>P</italic>=.25).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>GPT-4o demonstrated strong feasibility for structured data extraction from standardized neurosurgical reports, particularly for variables with limited semantic complexity. However, the high accuracy observed reflects a narrow and highly controlled context and should not be interpreted as evidence of general reliability across diverse clinical settings. Larger, multi-institutional, and multilingual studies are needed to determine broader applicability and potential clinical integration.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>data mining</kwd><kwd>GPT-4o</kwd><kwd>patient privacy</kwd><kwd>automated coding</kwd><kwd>health care analytics</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The advent of artificial intelligence (AI) in health care has revolutionized data processing, decision-making, and research methodologies. Among these innovations, natural language processing (NLP) models, such as GPT-4o (released in May 2024; OpenAI), represent a significant leap in handling unstructured data. With their capacity to synthesize, analyze, and structure information, large language models (LLMs) provide a promising avenue for transforming how clinicians and researchers interact with medical records and literature.</p><p>In neurosurgery, the challenge of efficiently extracting insights from diverse and voluminous data, including imaging reports, clinical notes, and surgical documentation, is particularly pronounced. Studies have demonstrated that NLP models can bridge these gaps, optimizing both clinical workflows and research efforts [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Furthermore, by automating labor-intensive processes, these tools may enhance patient care, reduce costs, and streamline health care operations [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>Despite the promise, challenges remain. Privacy concerns, data security, and the need for precise tuning of AI tools pose significant barriers to their widespread adoption [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>]. While LLMs present remarkable opportunities, their limitations must also be recognized. Common challenges include hallucination, biases inherited from training data, and overfitting in specialized domains. These issues have historically limited their practical use in health care, highlighting the need for thorough validation and responsible implementation.</p><p>This proof-of-concept study aimed to explore the application of GPT-4o as a data mining tool in neurosurgery. Patients with vestibular schwannoma were selected because their management involves standardized documentation, including surgical, pathological, and follow-up reports. This provides a consistent and well-defined framework for assessing the capability of GPT-4o to structure complex clinical data. The approach presented here may serve as a template for future applications across other neurosurgical conditions. 
Specifically, we sought to evaluate the model&#x2019;s ability to extract structured data from unstructured clinical reports and to assess its feasibility as an aid in research environments.</p><p>By leveraging GPT-4o&#x2019;s NLP capabilities, we hypothesized that neurosurgical data can be extracted and structured reliably, facilitating accurate data aggregation and analysis across heterogeneous clinical reports.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>This was a retrospective feasibility study conducted at a single tertiary neurosurgical center (Department of Neurosurgery at the University Hospital of Freiburg). The inclusion criteria comprised consecutive patients with histologically confirmed vestibular schwannoma who underwent surgical treatment between August and December 2023 and had complete documentation (discharge, surgical, pathology, and follow-up reports). Patients who lacked any of these reports were excluded from the analysis. <xref ref-type="fig" rid="figure1">Figure 1</xref> illustrates the data flow and extraction process.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Workflow of data extraction and validation process. AI: artificial intelligence; VS: vestibular schwannoma.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e77114_fig01.png"/></fig><p>The primary outcome of the study was the accuracy rate in extracting structured information from the provided unstructured data. The ground truth for this evaluation was established by a medical team, which manually reviewed and assessed each response provided by the AI model. This ensured that the AI&#x2019;s performance was rigorously compared to expert human judgment.</p><p>All personal data of both patients and health care professionals that were visible in the original documents were anonymized before being processed by the AI model (<xref ref-type="fig" rid="figure2">Figures 2</xref> and <xref ref-type="fig" rid="figure3">3</xref>). Consequently, it was not possible to identify any individual patients from the data provided to the AI model for analysis. It is also important to note that the original language of all medical reports was German, and these unstructured data were presented to the AI model in their original form for analysis and extraction.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Surgical report. This surgery report was used as a raw dataset for extracting and processing unstructured data into structured data. Personal data were omitted from the artificial intelligence tool to ensure compliance with ethical standards and protect patient privacy.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e77114_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Discharge report. The discharge report was used as a raw dataset for extracting and processing unstructured data into structured data.
Personal data were omitted from the artificial intelligence tool to ensure compliance with ethical standards and protect patient privacy.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e77114_fig03.png"/></fig></sec><sec id="s2-2"><title>Prompt Design</title><p>Prompts were designed iteratively, meaning they were developed, tested, and refined to improve clarity and extraction precision. The initial zero-shot prompt served as a baseline framework for structured data extraction. On the basis of qualitative assessment of model responses, targeted refinements were introduced to better define certain variables that were prone to misinterpretation.</p><p>For example, the variable &#x201C;intraoperative complications&#x201D; was explicitly redefined in the prompt as &#x201C;intraoperative damage to major vascular or neural structures,&#x201D; and &#x201C;new postoperative deficits&#x201D; were clarified as &#x201C;deficits developed after surgery, excluding preexisting ones.&#x201D; These refinements were introduced to standardize semantic interpretation and reduce ambiguity during data extraction.</p><p>Refinement rounds continued until no further gains in extraction accuracy were observed, while ensuring that definitions of other variables remained unaffected. All prompts were executed in GPT-4o.</p><p>This prompt shown in <xref ref-type="other" rid="box1">Textbox 1</xref> was used to generate a Microsoft Excel table with structured data extracted from medical reports written in natural language.</p><boxed-text id="box1"><title> Prompt used to generate structured data table.</title><p>I am going to upload medical reports from a few patients, and I want you to transform the textual data (unstructured) into structured data in an Excel table. The documents are: discharge report, surgery report, histopathology report, and outpatient follow-up report from 3 months post-surgery. I want you to extract data from these documents to create a structured table where each row represents a patient and each column a variable, according to the following scheme: If the information is not present in any of the provided documents, respond non applicable (n/a). The column structure is as follows:</p><list list-type="bullet"><list-item><p>Column 1: PIZ (patient identification number)</p></list-item><list-item><p>Column 2: Date of birth</p></list-item><list-item><p>Column 3: Date of surgery</p></list-item><list-item><p>Column 4: Symptom present at disease presentation (yes or no)</p></list-item><list-item><p>Column 5: If yes in column 4, specify the symptom; if no, respond &#x201C;n/a&#x201D;</p></list-item><list-item><p>Column 6: Time from symptom onset to surgery (in months)</p></list-item><list-item><p>Column 7: Intraoperative complication (yes or no)</p></list-item><list-item><p>Column 8: If yes in column 7, specify the complication; if no, respond &#x201C;n/a&#x201D;</p></list-item><list-item><p>Column 9: Postoperative deficits (yes or no)</p></list-item><list-item><p>Column 10: If yes in column 9, specify the deficit; if no, respond &#x201C;n/a&#x201D;</p></list-item><list-item><p>Column 11: Was the deficit permanent? 
(based on the 3-month outpatient follow-up report; yes or no)</p></list-item><list-item><p>Column 12: Histopathological diagnosis</p></list-item><list-item><p>Column 13: WHO tumor grade</p></list-item><list-item><p>Column 14: Tumor resection grade according to the 3-month follow-up report (1 for total resection, 2 for subtotal, 3 for recurrence)</p></list-item><list-item><p>Column 15: Preoperative Karnofsky score (based on the admission findings in the discharge report)</p></list-item><list-item><p>Column 16: Postoperative Karnofsky score (based on the findings upon discharge in the discharge report)</p></list-item><list-item><p>Column 17: Any new symptoms at the 3-month follow-up visit (based on the 3-month outpatient follow-up report)</p></list-item></list></boxed-text><p>This standardized prompt was used across all patient records to ensure consistency in data extraction, with the AI tasked with transforming unstructured medical data into a format suitable for subsequent analysis.</p></sec><sec id="s2-3"><title>Data Anonymization</title><p>All original medical reports were manually anonymized by the research team before being entered into the GPT-4o interface. Identifiers such as names and addresses were removed. The anonymized reports were then processed through the official ChatGPT web platform for data extraction. No identifiable or sensitive information was transmitted, stored, or shared, ensuring compliance with General Data Protection Regulation (GDPR) and the standards of the local ethics committee.</p></sec><sec id="s2-4"><title>Data Extraction</title><p>All analyses were performed using GPT-4o accessed through the official ChatGPT interface. At the time of data collection, research API (application programming interface) access was not available. The full dataset of 10 anonymized surgical cases was uploaded as a single compressed file for structured data extraction. Generated tables were manually reviewed and archived to ensure methodological traceability and reproducibility within the proof-of-concept framework.</p></sec><sec id="s2-5"><title>Validation Process</title><p>Two board-certified neurosurgeons independently performed manual data extraction from the anonymized medical reports before the GPT-4o analysis, ensuring blinding to the AI-generated results. After completion of both manual and AI-based extractions, the datasets were compared to establish the ground truth. In cases where the 2 reviewers disagreed, a third senior neurosurgeon adjudicated the final classification. Interrater agreement between the 2 primary reviewers was calculated using Cohen &#x03BA;.</p></sec><sec id="s2-6"><title>Sample Size</title><p>The sample size of 10 patients was chosen based on feasibility for this proof-of-concept design, as the objective was not to achieve statistical power but to test the model&#x2019;s capacity for accurate structured data extraction in a controlled, pilot environment.</p></sec><sec id="s2-7"><title>Statistical Analysis</title><p>For each binary variable, model outputs were compared to the ground truth to derive the counts of true positives, false positives, false negatives, and true negatives. On the basis of these, accuracy, precision, recall (sensitivity), and <italic>F</italic>1-score were calculated primarily for binary variables, where such metrics are meaningful.</p><p>For conditional text and categorical or numerical variables, performance was mainly summarized using accuracy and corresponding 95% CIs computed via the Wilson method for binomial proportions. 
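</p><p>As an illustrative sketch only (the helper functions, names, and example counts below are assumptions for demonstration and do not reproduce the original analysis scripts of this study), the per-variable metrics and Wilson intervals described above can be computed as follows:</p><preformat preformat-type="code">
# Illustrative Python sketch (assumed helper names, not the study's analysis code):
# confusion-matrix metrics for a binary variable and a Wilson 95% CI for a proportion.
from math import sqrt


def wilson_ci(successes, n, z=1.96):
    """95% Wilson score interval for a binomial proportion."""
    p = successes / n
    denom = 1 + z**2 / n
    center = (p + z**2 / (2 * n)) / denom
    half = (z / denom) * sqrt(p * (1 - p) / n + z**2 / (4 * n**2))
    return center - half, center + half


def binary_metrics(tp, fp, fn, tn):
    """Accuracy, precision, recall, and F1-score from confusion counts."""
    total = tp + fp + fn + tn
    accuracy = (tp + tn) / total
    precision = tp / (tp + fp) if (tp + fp) else None
    recall = tp / (tp + fn) if (tp + fn) else None
    f1 = (2 * precision * recall / (precision + recall)
          if precision and recall else None)
    return accuracy, precision, recall, f1


# Example: "new postoperative deficits" before refinement (TP=5, FP=5, FN=0, TN=0),
# which reproduces the reported accuracy 0.50, precision 0.50, recall 1.0, F1 0.67,
# and a Wilson 95% CI of roughly 0.237 to 0.763 for 5 correct cases out of 10.
print(binary_metrics(tp=5, fp=5, fn=0, tn=0))
print(wilson_ci(successes=5, n=10))
</preformat><p>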
Additionally, &#x0394; accuracy (postrefinement and prerefinement) was calculated to quantify improvement following prompt refinement.</p><p>To explore potential differences in accuracy across variable types&#x2014;binary, conditional text, and categorical or numerical&#x2014;a Kruskal-Wallis test was applied. Given the small sample size and exploratory nature of this proof-of-concept analysis, no post hoc pairwise testing or correction for multiple comparisons was performed.</p><p>All analyses were performed using SPSS Statistics (version 27.0; IBM Corp) and R software (version 4.3.2; R Foundation for Statistical Computing).</p></sec><sec id="s2-8"><title>Ethical Considerations</title><p>This research was conducted in compliance with the Code of Ethics of the World Medical Association (Declaration of Helsinki) for experiments involving human subjects. The local ethics committee of the University of Freiburg (23&#x2010;1393-S1-retro; approval date: October 31, 2023) approved the waiver of informed consent due to the retrospective design, full anonymization of patient identifiers, and minimal-risk nature of the study, consistent with institutional and national regulations. As this was a retrospective analysis of fully anonymized clinical data, no participants were contacted and no compensation was provided.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>Ten consecutive patients diagnosed with intracranial acoustic neuromas who underwent surgical treatment at the Department of Neurosurgery at the University Hospital of Freiburg between August and December 2023 were included in this study. All screened patients met the inclusion criteria and were analyzed. GPT-4o generated an Excel table with structured data based on the zero-shot prompt described in the Methods section (<xref ref-type="fig" rid="figure4">Figure 4</xref>).</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Part of the GPT-4o&#x2013;generated table with structured data extracted from patient reports. 
These data were processed from unstructured medical records, while personal information was omitted to ensure compliance with ethical standards and protect patient privacy.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e77114_fig04.png"/></fig></sec><sec id="s3-2"><title>Extraction of Structured Data</title><p>Regarding the collection of structured data that does not require text interpretation, such as date of birth, patient ID, date of surgery, histopathological diagnosis, and World Health Organization grade, and considering the physicians&#x2019; opinion as the ground truth, the accuracy rate was 100% (<xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Accuracy and 95% CIs (Wilson) for structured categorical variables.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variable</td><td align="left" valign="bottom">Correct, n</td><td align="left" valign="bottom">Incorrect, n</td><td align="left" valign="bottom">Accuracy<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> (%; 95% CI<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>)</td></tr></thead><tbody><tr><td align="left" valign="top">Patient ID</td><td align="left" valign="top">10</td><td align="left" valign="top">0</td><td align="left" valign="top">100.0 (72.2&#x2010;100.0)</td></tr><tr><td align="left" valign="top">Date of birth</td><td align="left" valign="top">10</td><td align="left" valign="top">0</td><td align="left" valign="top">100.0 (72.2&#x2010;100.0)</td></tr><tr><td align="left" valign="top">Date of surgery</td><td align="left" valign="top">10</td><td align="left" valign="top">0</td><td align="left" valign="top">100.0 (72.2&#x2010;100.0)</td></tr><tr><td align="left" valign="top">Histopathological diagnosis</td><td align="left" valign="top">10</td><td align="left" valign="top">0</td><td align="left" valign="top">100.0 (72.2&#x2010;100.0)</td></tr><tr><td align="left" valign="top">WHO<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> grade</td><td align="left" valign="top">10</td><td align="left" valign="top">0</td><td align="left" valign="top">100.0 (72.2&#x2010;100.0)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>All fields were extracted with perfect accuracy (100%) across 10 cases. </p></fn><fn id="table1fn2"><p><sup>b</sup>CIs reflect the limited sample size rather than model variability.</p></fn><fn id="table1fn3"><p><sup>c</sup>WHO: World Health Organization.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Extraction and Processing of Unstructured Data</title><p>In the zero-shot prompt, the collection and processing of unstructured data into structured data provided heterogeneous results. 
The following variables presented a 100% accuracy rate when compared to the physicians&#x2019; evaluation: symptoms at presentation (yes or no), which symptoms, symptoms onset to surgery (in months), permanent deficits (yes or no), extent of resection (total, subtotal, and biopsy), and new symptoms at 3-month follow-up (<xref ref-type="table" rid="table2">Table 2</xref>).</p><p>In contrast, variables such as &#x201C;intraoperative complications (yes or no),&#x201D; &#x201C;if yes, which complications,&#x201D; &#x201C;new postoperative deficits,&#x201D; and &#x201C;if yes, which deficits&#x201D; achieved an accuracy rate as low as 50% in the zero-shot prompt. However, when the model was refined through additional prompt adjustments, as described in the Methods section, the accuracy rate improved to 90% to 100%, as presented in <xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Accuracy and 95% CIs (Wilson) for binary, categorical, and conditional text variables requiring interpretative capabilities.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variable<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="bottom">Correct/total, n</td><td align="left" valign="bottom">Accuracy (%; 95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">Symptoms at presentation (yes or no)</td><td align="left" valign="top">10/10</td><td align="left" valign="top">100 (72.2-100.0)</td></tr><tr><td align="left" valign="top">Which symptom?</td><td align="left" valign="top">10/10</td><td align="left" valign="top">100 (72.2-100.0)</td></tr><tr><td align="left" valign="top">Intraoperative complication (yes or no)</td><td align="left" valign="top">5/10</td><td align="left" valign="top">50 (23.7-76.3)</td></tr><tr><td align="left" valign="top">Which complication?</td><td align="left" valign="top">5/10</td><td align="left" valign="top">50 (23.7-76.3)</td></tr><tr><td align="left" valign="top">New postoperative deficits (yes or no)</td><td align="left" valign="top">5/10</td><td align="left" valign="top">50 (23.7-76.3)</td></tr><tr><td align="left" valign="top">Which deficit?</td><td align="left" valign="top">5/10</td><td align="left" valign="top">50 (23.7-76.3)</td></tr><tr><td align="left" valign="top">Was the deficit permanent (yes or no)</td><td align="left" valign="top">10/10</td><td align="left" valign="top">100 (72.2-100.0)</td></tr><tr><td align="left" valign="top">Extent of resection (3-month follow-up)</td><td align="left" valign="top">10/10</td><td align="left" valign="top">100 (72.2-100.0)</td></tr><tr><td align="left" valign="top">New symptoms at 3-month follow-up (yes or no)</td><td align="left" valign="top">10/10</td><td align="left" valign="top">100 (72.2-100.0)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Variables are grouped with their corresponding descriptive fields (&#x201C;Which&#x2026;&#x201D;).</p></fn></table-wrap-foot></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Case-level model outputs for intraoperative complication before and after prompt refinement<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Patient</td><td align="left" valign="bottom" colspan="4">Model performance before 
prompt refinement</td><td align="left" valign="bottom" colspan="4">Model performance after prompt refinement</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Intraoperative complication</td><td align="left" valign="top">Assessment</td><td align="left" valign="top">Which complication</td><td align="left" valign="top">Assessment</td><td align="left" valign="top">Intraoperative complication</td><td align="left" valign="top">Assessment</td><td align="left" valign="top">Which complication</td><td align="left" valign="top">Assessment</td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">Facial palsy (HB<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> II)</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">Facial palsy (HB II)</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">Facial palsy (HB V)</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">Facial palsy (HB IV)</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">Facial palsy (HB IV)</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">8</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td><td align="left" valign="top">No</td><td 
align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">9</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">10</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Each variable (&#x201C;intraoperative complication&#x201D; and &#x201C;which complication&#x201D;) was independently assessed as correct or incorrect against the ground truth. The table highlights corrections achieved through prompt refinement.</p></fn><fn id="table3fn2"><p><sup>b</sup>HB: House-Brackmann.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Case-level model outputs for intraoperative complication before and after prompt refinement<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="4">Model performance before prompt refinement</td><td align="left" valign="bottom" colspan="4">Model performance after prompt refinement</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">New postoperative deficit</td><td align="left" valign="top">Assessment</td><td align="left" valign="top">Which deficit</td><td align="left" valign="top">Assessment</td><td align="left" valign="top">New postoperative deficit</td><td align="left" valign="top">Assessment</td><td align="left" valign="top">Which deficit</td><td align="left" valign="top">Assessment</td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">Mild dizziness and mild hearing loss (right)</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Mild facial paresis and left-sided hearing loss</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Mild facial paresis and left-sided hearing loss</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Left deafness, dizziness, and mild facial paresis (temporary)</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Facial paresis (temporary) and left deafness</td><td align="left" 
valign="top">Correct</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">Mild right facial hypoesthesia and tinnitus</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">Facial paresis (grade V), right facial hypoesthesia, and right deafness</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">Facial paresis (grade IV), chin hypoesthesia, and left deafness</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">No</td><td align="left" valign="top">Correct</td><td align="left" valign="top">None</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Facial paresis (grade IV) and left deafness</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Facial paresis (grade IV) and left deafness</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">8</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Facial paresis (grade V) and left deafness</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Facial paresis (grade V) and left deafness</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">9</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Mild facial paresis (right)</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Mild facial paresis (right)</td><td align="left" valign="top">Correct</td></tr><tr><td align="left" valign="top">10</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">Mild facial paresis (HB<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup> II)</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top">Mild facial paresis (HB II)</td><td align="left" valign="top">Incorrect</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Each variable (&#x201C;New Postoperative Deficits&#x201D; and &#x201C;Which Deficit&#x201D;) was independently assessed as correct or incorrect against the ground truth. 
The table highlights corrections achieved through prompt refinement.</p></fn><fn id="table4fn2"><p><sup>b</sup>HB: House-Brackmann.</p></fn></table-wrap-foot></table-wrap><p>Model performance across interpretative binary variables is summarized in <xref ref-type="table" rid="table5">Table 5</xref>. Accuracy ranged from 50% to 100%, reflecting variable complexity in contextual interpretation. While fields such as <italic>was the deficit permanent?</italic> and <italic>new symptoms at 3-month follow-up</italic> achieved perfect accuracy, others&#x2014;particularly <italic>intraoperative complication</italic> and <italic>new postoperative deficits</italic>&#x2014;showed lower precision due to false-positive predictions.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Model performance across interpretative binary variables.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variable</td><td align="left" valign="bottom">TP<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="bottom">FP<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="left" valign="bottom">FN<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="bottom">TN<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" valign="bottom">Accuracy<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup> (%)</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic>1-score</td></tr></thead><tbody><tr><td align="left" valign="top">Symptoms at presentation (yes or no)</td><td align="left" valign="top">10</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">100</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">Intraoperative complication (yes or no)</td><td align="left" valign="top">0</td><td align="left" valign="top">5</td><td align="left" valign="top">0</td><td align="left" valign="top">5</td><td align="left" valign="top">50</td><td align="left" valign="top">0</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table5fn6">f</xref></sup></td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">New postoperative deficits? (yes or no)</td><td align="left" valign="top">5</td><td align="left" valign="top">5</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">50</td><td align="left" valign="top">0.5</td><td align="left" valign="top">1</td><td align="left" valign="top">0.67</td></tr><tr><td align="left" valign="top">Was the deficit permanent? 
(yes or no)</td><td align="left" valign="top">1</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">9</td><td align="left" valign="top">100</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">New symptoms at 3-month follow-up?</td><td align="left" valign="top">1</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">9</td><td align="left" valign="top">100</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>TP: true positive.</p></fn><fn id="table5fn2"><p><sup>b</sup>FP: false positive.</p></fn><fn id="table5fn3"><p><sup>c</sup>FN: false negative.</p></fn><fn id="table5fn4"><p><sup>d</sup>TN: true negative.</p></fn><fn id="table5fn5"><p><sup>e</sup>Accuracy ranged from 50% to 100%, with lower precision observed in fields requiring finer contextual understanding, such as intraoperative and postoperative deficits.</p></fn><fn id="table5fn6"><p><sup>f</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p>When comparing mean accuracies across variable types using the Kruskal-Wallis test, no statistically significant difference was observed (H=2.79; <italic>P</italic>=.25). Nevertheless, a consistent trend toward higher accuracy for structured categorical and numerical variables (eg, <italic>patient ID, date of birth, World Health Organization grade, extent of resection,</italic> and <italic>Karnofsky scores</italic>; mean 97.5%, SD 4.6%) and lower accuracy for conditional text variables (eg, <italic>which symptom? which complication?</italic> and <italic>which deficit?</italic>; mean 66.7%, SD 28.9%) supports the descriptive results. Binary variables (eg, <italic>symptoms at presentation, intraoperative complication, new postoperative deficits, was the deficit permanent,</italic> and <italic>new symptoms at 3-month follow-up</italic>) showed intermediate performance, with a mean accuracy of 80.0% (SD 27.4%).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>The findings of this study underscore the potential of GPT-4o and similar LLMs to revolutionize health care data mining. By achieving high accuracy rates for the extraction of structured information from unstructured medical records, GPT-4o demonstrated its utility in reducing the time and effort required for data analysis. Parameters such as symptom onset, tumor grade, and extent of resection were extracted with impressive accuracy, aligning with the work of Lee et al [<xref ref-type="bibr" rid="ref7">7</xref>], who highlighted the power of NLP tools in parsing clinical trial data efficiently. However, these findings should be interpreted within the limitations of a small, single-institution feasibility study. While they demonstrate the technical viability of GPT-4o for structured data extraction, generalization to other clinical domains requires larger, multicenter validation.</p><p>However, not all data categories achieved uniform success. Variables such as intraoperative complications and new postoperative deficits presented challenges in zero-shot scenarios, emphasizing the need for prompt refinement and iterative learning to improve model performance.
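</p><p>Purely as a hypothetical illustration (the analyses in this study were run through the ChatGPT web interface rather than programmatically, and the variable names and helper function below are assumptions), such targeted refinements can be expressed as explicit variable definitions that replace the ambiguous baseline wording in the extraction prompt:</p><preformat preformat-type="code">
# Hypothetical sketch of targeted prompt refinement: the two variables that
# performed poorly at baseline receive the explicit definitions described in
# the Methods section, while all other column definitions remain unchanged.
BASELINE_DEFINITIONS = {
    "intraoperative_complication": "Intraoperative complication (yes or no)",
    "new_postoperative_deficit": "Postoperative deficits (yes or no)",
}

REFINED_DEFINITIONS = {
    "intraoperative_complication": (
        "Intraoperative damage to major vascular or neural structures (yes or no)"
    ),
    "new_postoperative_deficit": (
        "Deficits developed after surgery, excluding preexisting ones (yes or no)"
    ),
}


def build_prompt(definitions):
    """Render variable definitions into a single structured-extraction prompt."""
    lines = ["Extract the following variables from the attached reports:"]
    for name, definition in definitions.items():
        lines.append(f"- {name}: {definition}")
    return "\n".join(lines)


print(build_prompt(REFINED_DEFINITIONS))
</preformat><p>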
Similar observations have been reported by Adamson et al [<xref ref-type="bibr" rid="ref8">8</xref>], who noted that model tuning significantly enhances the precision of clinical data extraction from electronic health records.</p><p>Beyond these technical aspects, the integration of LLMs into clinical workflows presents transformative opportunities. For instance, these models can support operational planning by predicting the allocation of resources such as surgical supplies and human labor based on historical data trends. This capability, as highlighted by Obermeyer and Emanuel [<xref ref-type="bibr" rid="ref9">9</xref>], could reduce inefficiencies and improve overall patient outcomes. Moreover, automated coding of diagnoses and procedures, as discussed by Dong et al [<xref ref-type="bibr" rid="ref10">10</xref>], could streamline billing processes, thereby alleviating administrative burdens for health care institutions. It should be emphasized that this study did not assess workflow integration, time efficiency, usability, or real-time clinical applicability. Any references to potential operational or efficiency benefits are therefore speculative and reflect broader opportunities described in the literature rather than outcomes directly evaluated in this analysis.</p><p>Despite these promising applications, ethical and practical challenges must be addressed. Ensuring data privacy and security is paramount, especially when handling sensitive patient information. Adherence to privacy regulations such as the Health Insurance Portability and Accountability Act and the GDPR is critical [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. Furthermore, the &#x201C;black box&#x201D; nature of LLMs raises concerns about their interpretability and trustworthiness in clinical settings. Transparent AI frameworks, as advocated by Arrieta et al [<xref ref-type="bibr" rid="ref13">13</xref>], could mitigate these issues by making model predictions more explainable. In this study, model interpretability was ensured by manually auditing GPT-4o outputs and verifying their correspondence with the original clinical context. Regarding privacy, all medical reports were manually anonymized prior to being uploaded to the GPT-4o interface. The analyses were conducted using the official ChatGPT web platform, with no identifiable patient information included at any stage. No data were stored, transmitted, or processed through third-party servers outside the OpenAI environment, ensuring full compliance with institutional and GDPR standards.</p><p>Recent scholarship has underscored the ethical and operational risks of deploying LLMs in health care. Ong et al [<xref ref-type="bibr" rid="ref14">14</xref>] highlighted how bias propagation, lack of transparency, and accountability gaps remain major barriers to safe clinical implementation. Similarly, Elbattah et al [<xref ref-type="bibr" rid="ref15">15</xref>] emphasized the need for transparent auditing mechanisms and robust validation pipelines to mitigate privacy and safety risks in medical AI applications. In line with these perspectives, this study reinforces the necessity of rigorous data governance, responsible model deployment, and continuous ethical evaluation when integrating LLMs into clinical research workflows.</p><p>The role of NLP in augmenting documentation accuracy and quality has been demonstrated in neurosurgical contexts. 
For example, Sastry et al [<xref ref-type="bibr" rid="ref16">16</xref>] revealed that NLP models improved comorbidity documentation in inpatient admissions, illustrating their potential to enhance clinical record keeping. Similarly, Biswas et al [<xref ref-type="bibr" rid="ref17">17</xref>] explored the application of NLP in automating the detection of intraoperative elements during lumbar spine surgery, further emphasizing the model&#x2019;s utility in surgical workflows. In addition, prior work has shown that LLMs can outperform manual abstraction in specific domains and significantly reduce time and labor requirements [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. Compared with manual chart abstraction, which is labor intensive and time consuming, GPT-4o enabled near-instantaneous data extraction. Rule-based NLP systems, although established, require extensive customization and perform inconsistently across heterogeneous text. GPT-4o offers greater flexibility but introduces new challenges related to prompt sensitivity, hallucination risk, and limited explainability.</p><p>Another pressing concern is the potential for bias in AI models. As noted by Obermeyer et al [<xref ref-type="bibr" rid="ref20">20</xref>], algorithms trained on biased datasets can perpetuate health care disparities, underscoring the importance of curating diverse and representative training data. Additionally, regulatory hurdles must be navigated carefully to ensure that these tools are deployed responsibly and ethically, as discussed by Char et al [<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>Looking ahead, the potential applications of LLMs in medicine are vast. Real-time decision support during surgeries, enabled by the synthesis of evidence from medical literature and intraoperative data, could significantly enhance surgical outcomes. Hashimoto et al [<xref ref-type="bibr" rid="ref22">22</xref>] have already explored the promises and challenges of AI in surgical contexts, laying the groundwork for future innovations. Combining LLMs with complementary AI technologies, such as computer vision for imaging analysis, further expands their utility, as emphasized by Topol [<xref ref-type="bibr" rid="ref23">23</xref>].</p><p>While GPT-4o and similar models represent a significant advancement in health care data mining, their practical implementation requires careful consideration of ethical, technical, and regulatory challenges. LLMs have the potential to not only transform data-driven research but also enhance the quality and efficiency of patient care.</p><p>This study highlights the transformative potential of LLMs such as GPT-4o in health care, particularly for mining and structuring complex medical data. The demonstrated accuracy in data extraction underscores their utility in reducing manual workloads and optimizing research and clinical processes. However, challenges such as data security, bias, and regulatory compliance remain significant hurdles. By navigating these obstacles thoughtfully, LLMs could redefine how health care systems manage and use data, ultimately improving patient outcomes and operational efficiencies. 
As these technologies continue to evolve, their integration must be guided by ethical frameworks and multidisciplinary collaboration to ensure their full potential is realized responsibly.</p></sec><sec id="s4-2"><title>Limitations</title><p>The small sample size represents a central limitation of this study and results in wide CIs around several accuracy estimates. Consequently, the findings should be viewed as preliminary and interpreted with caution. Although some variables reached perfect accuracy, these were predominantly straightforward structured fields requiring minimal interpretative processing. As such, these results reflect a narrow and highly controlled context and should not be extrapolated as evidence that GPT-4o performs reliably across more complex or heterogeneous clinical data extraction tasks.</p><p>Furthermore, the analysis was conducted at a single institution, and all medical reports originated from one neurosurgical service and were written in German. This linguistic and institutional homogeneity may have influenced model performance and limits the generalizability of the results to other clinical environments, documentation styles, and languages. Given the known variability in LLM performance across languages and reporting conventions, external validation is essential.</p></sec><sec id="s4-3"><title>Conclusions</title><p>Future studies involving larger, multi-institutional, and multilingual datasets, as well as more diverse clinical variables, will be necessary to assess reproducibility and determine whether these findings extend to broader clinical or operational contexts.</p></sec></sec></body><back><ack><p>No external data scientists, software engineers, or third-party experts were involved in this study. All prompt design, data processing, and analysis were performed by the authors, who have prior experience with clinical data workflows and research in neurosurgery. The authors declare the use of generative artificial intelligence (GAI) in the writing process. According to the GAI Delegation Taxonomy (2025), the following tasks were delegated to GAI tools under full human supervision: proofreading and editing. The GAI tools used were ChatGPT-4o and ChatGPT-5. Responsibility for the final manuscript lies entirely with the authors.
GAI tools are not listed as authors and do not bear responsibility for the final outcomes.</p></ack><notes><sec><title>Funding</title><p>No external financial support or grants were received from any public, commercial, or not-for-profit entities for the research, authorship, or publication of this article.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">GDPR</term><def><p>General Data Protection Regulation</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">NLP</term><def><p>natural language processing</p></def></def-item></def-list></glossary></back></article>