<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e77707</article-id><article-id pub-id-type="doi">10.2196/77707</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Information Extraction of Doctoral Theses Using Two Different Large Language Models vs Health Services Researchers: Development and Usability Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Cittadino</surname><given-names>Jonas</given-names></name><degrees>MD, Dr med</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Traulsen</surname><given-names>Pia</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Schmahl</surname><given-names>Teresa</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wewetzer</surname><given-names>Larisa</given-names></name><degrees>MSc, Dr rer 
nat</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cummerow</surname><given-names>Julia</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Fl&#x00E4;gel</surname><given-names>Kristina</given-names></name><degrees>MD, Dr med</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Strumann</surname><given-names>Christoph</given-names></name><degrees>Dipl-Volksw, Dr sc pol</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Goetz</surname><given-names>Katja</given-names></name><degrees>Dipl-Soz, Prof Dr</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Steinh&#x00E4;user</surname><given-names>Jost</given-names></name><degrees>Prof Dr med</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Institute of Family Medicine, University Hospital Schleswig-Holstein</institution><addr-line>Maria-Goeppert-Stra&#x00DF;e 9a</addr-line><addr-line>L&#x00FC;beck</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Zhang</surname><given-names>Chengzhi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Zhang</surname><given-names>Zizhong</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Jonas Cittadino, MD, Dr med, Institute of Family Medicine, University Hospital Schleswig-Holstein, Maria-Goeppert-Stra&#x00DF;e 9a, L&#x00FC;beck, 23538, Germany, 49 451 
3101 ext 8001; <email>jonas.cittadino@uni-luebeck.de</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>10</day><month>12</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e77707</elocation-id><history><date date-type="received"><day>18</day><month>05</month><year>2025</year></date><date date-type="accepted"><day>23</day><month>10</month><year>2025</year></date></history><copyright-statement>&#x00A9; Jonas Cittadino, Pia Traulsen, Teresa Schmahl, Larisa Wewetzer, Julia Cummerow, Kristina Fl&#x00E4;gel, Christoph Strumann, Katja Goetz, Jost Steinh&#x00E4;user. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 10.12.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e77707"/><abstract><sec><title>Background</title><p>The Archive of German-Language General Practice (ADAM) stores about 500 paper-based doctoral theses published from 1965 to today. 
Although they have been grouped in different categories, no deeper systematic process of information extraction (IE) has been performed yet. Recently developed large language models (LLMs) like ChatGPT have been attributed the potential to help in the IE of medical documents. However, there are concerns about LLM hallucinations. Furthermore, there have not been reports regarding their usage in nonrecent doctoral theses yet.</p></sec><sec><title>Objective</title><p>The aim of this study is to analyze if LLMs can help to extract information from doctoral theses by using GPT-4o and Gemini-1.5-Flash for paper-based doctoral theses in ADAM.</p></sec><sec sec-type="methods"><title>Methods</title><p>We randomly selected 10 doctoral theses published between 1965 and 2022. After preprocessing, we used two different LLM pipelines, using models by OpenAI and Google. Pipelines were used to extract dissertation characteristics and generate uniform abstracts. Furthermore, one pooled human-generated abstract was written for comparison. Additionally, blinded raters were asked to evaluate LLM-generated abstracts in comparison to the human-generated ones. Bidirectional encoder representations from transformers scores were calculated as the evaluation metric.</p></sec><sec sec-type="results"><title>Results</title><p>Relevant dissertation characteristics and keywords could be extracted for all theses (n=10): institute name and location, thesis title, author name(s), and publication year. For all except one doctoral thesis, an abstract could be generated using GPT-4o, while Gemini-1.5-Flash provided abstracts in all cases (n=10). The modality of abstract generation showed no influence in raters&#x2019; evaluation using the nonparametric Kruskal-Wallis test for independent groups (<italic>P</italic>=.44). The creation of LLM-generated abstracts was estimated to be 24-36 times faster than creation by humans. 
Evaluation metrics showed moderate-to-high semantic similarity (mean bidirectional encoder representations from transformers <italic>F</italic><sub>1</sub>-score, GPT-4o: 0.72 and Gemini: 0.71). Translation from German into English did not result in a loss of information (n=10).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>An accumulating body of unpublished doctoral theses makes it difficult to extract relevant evidence. Recent advances in LLMs like ChatGPT have raised expectations in text mining, but they have not yet been used in the IE of &#x201C;historic&#x201D; medical documents. This feasibility study suggests that both models (GPT-4o and Gemini-1.5-Flash) helped to accurately simplify and condense doctoral theses into relevant information, while LLM-generated abstracts were perceived as similar to human-generated ones, were semantically similar, and took about 30 times less time to create. This pilot study demonstrates the feasibility of a regular office-scanning workflow and use of general-purpose LLMs to extract relevant information and produce accurate abstracts from ADAM doctoral theses. Taken together, this information could help researchers to better search the family medicine scientific literature over the last 60 years, helping to develop current research questions.</p></sec></abstract><kwd-group><kwd>ChatGPT</kwd><kwd>family medicine</kwd><kwd>doctoral thesis</kwd><kwd>GPT-4o</kwd><kwd>Gemini</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>By their nature, archives contain a large amount of information. Thirty years ago, the German Society of General Practitioners and Family Physicians (DEGAM) began to centrally store all doctoral theses of the specialty. This collection is now part of the Archive of German-Language General Practice (ADAM) [<xref ref-type="bibr" rid="ref1">1</xref>]. 
ADAM was primarily established to gather historical documents from the beginning of family medicine as a specialty in German-speaking countries (Germany, Austria, and Switzerland) and now stores a large number of documents of different types and formats, including many documents written with a typewriter [<xref ref-type="bibr" rid="ref2">2</xref>]. Lately, there have been efforts to categorize 802 available theses [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. During this process, one finding was that it is hard to find detailed information, especially for theses from before 2000. This may be because an estimated 50% of all dissertations are not published in a journal but rather are paper-based only [<xref ref-type="bibr" rid="ref4">4</xref>]. Within ADAM, this is true for 553 dissertations. Therefore, as yet, there is no process for extracting relevant information, creating uniform abstracts, and making them available to the public and other researchers.</p><p>With the release of large language models (LLMs) such as ChatGPT by OpenAI and Gemini by Google, hopes have been raised regarding their use for medicine and medical documents [<xref ref-type="bibr" rid="ref5">5</xref>]. The reduction of hallucination, the term for information made up by an LLM, is crucial for medical scenarios and has been the subject of a recent project [<xref ref-type="bibr" rid="ref6">6</xref>]. Although previous work on LLMs has shown effectiveness in terms of information extraction (IE) from scientific texts [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>], to our knowledge, there has not yet been a reported use of LLMs regarding IE from doctoral theses. 
Hence, the aim of this feasibility study was to analyze if LLMs and natural language processing can extract relevant information and generate uniform abstracts of doctoral theses from the field of family medicine in ADAM and how these artificial intelligence&#x2013;generated abstracts are perceived by scientists.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Dissertation Characteristics</title><p>For this analysis, we randomly selected 10 dissertations from ADAM, ranging from the earliest dissertation (1965) to the most recent one (2022). All dissertations were in PDF format, either made available by the original author or generated by study personnel who scanned paper-based dissertations using a regular multifunctional office printer. When making our selection, we included different time periods of submission and both types of digitization in equal numbers. Therefore, we focused on dissertations from different time periods and different methods of digitization. Although the digitization of documents is a current fundamental question in archive research, only a few standards exist [<xref ref-type="bibr" rid="ref9">9</xref>].</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>Since all selected dissertations were publicly accessible through ADAM, a formal ethics review was deemed unnecessary. However, informed consent was obtained from all copyright holders of the selected dissertations. According to our institutional practice (University Hospital Schleswig-Holstein), projects based solely on publicly available documents with author consent do not require review by the ethics committee. 
This analysis followed the transparent reporting of a multivariable model for individual prognosis or diagnosis (TRIPOD)+LLM guidelines (see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) [<xref ref-type="bibr" rid="ref10">10</xref>].</p></sec><sec id="s2-3"><title>Analysis</title><p>For each PDF file, we created an analytical pipeline including the following steps: preprocessing, extracting information from the title page, creating a uniform abstract, and translating newly generated abstracts into English. We evaluated two LLMs in this study: GPT-4o [<xref ref-type="bibr" rid="ref11">11</xref>] by OpenAI and Gemini-1.5-Flash [<xref ref-type="bibr" rid="ref12">12</xref>] by Google.</p><sec id="s2-3-1"><title>GPT-4o</title><p>Preprocessing of all files in the GPT-4o pipeline was done by bringing all files into a vertical layout format before they were rotated and/or divided depending on text alignment (<xref ref-type="fig" rid="figure1">Figure 1</xref>). If the extracted text resulted in nonsense, we used optical character recognition (OCR) [<xref ref-type="bibr" rid="ref13">13</xref>] for extraction of all pages. Then, the content of each page was stored as a string in a dictionary.</p><p>For further analysis, we used LLMs by OpenAI, namely text-davinci-003, GPT-3.5-turbo, and GPT-4o [<xref ref-type="bibr" rid="ref11">11</xref>], using a temperature [<xref ref-type="bibr" rid="ref14">14</xref>] of 0.2 to reduce hallucination. All models are pretrained and transformer-based, meaning that the model supervised its learning itself and understands context [<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>We defined the first page containing a 4-digit number as the title page. 
Using only the title page of each file, publication characteristics were extracted by using the OpenAI model &#x201C;text-davinci-003&#x201D;: name of the institution, city of the institution, director of the institution, first and last name(s) of the author, title of the dissertation, origin of the author, and year of publication (<xref ref-type="fig" rid="figure2">Figure 2</xref>). For each dissertation, J Cittadino cross-checked if the information extracted by the model was correct.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Preprocessing of scanned dissertation title pages prior to analysis. Workflow of preprocessing steps applied to scanned dissertation title pages from German medical faculties. The process included rotation correction, text segmentation, and optical character recognition to prepare documents for automated metadata extraction.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e77707_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Automated extraction of dissertation metadata using GPT models. Example of structured metadata extraction from a dissertation title page (University of L&#x00FC;beck, Institute of Family Medicine, 2022) using GPT models. The extracted fields include institute name, location, name of director, title, author information, and year of publication.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e77707_fig02.png"/></fig></sec><sec id="s2-3-2"><title>Gemini</title><p>In the Gemini pipeline, we used Google Cloud Platform for preprocessing and text generation. Using Workflows, Cloud Function, and Gemini-1.5-Flash with a temperature of 0.2, we designed the following pathway. First, DocumentAI was used to extract the text from all PDFs and store it in a JSON file [<xref ref-type="bibr" rid="ref16">16</xref>]. 
Only using the first page and Gemini-1.5-Flash, we then extracted the same information from each PDF as with GPT-4o: name of the institution, city of the institution, director of the institution, first and last name(s) of the author, title of the dissertation, origin of the author, and year of publication.</p></sec><sec id="s2-3-3"><title>Common Pathway</title><p>Next, for each LLM, we generated a string containing the whole text of the dissertation after the table of contents. The page number after the end of the table of contents was extracted by looping through snippets of the first 10 pages of each document using text-davinci-003 or Gemini-1.5-Flash.</p><p>Continuing the analytical pipeline, we searched the generated string for a summary section of the whole thesis. If found, the extracted paragraph was stored as <italic>abstract_whole</italic>. If not found, the text was scanned for paragraphs containing the subsections Aim of the study, Methods, Results, Discussion, and Conclusion as headings, which were extracted and summarized using text-davinci-003 or Gemini-1.5-Flash if found.</p><p>If a summary paragraph was identified, we shortened it to the first 4000 characters and used GPT-4o and Gemini-1.5-Flash to generate uniform abstracts containing the subheadings Aim of the study, Methods, Results, and Discussion. 
For both models, we used the same prompt and hyperparameters:</p><list list-type="bullet"><list-item><p>Prompt: &#x201C;Summarize the following summary section of a medical doctoral thesis into sections with the headings Objective, Methods, Results, and Discussion, each in no more than 2&#x2010;3 concise sentences: [summary section]&#x201D; (German: &#x201C;<italic>Fasse die folgende Zusammenfassung einer medizinischen Doktorarbeit in Abschnitte mit den &#x00DC;berschriften Zielsetzung, Methodik, Ergebnisse und Diskussion knapp in jeweils maximal 2-3 S&#x00E4;tzen zusammen: [Zusammenfassung].</italic>&#x201D;)</p></list-item><list-item><p>Temperature: 0.2.</p></list-item><list-item><p>Maximum number of tokens: 950.</p></list-item></list><p>The time needed for completion of the script as well as usage costs of the OpenAI LLMs were assessed.</p></sec><sec id="s2-3-4"><title>Abstract Comparison</title><p>For each dissertation, researchers (PT, TS, J Cummerow, and LW) independently drafted an abstract and then agreed on a pooled version by discussion. The result was a third version of the abstract, this one human-generated. Afterward, the researchers estimated their time needed for drafting and agreeing on a pooled version. Then, three senior researchers (KG, KF, and CS) were asked to evaluate all three abstract versions in different orders, not knowing which two were LLM-generated and which one was human-generated. For each abstract version of each dissertation, we collected two ratings using German school marks between 1 (=best) and 6 (=worst). We then analyzed these ratings using descriptive statistics and a robust analysis procedure, the nonparametric Kruskal-Wallis test for independent groups [<xref ref-type="bibr" rid="ref17">17</xref>]. Results were visualized in a boxplot.</p></sec><sec id="s2-3-5"><title>Text Generation Evaluation</title><p>Words were counted for each generated abstract. 
To automatically evaluate the performance of LLM-generated abstracts, we used BERTScore (bidirectional encoder representations from transformers) [<xref ref-type="bibr" rid="ref18">18</xref>] for contextual embeddings. This method uses the pretrained language model BERT to compare the similarity between tokens by calculating precision, recall, and the harmonic mean <italic>F</italic><sub>1</sub> [<xref ref-type="bibr" rid="ref19">19</xref>]. All analysis processes were done using Python (version 3.10.1; Python Software Foundation) [<xref ref-type="bibr" rid="ref20">20</xref>]. The code is available upon request.</p></sec><sec id="s2-3-6"><title>Translation Evaluation</title><p>Finally, we used GPT-4o and Gemini-1.5-Flash to translate the newly generated abstracts into English. Following World Health Organization guidelines for translations [<xref ref-type="bibr" rid="ref21">21</xref>] to make sure that the translation process did not result in the loss of information, we retranslated the generated English translation back into German again using the same LLM. Then, generated German translations were compared to the originally generated abstract (by J Cittadino).</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>Paper-based documentation was more common (n=7) and often showed apparently poor digital quality after scanning. However, using the GPT-4o pipeline, we were able to extract text and information from all dissertations. 
Even so, we observed some common mistakes&#x2014;for example, the German letter &#x201C;&#x00FC;&#x201D; was often transformed to &#x201C;ii.&#x201D; Interestingly, while generating an abstract, both GPT models could still correctly understand the content and remove spelling mistakes.</p></sec><sec id="s3-2"><title>Thesis Characteristics</title><p>Information from the title page could be extracted for all theses, and the details were accurate except for two spelling mistakes (&#x201C;&#x00E9;&#x201D; instead of &#x201C;&#x00F6;&#x201D;). There was no difference between OpenAI models and Google models. Dissertations covered various topics of family medicine research completed between 1965 and 2022 (<xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>GPT-4o&#x2013;generated characteristics of the included dissertations.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Name of institute</td><td align="left" valign="bottom">Institute location</td><td align="left" valign="bottom">Director of institute</td><td align="left" valign="bottom">Title of doctoral thesis</td><td align="left" valign="bottom">First name(s) of author</td><td align="left" valign="bottom">Last name(s) of author</td><td align="left" valign="bottom">Origin of author</td><td align="left" valign="bottom">Year(s)</td></tr></thead><tbody><tr><td align="left" valign="top"><italic>Institut f&#x00FC;r Medizinische Statistik der Freien Universit&#x00E4;t Berlin</italic> [Institute of Medical Statistics, Free University of Berlin]</td><td align="left" valign="top">Berlin</td><td align="left" valign="top">Prof Dr med, Dr phil Karl Freudenberg</td><td align="left" valign="top"><italic>Beziehungen zwischen Einweisungsdiagnosen und klinischen Diagnosen</italic> [Relationships between admission diagnoses and clinical diagnoses]</td><td align="left" valign="top">Helmut</td><td 
align="left" valign="top">Pillau</td><td align="left" valign="top">Berlin</td><td align="left" valign="top">1965</td></tr><tr><td align="left" valign="top"><italic>Universit&#x00E4;t Ulm (MNH)</italic> [University of Ulm (MNH)]</td><td align="left" valign="top">Ulm</td><td align="left" valign="top">Priv Doz, Dr S Haussler</td><td align="left" valign="top"><italic>Entwicklung kassen&#x00E4;rztlicher Leistungen bei verschiedenen Arztgruppen im K. V. Bereich Nordw&#x00FC;rttemberg (1965 - 1969)</italic> [Development of services provided by contracted physicians in various medical groups in the North W&#x00FC;rttemberg Association of Statutory Health Insurance Physicians (1965 - 1969)]</td><td align="left" valign="top">Klaus</td><td align="left" valign="top">Besel</td><td align="left" valign="top">Vohringen/Iller</td><td align="left" valign="top">1965&#x2010;1969</td></tr><tr><td align="left" valign="top"><italic>Institut f&#x00FC;r medizinische Statistik und Dokumentation</italic> [Institute of Medical Statistics and Documentation]</td><td align="left" valign="top">Kiel</td><td align="left" valign="top">Prof Dr med G Griesser</td><td align="left" valign="top"><italic>Analyse einer Allgemeinpraxis</italic> [Analysis of a general practice]</td><td align="left" valign="top">Andreas</td><td align="left" valign="top">Kernbichler</td><td align="left" valign="top">Meldorf</td><td align="left" valign="top">1973</td></tr><tr><td align="left" valign="top"><italic>Albert-Ludwigs-Universit&#x00E4;t Freiburg im Breisgau</italic> [Albert Ludwigs University of Freiburg]</td><td align="left" valign="top">Freiburg im Breisgau</td><td align="left" valign="top">Honorarprofessor Dr med HH Schr&#x00F6;mbgens</td><td align="left" valign="top"><italic>Die Lehre der Allgemeinmedizin an den deutschen Hochschulen (1966 - 1978)</italic> [Teaching of general practice at German universities (1966 - 1978)]</td><td align="left" valign="top">Claudia</td><td align="left" 
valign="top">Keller-R&#x00F6;ll</td><td align="left" valign="top">Deggingen</td><td align="left" valign="top">1979</td></tr><tr><td align="left" valign="top"><italic>Medizinische Poliklinik der Universit&#x00E4;t M&#x00FC;nchen</italic> [Medical Outpatient Department, University of Munich]</td><td align="left" valign="top">M&#x00FC;nchen</td><td align="left" valign="top">Prof Dr N Z&#x00E9;llner</td><td align="left" valign="top"><italic>Untersuchung zur Compliance bei der Diagnostik und Therapie des arteriellen Hypertonus in einer Hausarztpraxis in Stadtrandlage</italic> [Study on compliance in the diagnosis and treatment of arterial hypertension in a suburban family practice]</td><td align="left" valign="top">Manfred</td><td align="left" valign="top">Lohnstein</td><td align="left" valign="top">Augsburg</td><td align="left" valign="top">1983</td></tr><tr><td align="left" valign="top"><italic>Institut f&#x00FC;r Allgemeinmedizin</italic> [Institute of General Practice]</td><td align="left" valign="top">Frankfurt AM Main</td><td align="left" valign="top">Prof Dr K Jork</td><td align="left" valign="top"><italic>Selbstmedikation bei &#x00E4;lteren Menschen und Studenten</italic> [Self-medication among older adults and university students]</td><td align="left" valign="top">Elke</td><td align="left" valign="top">Iburg</td><td align="left" valign="top">Hannover</td><td align="left" valign="top">1991</td></tr><tr><td align="left" valign="top"><italic>Philipps-Universit&#x00E4;t Marburg</italic> [Philipps University of Marburg]</td><td align="left" valign="top">Marburg</td><td align="left" valign="top">Frau Prof Dr med Erika Baum</td><td align="left" valign="top"><italic>Allgemeinmedizin in Gro&#x00DF;britannien und Deutschland: Die Auswirkungen verschiedener Verg&#x00FC;tungssysteme auf die Qualit&#x00E4;t pr&#x00E4;ventiver Versorgung</italic> [General practice in Great Britain and Germany: the effects of different remuneration systems on the quality of preventive 
care]</td><td align="left" valign="top">Norbert</td><td align="left" valign="top">Donner-Banzhoff</td><td align="left" valign="top">Viersen/Rheinland</td><td align="left" valign="top">1993</td></tr><tr><td align="left" valign="top"><italic>Zentrum der Gesundheitswissenschaften Institut f&#x00FC;r Allgemeinmedizin</italic> [Center for Health Sciences, Institute of General Practice]</td><td align="left" valign="top">Frankfurt AM Main</td><td align="left" valign="top">Prof Dr Ferdinand M Gerlach</td><td align="left" valign="top"><italic>M&#x00F6;glichkeiten durch Delegation haus&#x00E4;rztlicher Leistungen am Beispiel von Versorgungsassistentinnen in der Hausarztpraxis (Verah)</italic> [Opportunities through delegation of family physician tasks using the example of practice assistants in family medicine (VERAH)]</td><td align="left" valign="top">Karola</td><td align="left" valign="top">Mergenthal</td><td align="left" valign="top">B&#x00FC;dingen, Hessen</td><td align="left" valign="top">2016</td></tr><tr><td align="left" valign="top"><italic>Institut f&#x00FC;r Allgemeinmedizin der Universit&#x00E4;t zu L&#x00FC;beck</italic> [Institute of Family Medicine, University of L&#x00FC;beck]</td><td align="left" valign="top">L&#x00FC;beck</td><td align="left" valign="top">Prof Dr med Steinh&#x00E4;user</td><td align="left" valign="top"><italic>Nasa-Task Load Index - Ein Instrument, um sich der Komplexit&#x00E4;t von Beratungsanl&#x00E4;ssen in der Allgemeinmedizin zu n&#x00E4;hern</italic> [NASA Task Load Index: a tool for approaching the complexity of consultation situations in general practice]</td><td align="left" valign="top">Britta</td><td align="left" valign="top">Galler</td><td align="left" valign="top">Stade</td><td align="left" valign="top">2020</td></tr><tr><td align="left" valign="top"><italic>Institut f&#x00FC;r Allgemeinmedizin der Universit&#x00E4;t zu L&#x00FC;beck</italic> [Institute of Family Medicine, University of L&#x00FC;beck]</td><td align="left" 
valign="top">L&#x00FC;beck</td><td align="left" valign="top">Prof Dr med Jost Steinh&#x00E4;user</td><td align="left" valign="top"><italic>Patientenorientierte Versorgung im Niedergelassenen Bereich - Kooperation zwischen Haus&#x00E4;rzt*Innen und ambulanten Pflegediensten</italic> [Patient oriented care in the outpatient sector: cooperation between family physicians and ambulatory nursing services]</td><td align="left" valign="top">Juliane Sophie</td><td align="left" valign="top">Erdmann</td><td align="left" valign="top">Hannover</td><td align="left" valign="top">2022</td></tr></tbody></table></table-wrap></sec><sec id="s3-3"><title>Abstract Summary</title><p>Paragraphs containing a summary section of the whole dissertation could be found in 9 of 10 documents. In the case where no summary was found, a paragraph was extracted from the Discussion subsection and summarized using GPT-4o, while Gemini-1.5-Flash was able to provide a whole abstract. Thus, GPT-4o generated abstracts in 9 of 10 cases and Gemini-1.5-Flash in all 10 cases.</p><p>In the text extraction, we observed some common mistakes&#x2014;for example, the German letter &#x201C;&#x00FC;&#x201D; was often transformed to &#x201C;ii.&#x201D; The GPT model could still correctly understand the content and remove spelling mistakes while generating an abstract.</p><p>Running the script a second time resulted in the same extraction of all dissertation characteristics in 9 of 10 cases for GPT-4o and in all cases for Gemini-1.5-Flash. In one case, a different institute director was found, which was extracted as &#x201C;Korefferent&#x201D; and not &#x201C;Referent.&#x201D; Generated abstracts, however, differed in their wording while containing the same information.</p><p>Running the full script using the GPT-4o pipeline lasted 45.38 minutes (2720 s) and cost US $2.67. Additionally, we estimated a scanning time of 5-10 minutes for each dissertation. 
Each researcher spent about 1.5 hours drafting an abstract and about 0.5 hours creating the pooled version.</p></sec><sec id="s3-4"><title>Abstract Comparison</title><p>For each modality of abstract generation&#x2014;GPT-4o, Gemini-1.5-Flash, or human-generated&#x2014;two scores from different raters were collected. Scores covered the whole range from 1 to 6 for all modalities. Although there was no statistical difference in the nonparametric Kruskal-Wallis test for independent groups (<italic>P</italic>=.44), GPT-4o showed the best mean rating (2.44), while human-generated (3.00) and Gemini-1.5-Flash (3.25) were evaluated lower (<xref ref-type="fig" rid="figure3">Figure 3</xref>). Standard deviations (SD) were in the same range for all modalities (GPT-4o: 1.42; Gemini-1.5-Flash: 1.71; human-generated: 1.17).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Distribution of abstract quality ratings across GPT models and human raters. Comparison of abstract quality ratings (1=best, 6=worst) generated by 3 sources (GPT-4o, Gemini-1.5-Flash, and human evaluators) for a sample of German medical dissertations. Boxplots show median, interquartile range, and variability of the assigned scores.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e77707_fig03.png"/></fig></sec><sec id="s3-5"><title>Text Generation Evaluation</title><p>LLM-generated abstracts consisted of fewer words compared to human-generated ones (mean word counts were 141 for GPT-4o, 137 for Gemini, and 352 for human-generated abstracts). 
Mean BERTScores showed moderate-to-high congruence between LLMs and human-generated abstracts (mean <italic>F</italic><sub>1</sub>: GPT-4o: 0.72 and Gemini: 0.71, respectively; <xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>BERTScore for the 10 dissertations for GPT-4o and Gemini.<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Dissertation ID</td><td align="left" valign="top">GPT-4o BERTScore</td><td align="left" valign="top">Gemini BERTScore</td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="char" char="." valign="top">0.0</td><td align="left" valign="top">0.725</td></tr><tr><td align="left" valign="top">2</td><td align="char" char="." valign="top">0.685</td><td align="left" valign="top">0.709</td></tr><tr><td align="left" valign="top">3</td><td align="char" char="." valign="top">0.71</td><td align="left" valign="top">0.698</td></tr><tr><td align="left" valign="top">4</td><td align="char" char="." valign="top">0.728</td><td align="left" valign="top">0.718</td></tr><tr><td align="left" valign="top">5</td><td align="char" char="." valign="top">0.691</td><td align="left" valign="top">0.7</td></tr><tr><td align="left" valign="top">6</td><td align="char" char="." valign="top">0.723</td><td align="left" valign="top">0.731</td></tr><tr><td align="left" valign="top">7</td><td align="char" char="." valign="top">0.719</td><td align="left" valign="top">0.649</td></tr><tr><td align="left" valign="top">8</td><td align="char" char="." valign="top">0.758</td><td align="left" valign="top">0.715</td></tr><tr><td align="left" valign="top">9</td><td align="char" char="." valign="top">0.73</td><td align="left" valign="top">0.714</td></tr><tr><td align="left" valign="top">10</td><td align="char" char="." 
valign="top">0.731</td><td align="left" valign="top">0.727</td></tr><tr><td align="left" valign="top">All</td><td align="char" char="." valign="top">0.72</td><td align="left" valign="top">0.707</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>BERT: bidirectional encoder representations from transformers.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-6"><title>Translation Evaluation</title><p>When abstracts generated by both LLMs were translated into English and then back to German by using GPT-4o, the retranslated new German abstracts did not show a loss of information in comparison to the original German abstracts, although the wording was not exactly the same in all 10 cases.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Main Findings</title><p>Results from this feasibility study suggest that LLMs can be used to extract relevant general characteristics from doctoral theses. Further, the models (GPT-4o and Gemini-1.5-Flash) accurately generated uniform abstracts 90% and 100% of the time, respectively. Machine-based translation of these abstracts into English did not show a loss of information. When asked, other researchers perceived these LLM-generated abstracts as similar to human-generated abstracts. BERTScores showed moderate-to-high similarity between LLM- versus human-generated abstracts. However, as hallucinations could not be totally eliminated, LLMs provide a tool to explore but not analyze medical dissertations.</p></sec><sec id="s4-2"><title>What Is Already Known</title><p>Medical knowledge is growing rapidly, and it is close to impossible to keep up with new insights and information [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. 
Although health care systems currently use both paper-based and electronic health records [<xref ref-type="bibr" rid="ref24">24</xref>], OCR can help to precisely extract text from scanned documents [<xref ref-type="bibr" rid="ref13">13</xref>]. Lastly, accumulating evidence has shown that previously used rule-based or machine learning&#x2013;based IE has been outperformed by deep neural networks even if pretrained, and they can help process large amounts of data [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. New, publicly available LLMs like ChatGPT and Gemini could possibly help in IE of medical documents, although ethical concerns have been mentioned [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. Further, many projects have examined how and how well LLMs can help in IE of medical documents [<xref ref-type="bibr" rid="ref28">28</xref>-<xref ref-type="bibr" rid="ref32">32</xref>].</p><p>Although Recall-Oriented Understudy for Gisting Evaluation (ROUGE) [<xref ref-type="bibr" rid="ref33">33</xref>] for overlapping unigrams is a known metric for the evaluation of text generation, we chose the BERTScore as our evaluation metric. BERTScore captures the contextual embedding and was thus more suitable to our study aim to examine how well LLMs can capture the content of long dissertations and generate an abstract including the relevant information [<xref ref-type="bibr" rid="ref18">18</xref>]. 
The observed LLM performance in summarizing dissertations in our study was similar to reported <italic>F</italic><sub>1</sub> BERTScores in a Dutch study examining the differences in note-taking between humans and a digital scribe in a clinical workflow using various GPT models [<xref ref-type="bibr" rid="ref34">34</xref>].</p></sec><sec id="s4-3"><title>What This Study Adds</title><p>To our knowledge, this is the first study examining the usage of GPT-4o and Gemini-1.5-Flash for IE in German doctoral theses. We actively chose a feasible method of digitization by using a standard multifunctional office printer, which is not in line with national standards of archiving documents, to simulate everyday conditions [<xref ref-type="bibr" rid="ref9">9</xref>]. Our results suggest, however, that standardized IE through LLMs can be used to create uniform abstracts for dissertations that otherwise would remain difficult to access. Interestingly, these abstracts do not seem to be inferior to human-generated ones, indicated by the ratings of senior researchers. By costing as little as about US $0.25 and taking only 10-15 minutes (scanning: 5&#x2010;10 min, creation: 5 min) on average for each thesis, past results could be made more accessible and therefore easier to use for further research by using this analytical pipeline. This is especially true when considering that the creation of one pooled human-generated abstract took about 6 hours (360 min) and did not result in a higher rating from senior researchers, making it 24-36 times slower and more expensive.</p><p>A Dutch study examining note-taking performance reported that LLM-generated texts consisted of more words than human-generated texts (137 vs 101 words) [<xref ref-type="bibr" rid="ref34">34</xref>]. However, we observed that human-generated abstracts were about 2.5 times longer than the ones from LLMs. 
This might be due to the need not to miss any important information when writing an abstract as a human and therefore generating a longer text. Further, LLMs might have difficulties capturing the whole extent of a dissertation, thus generating a shorter text. Two dissertations with low <italic>F</italic><sub>1</sub> BERTScores (IDs 2 and 5) were both written on typewriters, which might have resulted in informational loss during the OCR process, thus possibly accounting for a lower evaluation score. Conversely, we observed that dissertations written in Microsoft Word and converted to PDF achieved higher <italic>F</italic><sub>1</sub> BERTScores (IDs 8 and 10).</p><p>Hallucinations in medical LLMs persist [<xref ref-type="bibr" rid="ref6">6</xref>], so users should be cautious about fully trusting the results. At this scientific stage, we propose using LLMs for medical dissertations as a tool for exploring&#x2014;for example, for abstract generation&#x2014;but recommend verifying generated information. By using this tool, we can unlock the rich data from doctoral theses in ADAM, making this valuable knowledge publicly accessible and strengthening up-to-date research.</p></sec><sec id="s4-4"><title>Strengths and Limitations</title><p>For this study, we only presented a small dataset including 10 doctoral theses of which the results can only be used for hypothesis generation and do not claim to be generalizable. Although we included files from 1965 and 2022, representing examples of the oldest and most recent documents in ADAM, further studies should include more dissertations to be more representative.</p><p>By using two recent LLMs (GPT-4o and Gemini-1.5-Flash), we could directly compare their respective results and performance. In this pilot study, however, we did not include additional models, which should be a focus of further studies. 
Additionally, because of the small sample size, we did not calculate interrater reliability among senior researchers, which might have biased the rating results. Although BERTScores have been effectively used in evaluating text generation by LLMs, they lack an understanding of nuances in medical language and therefore should be used with caution [<xref ref-type="bibr" rid="ref35">35</xref>]. This underlines the need for an evaluation metric designed specifically for medical contexts, as potential threats from misinformation are among the most discussed risks of LLM usage [<xref ref-type="bibr" rid="ref36">36</xref>].</p></sec><sec id="s4-5"><title>Conclusions</title><p>In this feasibility study using 10 medical dissertations completed between 1965 and 2022, we provide a proof of concept for the usage of LLMs, namely GPT-4o and Gemini-1.5-Flash, for IE from theses in ADAM. Within the constraints of a small, intentionally heterogeneous sample, results indicate promising feasibility. Using these models, typical publication characteristics as well as uniform abstracts could be generated, and they were not perceived differently from human-generated abstracts, while being about 30 times faster to generate.</p><p>However, as LLM hallucinations persist, research applications of LLMs should be approached with caution. 
Further research should include more dissertations, which will help researchers understand trends in various time periods and possibly allow for the grouping and summarizing of similar dissertations.</p></sec></sec></body><back><ack><p>Neither ChatGPT nor any other large language model was used during the writing or editing of this article.</p></ack><notes><sec><title>Funding</title><p>Funding was provided by the <italic>Foundation Perspektive Haus&#x00E4;rztinnen und Haus&#x00E4;rzte</italic> of the German General Practitioners&#x2019; Association (German: <italic>Stiftung Perspektive Hausarzt des Deutschen Haus&#x00E4;rztinnen- und Haus&#x00E4;rzteverbands e.V.</italic>).</p></sec><sec><title>Data Availability</title><p>Data will be made available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: J Cittadino, JS; methodology: J Cittadino, JS; validation: PT, TS, LW, J Cummerow, KG, CS, KF; formal analysis: J Cittadino; investigation: J Cittadino, JS; resources: JS; data curation: JS; writing&#x2014;original draft preparation: J Cittadino; writing&#x2014;review and editing: PT, TS, LW, J Cummerow, KG, CS, KF, JS; visualization: J Cittadino; supervision: JS; project administration: J Cittadino; funding acquisition: JS. 
All authors have read and agreed to the published version of the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ADAM</term><def><p><italic>Archiv der deutschsprachigen Allgemeinmedizin</italic> (English: Archive of German-language General Practice)</p></def></def-item><def-item><term id="abb2">BERT</term><def><p>bidirectional encoder representations from transformers</p></def></def-item><def-item><term id="abb3">DEGAM</term><def><p>German Society of General Practitioners and Family Physicians</p></def></def-item><def-item><term id="abb4">IE</term><def><p>information extraction</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">OCR</term><def><p>optical character recognition</p></def></def-item><def-item><term id="abb7">ROUGE</term><def><p>Recall-Oriented Understudy for Gisting Evaluation</p></def></def-item><def-item><term id="abb8">TRIPOD</term><def><p>transparent reporting of a multivariable model for individual prognosis or diagnosis</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Mader</surname><given-names>FH</given-names> </name></person-group><source>Von Der Allgemeinen Medizin Zur Allgemeinmedizin: Festschrift 50 Jahre DEGAM: 1966-2016</source><year>2016</year><access-date>2025-11-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.degam.de/files/inhalt/pdf/festschrift_2016/degam_jubilaeum_festschrift.pdf">https://www.degam.de/files/inhalt/pdf/festschrift_2016/degam_jubilaeum_festschrift.pdf</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name 
name-style="western"><surname>Zwierlein</surname><given-names>R</given-names> </name></person-group><source>ADAM &#x2013; the newborn archive of general practice in Germany</source><year>2019</year><access-date>2025-10-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="http://dmhs1917.dk/wp-content/uploads/2019/06/Archive-General-Practice-Germany.pdf">http://dmhs1917.dk/wp-content/uploads/2019/06/Archive-General-Practice-Germany.pdf</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sosna</surname><given-names>N</given-names> </name><name name-style="western"><surname>Steinh&#x00E4;user</surname><given-names>J</given-names> </name></person-group><year>2022</year><access-date>2025-11-30</access-date><conf-name>56 Kongress f&#x00FC;r Allgemeinmedizin und Familienmedizin</conf-name><conf-date>Sep 15-17, 2022</conf-date><conf-loc>Greifswald, Germany</conf-loc><publisher-name>German Medical Science GMS Publishing House</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.egms.de/static/de/meetings/degam2022/22degam202.shtml">https://www.egms.de/static/de/meetings/degam2022/22degam202.shtml</ext-link></comment><pub-id pub-id-type="doi">10.3205/22degam202</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sosna</surname><given-names>N</given-names> </name><name name-style="western"><surname>Steinh&#x00E4;user</surname><given-names>J</given-names> </name></person-group><article-title>Exploring general practice research in Germany: a systematic review of dissertation topics from 1965-2023</article-title><source>Scand J Prim Health Care</source><year>2024</year><month>09</month><volume>42</volume><issue>3</issue><fpage>393</fpage><lpage>398</lpage><pub-id 
pub-id-type="doi">10.1080/02813432.2024.2329213</pub-id><pub-id pub-id-type="medline">38488125</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Janssen</surname><given-names>BV</given-names> </name><name name-style="western"><surname>Kazemier</surname><given-names>G</given-names> </name><name name-style="western"><surname>Besselink</surname><given-names>MG</given-names> </name></person-group><article-title>The use of ChatGPT and other large language models in surgical science</article-title><source>BJS Open</source><year>2023</year><month>03</month><day>7</day><volume>7</volume><issue>2</issue><fpage>zrad032</fpage><pub-id pub-id-type="doi">10.1093/bjsopen/zrad032</pub-id><pub-id pub-id-type="medline">36960954</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Jeong</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Medical hallucinations in foundation models and their impact on healthcare</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 26, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2503.05777</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Fornasiere</surname><given-names>R</given-names> </name><name name-style="western"><surname>Brunello</surname><given-names>N</given-names> </name><name name-style="western"><surname>Scotti</surname><given-names>V</given-names> </name><name name-style="western"><surname>Carman</surname><given-names>M</given-names> 
</name><name name-style="western"><surname>Freihat</surname><given-names>AA</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Freihat</surname><given-names>AA</given-names> </name></person-group><article-title>Medical information extraction with large language models</article-title><year>2024</year><access-date>2025-11-27</access-date><conf-name>Proceedings of the 7th International Conference on Natural Language and Speech Processing (ICNLSP 2024)</conf-name><fpage>456</fpage><lpage>466</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2024.icnlsp-1.47/">https://aclanthology.org/2024.icnlsp-1.47/</ext-link></comment></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dagdelen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dunn</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Structured information extraction from scientific text with large language models</article-title><source>Nat Commun</source><year>2024</year><month>02</month><day>15</day><volume>15</volume><issue>1</issue><fpage>1418</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-45563-x</pub-id><pub-id pub-id-type="medline">38360817</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="web"><article-title>DFG-Vordruck 12151&#x2013;12/16 &#x2013; Praxisregeln &#x201C;Digitalisierung.&#x201D; [Website in German]</article-title><source>Deutsche Forschungsgemeinschaft</source><year>2016</year><access-date>2025-10-15</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.dfg.de/resource/blob/176108/12-151-v1216-de.pdf">https://www.dfg.de/resource/blob/176108/12-151-v1216-de.pdf</ext-link></comment></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gallifant</surname><given-names>J</given-names> </name><name name-style="western"><surname>Afshar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ameen</surname><given-names>S</given-names> </name><etal/></person-group><article-title>The TRIPOD-LLM reporting guideline for studies using large language models</article-title><source>Nat Med</source><year>2025</year><month>01</month><volume>31</volume><issue>1</issue><fpage>60</fpage><lpage>69</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03425-5</pub-id><pub-id pub-id-type="medline">39779929</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>OpenAI</collab><name name-style="western"><surname>Hurst</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lerer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Goucher</surname><given-names>AP</given-names> </name><etal/></person-group><article-title>GPT-4o system card</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 25, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2410.21276</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>Team G</collab><name name-style="western"><surname>Georgiev</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lei</surname><given-names>VI</given-names> </name><etal/></person-group><article-title>Gemini 1.5: unlocking multimodal understanding across millions of tokens of 
context</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 1, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2403.05530</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hom</surname><given-names>J</given-names> </name><name name-style="western"><surname>Nikowitz</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ottesen</surname><given-names>R</given-names> </name><name name-style="western"><surname>Niland</surname><given-names>JC</given-names> </name></person-group><article-title>Facilitating clinical research through automation: Combining optical character recognition with natural language processing</article-title><source>Clin Trials</source><year>2022</year><month>10</month><volume>19</volume><issue>5</issue><fpage>504</fpage><lpage>511</lpage><pub-id pub-id-type="doi">10.1177/17407745221093621</pub-id><pub-id pub-id-type="medline">35608136</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Beutel</surname><given-names>G</given-names> </name><name name-style="western"><surname>Geerits</surname><given-names>E</given-names> </name><name name-style="western"><surname>Kielstein</surname><given-names>JT</given-names> </name></person-group><article-title>Artificial hallucination: GPT on LSD?</article-title><source>Crit Care</source><year>2023</year><month>04</month><day>18</day><volume>27</volume><issue>1</issue><fpage>148</fpage><pub-id pub-id-type="doi">10.1186/s13054-023-04425-6</pub-id><pub-id pub-id-type="medline">37072798</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chandra</surname><given-names>A</given-names> 
</name><name name-style="western"><surname>T&#x00FC;nnermann</surname><given-names>L</given-names> </name><name name-style="western"><surname>L&#x00F6;fstedt</surname><given-names>T</given-names> </name><name name-style="western"><surname>Gratz</surname><given-names>R</given-names> </name></person-group><article-title>Transformer-based deep learning for predicting protein properties in the life sciences</article-title><source>Elife</source><year>2023</year><month>01</month><day>18</day><volume>12</volume><fpage>e82819</fpage><pub-id pub-id-type="doi">10.7554/eLife.82819</pub-id><pub-id pub-id-type="medline">36651724</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Pezoa</surname><given-names>F</given-names> </name><name name-style="western"><surname>Reutter</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Suarez</surname><given-names>F</given-names> </name><name name-style="western"><surname>Ugarte</surname><given-names>M</given-names> </name><name name-style="western"><surname>Vrgo&#x010D;</surname><given-names>D</given-names> </name></person-group><article-title>Foundations of JSON schema</article-title><year>2016</year><month>04</month><day>11</day><conf-name>WWW &#x2019;16</conf-name><conf-date>Apr 11-15, 2016</conf-date><pub-id pub-id-type="doi">10.1145/2872427.2883029</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kruskal</surname><given-names>WH</given-names> </name><name name-style="western"><surname>Wallis</surname><given-names>WA</given-names> </name></person-group><article-title>Use of ranks in one-criterion variance analysis</article-title><source>J Am Stat 
Assoc</source><year>1952</year><month>12</month><volume>47</volume><issue>260</issue><fpage>583</fpage><lpage>621</lpage><pub-id pub-id-type="doi">10.1080/01621459.1952.10483441</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kishore</surname><given-names>V</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Weinberger</surname><given-names>KQ</given-names> </name><name name-style="western"><surname>Artzi</surname><given-names>Y</given-names> </name></person-group><source>BERTScore: evaluating text generation with BERT</source><year>2020</year><access-date>2025-10-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openreview.net/forum?id=SkeHuCVFDr">https://openreview.net/forum?id=SkeHuCVFDr</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Christen</surname><given-names>P</given-names> </name><name name-style="western"><surname>Hand</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Kirielle</surname><given-names>N</given-names> </name></person-group><article-title>A review of the F-Measure: its history, properties, criticism, and alternatives</article-title><source>ACM Comput Surv</source><year>2024</year><month>03</month><day>31</day><volume>56</volume><issue>3</issue><fpage>1</fpage><lpage>24</lpage><pub-id pub-id-type="doi">10.1145/3606367</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Rossum</surname><given-names>GV</given-names> </name><name 
name-style="western"><surname>Drake</surname><given-names>FL</given-names> </name></person-group><article-title>Python 3 reference manual</article-title><source>CreateSpace</source><year>2009</year><pub-id pub-id-type="other">1-4414-1269-7</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="report"><person-group person-group-type="author"><collab>World Health Organization</collab></person-group><article-title>Translation and cultural adaptation of health questionnaires</article-title><year>2016</year><access-date>2025-11-30</access-date><publisher-name>Geneva: World Health Organization</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://applications.emro.who.int/imemrf/J_Pak_Med_Assoc/J_Pak_Med_Assoc_2003_53_4_142_147.pdf">https://applications.emro.who.int/imemrf/J_Pak_Med_Assoc/J_Pak_Med_Assoc_2003_53_4_142_147.pdf</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Densen</surname><given-names>P</given-names> </name></person-group><article-title>Challenges and opportunities facing medical education</article-title><source>Trans Am Clin Climatol Assoc</source><year>2011</year><volume>122</volume><fpage>48</fpage><lpage>58</lpage><pub-id pub-id-type="medline">21686208</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landhuis</surname><given-names>E</given-names> </name></person-group><article-title>Scientific literature: information overload</article-title><source>Nature New Biol</source><year>2016</year><month>07</month><day>21</day><volume>535</volume><issue>7612</issue><fpage>457</fpage><lpage>458</lpage><pub-id pub-id-type="doi">10.1038/nj7612-457a</pub-id><pub-id pub-id-type="medline">27453968</pub-id></nlm-citation></ref><ref 
id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adane</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gizachew</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kendie</surname><given-names>S</given-names> </name></person-group><article-title>The role of medical data in efficient patient care delivery: a review</article-title><source>Risk Manag Healthc Policy</source><year>2019</year><volume>12</volume><fpage>67</fpage><lpage>73</lpage><pub-id pub-id-type="doi">10.2147/RMHP.S179259</pub-id><pub-id pub-id-type="medline">31114410</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landolsi</surname><given-names>MY</given-names> </name><name name-style="western"><surname>Hlaoua</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ben Romdhane</surname><given-names>L</given-names> </name></person-group><article-title>Information extraction from electronic medical documents: state of the art and future research directions</article-title><source>Knowl Inf Syst</source><year>2023</year><volume>65</volume><issue>2</issue><fpage>463</fpage><lpage>516</lpage><pub-id pub-id-type="doi">10.1007/s10115-022-01779-1</pub-id><pub-id pub-id-type="medline">36405956</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hahn</surname><given-names>U</given-names> </name><name name-style="western"><surname>Oleynik</surname><given-names>M</given-names> </name></person-group><article-title>Medical information extraction in the age of deep learning</article-title><source>Yearb Med 
Inform</source><year>2020</year><month>08</month><volume>29</volume><issue>1</issue><fpage>208</fpage><lpage>220</lpage><pub-id pub-id-type="doi">10.1055/s-0040-1702001</pub-id><pub-id pub-id-type="medline">32823318</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harrer</surname><given-names>S</given-names> </name></person-group><article-title>Attention is not all you need: the complicated case of ethically using large language models in healthcare and medicine</article-title><source>EBioMedicine</source><year>2023</year><month>04</month><volume>90</volume><fpage>104512</fpage><pub-id pub-id-type="doi">10.1016/j.ebiom.2023.104512</pub-id><pub-id pub-id-type="medline">36924620</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hsu</surname><given-names>E</given-names> </name><name name-style="western"><surname>Roberts</surname><given-names>K</given-names> </name></person-group><article-title>LLM-IE: a Python package for biomedical generative information extraction with large language models</article-title><source>JAMIA Open</source><year>2025</year><month>04</month><volume>8</volume><issue>2</issue><fpage>ooaf012</fpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooaf012</pub-id><pub-id pub-id-type="medline">40078164</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wiest</surname><given-names>IC</given-names> </name><name name-style="western"><surname>Wolf</surname><given-names>F</given-names> </name><name name-style="western"><surname>Le&#x00DF;mann</surname><given-names>ME</given-names> </name><etal/></person-group><article-title>LLM-AIx: an open source pipeline for Information Extraction 
from unstructured medical text based on privacy preserving large language models</article-title><source>medRxiv</source><year>2024</year><month>09</month><day>3</day><fpage>2024.09.02.24312917</fpage><pub-id pub-id-type="doi">10.1101/2024.09.02.24312917</pub-id><pub-id pub-id-type="medline">39281753</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sciannameo</surname><given-names>V</given-names> </name><name name-style="western"><surname>Pagliari</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Urru</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Information extraction from medical case reports using OpenAI InstructGPT</article-title><source>Comput Methods Programs Biomed</source><year>2024</year><month>10</month><volume>255</volume><fpage>108326</fpage><pub-id pub-id-type="doi">10.1016/j.cmpb.2024.108326</pub-id><pub-id pub-id-type="medline">39029416</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schopow</surname><given-names>N</given-names> </name><name name-style="western"><surname>Osterhoff</surname><given-names>G</given-names> </name><name name-style="western"><surname>Baur</surname><given-names>D</given-names> </name></person-group><article-title>Applications of the natural language processing tool ChatGPT in clinical practice: comparative study and augmented systematic review</article-title><source>JMIR Med Inform</source><year>2023</year><month>11</month><day>28</day><volume>11</volume><fpage>e48933</fpage><pub-id pub-id-type="doi">10.2196/48933</pub-id><pub-id pub-id-type="medline">38015610</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Huang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Rong</surname><given-names>R</given-names> </name><etal/></person-group><article-title>A critical assessment of using ChatGPT for extracting structured data from clinical notes</article-title><source>NPJ Digit Med</source><year>2024</year><month>05</month><day>1</day><volume>7</volume><issue>1</issue><fpage>106</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01079-8</pub-id><pub-id pub-id-type="medline">38693429</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>CY</given-names> </name></person-group><article-title>ROUGE: a package for automatic evaluation of summaries</article-title><year>2004</year><access-date>2025-10-15</access-date><conf-name>Text Summarization Branches Out</conf-name><conf-date>Jul 21-26, 2004</conf-date><conf-loc>Barcelona, Spain</conf-loc><fpage>74</fpage><lpage>81</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/W04-1013/">https://aclanthology.org/W04-1013/</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Buchem</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Kant</surname><given-names>IMJ</given-names> </name><name name-style="western"><surname>King</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kazmaier</surname><given-names>J</given-names> </name><name name-style="western"><surname>Steyerberg</surname><given-names>EW</given-names> </name><name name-style="western"><surname>Bauer</surname><given-names>MP</given-names> 
</name></person-group><article-title>Impact of a digital scribe system on clinical documentation time and quality: usability study</article-title><source>JMIR AI</source><year>2024</year><month>09</month><day>23</day><volume>3</volume><fpage>e60020</fpage><pub-id pub-id-type="doi">10.2196/60020</pub-id><pub-id pub-id-type="medline">39312397</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jung</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Enhancing clinical efficiency through LLM: discharge note generation for cardiac patients</article-title><source>arXiv</source><comment>Preprint posted online on Apr 8, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2404.05144</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Preiksaitis</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ashenburg</surname><given-names>N</given-names> </name><name name-style="western"><surname>Bunney</surname><given-names>G</given-names> </name><etal/></person-group><article-title>The role of large language models in transforming emergency medicine: scoping review</article-title><source>JMIR Med Inform</source><year>2024</year><month>05</month><day>10</day><volume>12</volume><fpage>e53787</fpage><pub-id pub-id-type="doi">10.2196/53787</pub-id><pub-id pub-id-type="medline">38728687</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>TRIPOD+LLM checklist.</p><media xlink:href="formative_v9i1e77707_app1.pdf" xlink:title="PDF File, 176 
KB"/></supplementary-material></app-group></back></article>