<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i1e53216</article-id>
      <article-id pub-id-type="pmid">38329787</article-id>
      <article-id pub-id-type="doi">10.2196/53216</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Investigating the Impact of Prompt Engineering on the Performance of Large Language Models for Standardizing Obstetric Diagnosis Text: Comparative Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Harris</surname>
            <given-names>Marcelline</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chen</surname>
            <given-names>Huiling</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Lei</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0005-2757-1748</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Bi</surname>
            <given-names>Wenshuai</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0009-3011-896X</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Zhao</surname>
            <given-names>Suling</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4024-8532</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Ma</surname>
            <given-names>Yinyao</given-names>
          </name>
          <degrees>MS, MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2755-2169</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Lv</surname>
            <given-names>Longting</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0008-6009-4922</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Meng</surname>
            <given-names>Chenwei</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0009-0493-906X</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Fu</surname>
            <given-names>Jingru</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0008-2067-5618</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Lv</surname>
            <given-names>Hanlin</given-names>
          </name>
          <degrees>PhD, MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>BGI Research</institution>
            <addr-line>Building 11, Beishan Industrial Zone</addr-line>
            <addr-line>Yantian District</addr-line>
            <addr-line>Shenzhen, 518083</addr-line>
            <country>China</country>
            <phone>86 18707190886</phone>
            <email>lvhanlin@genomics.cn</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1876-7846</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>BGI Research</institution>
        <addr-line>Shenzhen</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>The People's Hospital of Guangxi Zhuang Autonomous Region</institution>
        <addr-line>Guangxi</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Hanlin Lv <email>lvhanlin@genomics.cn</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>8</day>
        <month>2</month>
        <year>2024</year>
      </pub-date>
      <volume>8</volume>
      <elocation-id>e53216</elocation-id>
      <history>
        <date date-type="received">
          <day>29</day>
          <month>9</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>20</day>
          <month>12</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>25</day>
          <month>12</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>11</day>
          <month>1</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Lei Wang, Wenshuai Bi, Suling Zhao, Yinyao Ma, Longting Lv, Chenwei Meng, Jingru Fu, Hanlin Lv. Originally published in JMIR Formative Research (https://formative.jmir.org), 08.02.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2024/1/e53216" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The accumulation of vast electronic medical records (EMRs) through medical informatization creates significant research value, particularly in obstetrics. Diagnostic standardization across different health care institutions and regions is vital for medical data analysis. Large language models (LLMs) have been extensively used for various medical tasks. Prompt engineering is key to use LLMs effectively.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to evaluate and compare the performance of LLMs with various prompt engineering techniques on the task of standardizing obstetric diagnostic terminology using real-world obstetric data.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>The paper describes a 4-step approach used for mapping diagnoses in electronic medical records to the International Classification of Diseases, 10th revision, observation domain. First, similarity measures were used for mapping the diagnoses. Second, candidate mapping terms were collected based on similarity scores above a threshold, to be used as the training data set. For generating optimal mapping terms, we used 2 LLMs (ChatGLM2 and Qwen-14B-Chat [QWEN]) for zero-shot learning in step 3. Finally, a performance comparison was conducted by using 3 pretrained bidirectional encoder representations from transformers (BERTs), including BERT, whole word masking BERT, and momentum contrastive learning with BERT (MC-BERT), for unsupervised optimal mapping term generation in the fourth step.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>LLMs and BERT demonstrated comparable performance at their respective optimal levels. LLMs showed clear advantages in terms of performance and efficiency in unsupervised settings. Interestingly, the performance of the LLMs varied significantly across different prompt engineering setups. For instance, when applying the self-consistency approach in QWEN, the <italic>F</italic><sub>1</sub>-score improved by 5%, with precision increasing by 7.9%, outperforming the zero-shot method. Likewise, ChatGLM2 delivered similar rates of accurately generated responses. During the analysis, the BERT series served as a comparative model with comparable results. Among the 3 models, MC-BERT demonstrated the highest level of performance. However, the differences among the versions of BERT in this study were relatively insignificant.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>After applying LLMs to standardize diagnoses and designing 4 different prompts, we compared the results to those generated by the BERT model. Our findings indicate that QWEN prompts largely outperformed the other prompts, with precision comparable to that of the BERT model. These results demonstrate the potential of unsupervised approaches in improving the efficiency of aligning diagnostic terms in daily research and uncovering hidden information values in patient data.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>obstetric data</kwd>
        <kwd>similarity embedding</kwd>
        <kwd>term standardization</kwd>
        <kwd>large language models</kwd>
        <kwd>LLMs</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>The advancement of medical informatization has resulted in the accumulation of vast amounts of electronic medical records (EMRs) in hospitals, giving rise to medical big data [<xref ref-type="bibr" rid="ref1">1</xref>]. These data hold significant research value. Using obstetrics as an example, the implementation of China’s “three-child” policy in 2021 has led to an increasing proportion of women with advanced maternal age and multiparity. Studies indicate that as maternal age and parity increase, the occurrence of pregnancy complications and adverse pregnancy outcomes also tends to rise, posing new challenges for obstetrics across health care institutions at all levels [<xref ref-type="bibr" rid="ref2">2</xref>]. Extracting valuable information from obstetric EMRs could significantly benefit clinical research aimed at improving pregnancy success rates.</p>
      <p>However, due to varying writing habits among doctors, diagnostic descriptions in medical records lack standardization, which hinders the analysis and use of medical data. Consequently, mapping clinical diagnostic descriptions to a standard terminology database is vital for medical data analysis. This process enables the standardization of medical terms across different health care institutions and regions, preventing misunderstandings and confusion caused by varying terminologies. It positively impacts health care quality, reduces medical costs, enhances doctor-patient relationships, and promotes the development of medical science.</p>
      <p>The emergence of large language models (LLMs), represented by ChatGPT, has caused a surge in interest in their application across various fields of research. In the medical domain, LLMs have been extensively used for tasks such as intelligent medical history collection and preliminary diagnosis, personalized treatment and drug recommendations, medical record documentation and report generation, literature retrieval and analysis, and medical education and training [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref6">6</xref>]. Kanjee et al [<xref ref-type="bibr" rid="ref7">7</xref>] assessed ChatGPT’s ability to accurately diagnose challenging medical cases and suggested that generative artificial intelligence (AI) models hold promise as potential aids to human diagnostic cognition. Research by Agbavor and Liang [<xref ref-type="bibr" rid="ref8">8</xref>] demonstrated that GPT-3–generated text embeddings can reliably distinguish Alzheimer disease patients from healthy controls and infer cognitive test scores of patients, potentially enhancing early dementia diagnosis. Palanica et al [<xref ref-type="bibr" rid="ref9">9</xref>] explored ChatGPT’s potential applications in psychological counseling, emotional support, and mental illness screening while discussing related challenges and future research directions.</p>
      <p>LLMs have also played a crucial role in medical research. Clinical research often involves large amounts of unlabeled natural language data, and LLMs’ zero-shot learning ability allows them to effectively process such data. Agrawal et al [<xref ref-type="bibr" rid="ref10">10</xref>] showed that ChatGPT excels in extracting zero-shot and few-shot information from clinical texts. Hu et al [<xref ref-type="bibr" rid="ref11">11</xref>] revealed ChatGPT’s potential in zero-shot clinical entity recognition tasks. Furthermore, Lamichhane's [<xref ref-type="bibr" rid="ref12">12</xref>] 3 text-based experiments on mental health classification demonstrated ChatGPT’s potential in zero-shot text classification tasks.</p>
      <p>LLMs are chatbot technologies based on natural language processing and deep learning; they learn language patterns and knowledge from a large amount of text data to realize natural conversations with humans. The key to effectively using LLMs is to set an optimal prompt [<xref ref-type="bibr" rid="ref13">13</xref>].</p>
      <p>In few-shot learning, designing appropriate prompts can help LLMs learn better from a small number of training samples and improve performance [<xref ref-type="bibr" rid="ref13">13</xref>]. Even in zero-shot learning scenarios, appropriate prompts can guide LLMs to use contextual information to output correct results [<xref ref-type="bibr" rid="ref14">14</xref>]. Prompt engineering has been widely used in various fields of natural language processing, such as question answering, text generation, and sentiment classification, as well as other tasks. By carefully designing prompts, LLMs can better understand the task requirements and context and generate more accurate and useful outputs [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. In addition, prompt engineering is an efficient method that does not rely on large-scale computing resources. It can narrow the gap between the pretraining and fine-tuning stages, improve the model’s learning ability and generalization ability on a small amount of data, and fully exploit the model’s potential performance [<xref ref-type="bibr" rid="ref15">15</xref>].</p>
      <p>Chain-of-thought (CoT) prompts were proposed by Wei et al [<xref ref-type="bibr" rid="ref17">17</xref>], who experimented with the effect of CoT prompts on multiple tasks, including mathematical problems, logical reasoning, reading comprehension, and common sense reasoning; they compared it with other prompt engineering techniques and pointed out that CoT prompts could significantly improve the model’s performance on these tasks and even allow the model to show complex reasoning abilities, such as induction, deduction, and analogy. The basic idea of CoT prompts is that, when giving a question or task, instead of directly asking the model to give an answer or result, the user asks the model to give a CoT, that is, a series of intermediate reasoning steps in which each step is a complete sentence, and the last step is the answer or result. The advantage of this is that it can make the model better understand the meaning and goal of the question or task, avoid irrelevant or wrong outputs, and also make it easier for human users to check and evaluate the model’s output.</p>
      <p>The goal of self-consistency prompts is to improve the quality and consistency of the generated results by requiring the model to make consistency judgments on the previously generated text [<xref ref-type="bibr" rid="ref18">18</xref>]. When using self-consistency prompts, the user first provides an initial text as a prompt and then lets the model continue to generate the subsequent text. Next, the user replaces the “greedy decoding” in the CoT prompt with sampling from the language model’s decoder to generate a set of diverse reasoning paths; finally, the user marginalizes the reasoning paths and aggregates them by selecting the most consistent answer as the final answer. This can force the model to maintain self-consistency when generating text, avoiding contradictions and incoherence.</p>
      <p>This paper delves into the potential of LLMs for zero-shot or unsupervised learning in the domain of standardizing diagnostic terminology in obstetrics. By leveraging a composite approach that merges different prompt engineering techniques with LLMs, our goal is to identify the most fitting pipeline for unsupervised scenarios.</p>
      <p>As most of the LLMs used in the Chinese domain use the Chinese version of the International Classification of Diseases, 10th revision (ICD-10-CN), as their core training corpus [<xref ref-type="bibr" rid="ref19">19</xref>], in order to compare the performance of LLMs and supervised learning algorithms horizontally on a baseline, we used standard diagnostic terminology in the ICD-10-CN as the alignment target throughout this study.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Task Overview</title>
        <p>The approach can be divided into 4 steps: (1) mapping the diagnosis in EMRs to the observation domain of the ICD-10-CN via embedded similarity; (2) collecting the candidate mapping terms with similarity above the threshold as the training data set; (3) using 2 LLMs, ChatGLM2 [<xref ref-type="bibr" rid="ref20">20</xref>] and Qwen-14B-Chat (QWEN) [<xref ref-type="bibr" rid="ref21">21</xref>], with zero-shot learning to generate the optimal mapping terms; and (4) using 3 pretrained bidirectional encoder representations from transformers (BERTs), BERT [<xref ref-type="bibr" rid="ref22">22</xref>], whole word masking BERT (BERT-WWM) [<xref ref-type="bibr" rid="ref23">23</xref>], and momentum contrastive learning with BERT (MC-BERT) [<xref ref-type="bibr" rid="ref24">24</xref>], for unsupervised generation of the optimal mapping terms for performance comparison. The entire workflow is illustrated in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>The 4-step approach of this study. For optimal term selection, the combination of a large language model (LLM) and prompt engineering contributes to the unsupervised learning approach to select the optimal terms from 10 candidates. BERT: bidirectional encoder representations from transformers; BERT-WWM: whole word masking BERT; MC-BERT: momentum contrastive learning with BERT; QWEN: Qwen-14B-Chat.</p>
          </caption>
          <graphic xlink:href="formative_v8i1e53216_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Data Preparation</title>
        <p>In this study, the raw data were collected from the obstetric EMR data of the People’s Hospital of Guangxi Zhuang Autonomous Region from April 2014 to April 2022; these data contained only diagnostic reports. Sample data are shown in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>.</p>
        <boxed-text id="box1" position="float">
          <title>A sample of the diagnosis data for ID 720444 is listed below with a translated version. All data processed in this research were in Chinese.</title>
          <p>
            <bold>Discharge diagnoses</bold>
          </p>
          <p>1. 头位顺产</p>
          <p>2. 单胎活产</p>
          <p>3. 孕1产1妊娠39+4周</p>
          <p>4. 羊水偏少</p>
          <p>
            <bold>Translations</bold>
          </p>
          <p>1. Vertex delivery</p>
          <p>2. Singleton live birth</p>
          <p>3. Pregnancy: G1P1, 39+4 weeks</p>
          <p>4. Oligohydramnios</p>
        </boxed-text>
        <p>The raw data set underwent data preprocessing by removing punctuation marks and meaningless special symbols to avoid potential interference with subsequent word segmentation operations.</p>
        <p>We implemented LLMs in an intranet security environment. Both ChatGLM2 and QWEN were used exclusively on physically isolated graphical processing units, with access facilitated via OpenAI format and FastAPI (built on PyTorch 2.0). Temperature settings for the LLMs were configured at 0, with <italic>max_token</italic> parameters tailored on a task-by-task basis.</p>
        <p>The standard vocabulary referred to in the following text consists of the diagnostic categories belonging to the observation domain of ICD-10-CN.</p>
      </sec>
      <sec>
        <title>Embedding Learning</title>
        <p>We used the conditional random fields (CRF) model [<xref ref-type="bibr" rid="ref24">24</xref>] to segment the text and obtained original-diagnosis raw data aligned with standard vocabulary terms. The principle of CRF is to treat word segmentation as a character position classification problem. Character position information is often defined as follows: <italic>B</italic> represents the beginning of a word, <italic>M</italic> denotes the middle of a word, <italic>E</italic> signifies the end of a word, and <italic>S</italic> indicates a single-character word. Feature functions are constructed to describe the relationship between each character and label and the transition between adjacent labels. Using training data, we learn the weights of feature functions to maximize conditional probability. The Viterbi algorithm predicts new input sequences and finds the most probable label sequence; according to the label sequence, we construct word segmentation results from characters between <italic>B</italic> and <italic>E</italic> and single characters <italic>S</italic>. As shown in <xref ref-type="boxed-text" rid="box2">Textbox 2</xref>, we conducted CRF word segmentation on diagnoses in EMRs.</p>
        <boxed-text id="box2" position="float">
          <title>Sample of word segmentation with the conditional random fields model. The data below represent a preliminary diagnosis of placental abruption.</title>
          <p>Original word: 初步诊断为胎盘早剥;</p>
          <p>After CRF annotation: 初/B 步/M 诊/M 断/E 为/S 胎/B 盘/M 早/M 剥/E</p>
          <p>According to the label sequence, the word segmentation result is as follows: 初步诊断/为/胎盘早剥.</p>
        </boxed-text>
        <p>To calculate the similarity between diagnoses in the raw data set and terms in the standard vocabulary, we used the BERT-medicine model to transform diagnoses and terms into embeddings for storage. The BERT-medicine model is specifically designed to improve the model’s understanding of medical terms and symptoms by introducing a medical domain–specific vocabulary list, lexicon, and pretraining tasks.</p>
        <p>The main structure of the BERT-medicine model is the BERT, and the main inputs are the raw word vectors of each word or phrase in the text. In this study, we used diagnoses in the raw data set as the input text sequences. The BERT model extracted the contextual information of the text through a self-attention mechanism and learned the bidirectional linguistic representations, so as to obtain a semantic representation of each word in its context. The final output embedding vector is represented by the sum of character embedding, partition embedding, and position embedding, which constitute the input sequence.</p>
      </sec>
      <sec>
        <title>Similarity Computation</title>
        <p>The feature embedding of the diagnosis is denoted by <inline-graphic xlink:href="formative_v8i1e53216_fig4.png" xlink:type="simple" mimetype="image"/>, the feature embedding of the standard terms is denoted by <inline-graphic xlink:href="formative_v8i1e53216_fig5.png" xlink:type="simple" mimetype="image"/>, and their similarity is calculated using the cosine similarity with the following formula:</p>
        <disp-formula>
          <graphic xlink:href="formative_v8i1e53216_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>The proposed approach is evaluated through the following steps: Standard terms with a similarity score higher than 0.9 are considered candidates for diagnosis keywords and are then verified by medical experts. The normalized precision and recall are calculated, and the precision-recall curve is obtained. Since a particular diagnosis might have multiple similar standard terms, we aimed to identify as many similar terms as possible, and we thus expected high recall and precision. To obtain candidate terms, we collected the original diagnosis and the 10 most similar standard terms having a similarity score greater than or equal to 0.855.</p>
      </sec>
      <sec>
        <title>Optimal Term Selection</title>
        <p>To comprehensively evaluate the performance of LLMs in the standardization of obstetric diagnostic terminology, we used 4 different prompts, with the prompt design ranging from simple to complex. This started with the prompt trained on zero samples (the zero-shot learning prompt); next was the prompt trained on a small number of samples (the in-context learning prompt), followed by the CoT prompt and finally the self-consistency prompt. The specific flow chart of LLM training is shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Prompt examples under 4 distinct prompt engineering methods: (A) the zero-shot learning prompt, (B) the in-context learning prompt, (C) the chain-of-thought prompt, and (D) the self-consistency prompt. In the experiment, the raw data being in Chinese led us to use Chinese prompts, which are translated for readability purposes. However, “diagnosis” and “candidate terms” are displayed in their original Chinese format. LLM: large language model.</p>
          </caption>
          <graphic xlink:href="formative_v8i1e53216_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>The zero-shot learning prompt was meant to guide the LLMs’ output by directly telling them the purpose of this study. The target task of this study was to find the standard-term expression for the diagnosis, that is, to let the LLMs determine the word with the highest similarity. Therefore, we directly told the LLMs to find the most similar word to the input word among the candidate words for the standard term. The LLMs determined the similarity between words based on their own learned knowledge, and then output the word with the highest similarity to the input word as the output result.</p>
        <p>The purpose of in-context learning prompts is to give context hints and let LLMs learn by analogy from few shots to output results that more closely meet the requirements [<xref ref-type="bibr" rid="ref25">25</xref>]. Its input is in the form of {question, answer}, that is, in the input, the question and result are given to the LLM as a template, and it answers the same type of questions in a specific way according to the specific answer.</p>
        <p>The input form of CoT prompts is similar to in-context learning prompts, that is, {question, answer}, with the difference that the answer contains the intermediate steps of thinking. In order to reduce human costs, we used LLMs to generate CoT prompts, and then encapsulated them into the prompt inputs.</p>
        <p>The key method of self-consistency prompts in this study was to input the CoT prompts from the previous section multiple times, obtain multiple results, randomly sample a group of output results, and use the majority voting method to decide the final result. Next, we will demonstrate the experimental process with different prompts through specific examples, shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Detailed illustration of the technical intricacies underlying this study. The process of mapping nonstandardized local diagnostic text to standardized International Classification of Diseases, 10th revision, Chinese version (ICD-10-CN) terms involves preliminary similarity-based selection through the vector database, followed by optimal solution selection performed by large language models (LLMs) based on semantic comprehension.</p>
          </caption>
          <graphic xlink:href="formative_v8i1e53216_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Evaluation</title>
        <p>The evaluation metrics in this study to assess the model’s performance were precision, recall, and <italic>F</italic><sub>1</sub>-score [<xref ref-type="bibr" rid="ref26">26</xref>]. We classified words that matched the original word and the standard word as positives, and those that did not match as negatives. There were 4 possible classification outcomes: true positive, in which the model correctly identified a positive as positive; false negative, in which the model mistakenly classified a positive as negative; true negative, in which the model correctly identified a negative as negative; and false positive, in which the model mistakenly classified a negative as positive. Using these classification outcomes, we could calculate precision, recall, and <italic>F</italic><sub>1</sub>-score to evaluate the model’s performance in standardizing diagnoses.</p>
        <p>Precision, recall, and <italic>F</italic><sub>1</sub>-score (the harmonic mean of precision and recall) were defined as follows:</p>
        <disp-formula>
          <graphic xlink:href="formative_v8i1e53216_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>The study was approved by the People’s Hospital of the Guangxi Zhuang Autonomous Region in China (KT-KJT-2021-67), and all pregnancy data were deidentified and anonymized.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Overview</title>
        <p>For similarity computation, according to experimental tests, an average precision of 0.88 met the requirement for high precision and recall. The corresponding threshold value at this point was 0.855. Therefore, the threshold value for calculations of similarity was determined to be 0.855, which was used to filter out standard terms that were not similar enough to the diagnosis.</p>
        <p>After collecting the candidate data set, we used 2 LLMs and 4 techniques for prompt engineering. Subsequently, we mapped the LLM outputs to the most suitable candidate terms from the ICD-10-CN standard vocabulary, enabling us to calculate precision, recall, and <italic>F</italic><sub>1</sub>-score. In order to undertake entity normalization, we selected the classic BERT series, comprising BERT, MC-BERT, and BERT-WWM, as our comparison models. We then compared their performance with the results obtained using the LLMs with 4 different prompts. The outcomes of this comparison are presented in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Metric performance comparison across large language model and bidirectional encoder representations from transformers (BERT) series.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="370"/>
            <col width="200"/>
            <col width="0"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Model and prompt engineering approach</td>
                <td>Precision, %</td>
                <td colspan="2">Recall, %</td>
                <td><italic>F</italic><sub>1</sub>-score, %</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">BERT<sup>a</sup></td>
                <td>91.93</td>
                <td colspan="2">91.95</td>
                <td>91.94</td>
              </tr>
              <tr valign="top">
                <td colspan="2">Momentum contrastive learning with BERT (MC-BERT)<sup>a</sup></td>
                <td>92.34</td>
                <td colspan="2">92.37</td>
                <td>92.35</td>
              </tr>
              <tr valign="top">
                <td colspan="2">BERT-whole word masking<sup>a</sup></td>
                <td>92.13</td>
                <td colspan="2">92.17</td>
                <td>92.15</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>ChatGLM2</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Zero shot</td>
                <td colspan="2">75.02</td>
                <td>89.90</td>
                <td>81.79</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>In context</td>
                <td colspan="2">85.13</td>
                <td>86.60</td>
                <td>85.85</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Chain of thought</td>
                <td colspan="2">86.52</td>
                <td>88.93</td>
                <td>82.51</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Self consistency</td>
                <td colspan="2">88.53</td>
                <td>90.11</td>
                <td>89.31</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>Qwen-14B-Chat</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Zero shot</td>
                <td colspan="2">84.01</td>
                <td>86.72</td>
                <td>85.53</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>In context</td>
                <td colspan="2">88.25</td>
                <td>91.18</td>
                <td>89.69</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Chain of thought</td>
                <td colspan="2">89.92</td>
                <td>91.30</td>
                <td>90.60</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Self consistency</td>
                <td colspan="2">90.91</td>
                <td>92.13</td>
                <td>91.51</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>Prompt engineering not applicable to these models.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>It is evident from the table that the LLMs and BERT displayed comparable performance at their optimal levels, indicating that the LLMs provided a performance and time advantage under unsupervised conditions. Furthermore, the LLMs exhibited varied performance under different prompt engineering setups. Taking QWEN as an example, the implementation of the self-consistency approach improved the <italic>F</italic><sub>1</sub>-score by 5% and precision by 7.9% compared to the zero-shot method. Similarly, ChatGLM2 showed relative improvements over its zero-shot baseline ranging from 9.19% to 18.02%. Thus, QWEN achieved better performance than ChatGLM2 in all 4 prompt engineering approaches.</p>
        <p>The BERT series were additional comparison models and exhibited more comparable results in this task. Among the 3 models shown in <xref ref-type="table" rid="table1">Table 1</xref>, MC-BERT delivered the best performance. However, in this study, the disparity between the 3 versions of BERT was relatively small.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Cluster results of standardized terms. Original words in Chinese translated to English via ChatGPT.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="60"/>
            <col width="140"/>
            <col width="130"/>
            <col width="140"/>
            <col width="130"/>
            <col width="140"/>
            <col width="130"/>
            <col width="130"/>
            <thead>
              <tr valign="top">
                <td>ID</td>
                <td>Word 0</td>
                <td>Word 1</td>
                <td>Word 2</td>
                <td>Word 3</td>
                <td>Word 4</td>
                <td>Word 5</td>
                <td>Word 6</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>2</td>
                <td>α-Thalassemia</td>
                <td>β-Thalassemia</td>
                <td>δ-β-Thalassemia</td>
                <td>Intermedia thalassemia</td>
                <td>Major thalassemia</td>
                <td>Combined thalassemia</td>
                <td>Thalassemia</td>
              </tr>
              <tr valign="top">
                <td>23</td>
                <td>Acute mixed-type fetal distress</td>
                <td>Acute fetal distress</td>
                <td>Acute fetal heart-type fetal distress</td>
                <td>Acute amniotic fluid–type fetal distress</td>
                <td>Chronic fetal distress</td>
                <td>Chronic fetal heart-type fetal distress</td>
                <td>Chronic amniotic fluid–type fetal distress</td>
              </tr>
              <tr valign="top">
                <td>55</td>
                <td>Fetal cardiac malformations</td>
                <td>Fetal limb malformations</td>
                <td>Fetus with multiple malformations</td>
                <td>Fetal ear malformations</td>
                <td>Fetal malformations</td>
                <td>Fetal structural anomalies</td>
                <td>Fetal kidney malformations</td>
              </tr>
              <tr valign="top">
                <td>73</td>
                <td>Uterine interstitial leiomyoma</td>
                <td>Uterine subserosal leiomyoma</td>
                <td>Uterine intramural leiomyoma</td>
                <td>Uterine submucosal leiomyoma</td>
                <td>Uterine mucosal leiomyoma</td>
                <td>Uterine leiomyoma</td>
                <td>Uterine multiple leiomyoma</td>
              </tr>
              <tr valign="top">
                <td>76</td>
                <td>Intrahepatic bile duct stones</td>
                <td>Hepatobiliary stones</td>
                <td>Biliary stones</td>
                <td>—<sup>a</sup></td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>25</td>
                <td>Severe pulmonary arterial hypertension</td>
                <td>Mild pulmonary arterial hypertension</td>
                <td>Moderate pulmonary arterial hypertension</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>20</td>
                <td>Central pelvic stenosis</td>
                <td>Pelvic stenosis</td>
                <td>Pelvic outlet stenosis</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>36</td>
                <td>Acute bronchitis</td>
                <td>Acute tracheitis</td>
                <td>Chronic bronchitis</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>91</td>
                <td>Pregnancy-related reproductive tract infection</td>
                <td>Pregnancy-related urinary tract infection</td>
                <td>Pregnancy-related urethral infection</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Additional Research</title>
        <p>In this study, we used the Louvain algorithm to mine terms from the standard data set output by the LLMs and obtained 1100 relatively common diagnostic terms. In the medical field, different medical institutions and professionals may use different terms to describe the same or similar clinical diagnoses, which can cause difficulties and misunderstandings in data exchange, statistics, and analysis. Therefore, standardizing clinical diagnostic terms is an important task. The standardized terms can be used to unify treatment plans and disease statistics, as well as to build clinical diagnostic knowledge bases. The data in our study were clustered into 107 clusters, and each cluster was analyzed separately, resulting in a diagnostic clustering table. Part of the results of the clustering table are shown in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Results</title>
        <p>This paper proposes an effective unsupervised standardization method for obstetric diagnosis. Through a multimetric comparison of different LLMs under various prompt engineering strategies, we found that unsupervised LLMs coupled with effective prompt engineering can achieve performance comparable to supervised learning.</p>
        <p>A comparison of different prompt engineering strategies showed that although the models’ baseline performance under zero-shot settings varied, they generally showed significant improvement after incorporating strategies such as CoT, which also highlights the importance of effective prompts for LLMs.</p>
        <p>The goal of our alignment in this study is the ICD-10-CN terminology, which belongs to the core vocabulary of the Chinese medical field. LLMs trained on Chinese language data usually include it as part of the training corpus [<xref ref-type="bibr" rid="ref19">19</xref>], and the performance of the baseline model allows prompt engineering to further improve the alignment performance.</p>
      </sec>
      <sec>
        <title>Comparison With Prior Work</title>
        <p>Compared to previous research that primarily relied on BERT-based methods to map diagnostic descriptions from EMRs to standard terminologies, this study explores a novel approach based on LLMs. Among BERT models, we identified MC-BERT as the top performer, achieving an <italic>F</italic><sub>1</sub>-score of 0.9235.</p>
        <p>Beyond the conventional BERT methods, we examined 4 mainstream prompt strategies and found that the self-consistency method outperformed the others, achieving an <italic>F</italic><sub>1</sub>-score of 0.9233. This level of performance matches that of supervised learning, opening up new possibilities for terminology mapping research in the medical domain.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>As all data were sourced from real-world patient information, and even though we anonymized the data through multiple strategies and only used a portion of the diagnostic text information without any personal identifying information, there is still a risk associated with uploading patient data to an open network. Additionally, as our research objective was to align and standardize Chinese text based on Chinese target terminologies, the choice of LLMs used in this study was limited. The development of LLMs in the Chinese domain is advancing rapidly, and there are many newly released versions that we have yet to explore.</p>
        <p>Moreover, our alignment target was for scientific exploration. In future studies, we will attempt to train target vocabulary that is more suited to the scientific research context into the model through methods such as global optimization and exploring semantic alignment scenarios.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This paper investigates the capability of LLMs in standardizing clinical medical terms. By using LLMs to standardize diagnostic terms extracted from real-world obstetric EMRs and designing 4 different prompts for LLMs, we were able to compare their output results with those of the BERT model. Our findings demonstrate that QWEN mostly achieved the best performance and had precision on par with the BERT model, which illustrates that an unsupervised approach improved the efficiency of aligning diagnostic terms in daily research and helped to uncover the hidden value of patient data information.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BERT</term>
          <def>
            <p>bidirectional encoder representations from transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">BERT-WWM</term>
          <def>
            <p>whole word masking bidirectional encoder representations from transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CoT</term>
          <def>
            <p>chain-of-thought</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">CRF</term>
          <def>
            <p>conditional random fields</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">EMR</term>
          <def>
            <p>electronic medical record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">ICD-10-CN</term>
          <def>
            <p>Chinese version of the International Classification of Diseases, 10th revision</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">MC-BERT</term>
          <def>
            <p>momentum contrastive learning with bidirectional encoder representations from transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">QWEN</term>
          <def>
            <p>Qwen-14B-Chat</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study was supported by Guangxi Key Research and Development Program (AB22035056). We thank the China National GeneBank for technical support.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data sets generated during and/or analyzed during this study are not publicly available due to privacy and ethical restrictions but are available from the corresponding author on reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>PB</given-names>
            </name>
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Brunak</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Mining electronic health records: towards better research applications and clinical care</article-title>
          <source>Nat Rev Genet</source>
          <year>2012</year>
          <month>05</month>
          <day>02</day>
          <volume>13</volume>
          <issue>6</issue>
          <fpage>395</fpage>
          <lpage>405</lpage>
          <pub-id pub-id-type="doi">10.1038/nrg3208</pub-id>
          <pub-id pub-id-type="medline">22549152</pub-id>
          <pub-id pub-id-type="pii">nrg3208</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Morales-Suárez-Varela</surname>
              <given-names>María</given-names>
            </name>
            <name name-style="western">
              <surname>Clemente-Bosch</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Peraita-Costa</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Llopis-Morales</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Martínez</surname>
              <given-names>Isabel</given-names>
            </name>
            <name name-style="western">
              <surname>Llopis-González</surname>
              <given-names>Agustín</given-names>
            </name>
          </person-group>
          <article-title>Maternal physical activity during pregnancy and the effect on the mother and newborn: a systematic review</article-title>
          <source>J Phys Act Health</source>
          <year>2021</year>
          <month>01</month>
          <day>01</day>
          <volume>18</volume>
          <issue>1</issue>
          <fpage>130</fpage>
          <lpage>147</lpage>
          <pub-id pub-id-type="doi">10.1123/jpah.2019-0348</pub-id>
          <pub-id pub-id-type="medline">33361475</pub-id>
          <pub-id pub-id-type="pii">jpah.2019-0348</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sallam</surname>
              <given-names>Malik</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT utility in healthcare education, research, and practice: systematic review on the promising perspectives and valid concerns</article-title>
          <source>Healthcare (Basel)</source>
          <year>2023</year>
          <month>03</month>
          <day>19</day>
          <volume>11</volume>
          <issue>6</issue>
          <fpage>9</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=healthcare11060887"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/healthcare11060887</pub-id>
          <pub-id pub-id-type="medline">36981544</pub-id>
          <pub-id pub-id-type="pii">healthcare11060887</pub-id>
          <pub-id pub-id-type="pmcid">PMC10048148</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Carey</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Isaac</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <source>The AI Revolution in Medicine: GPT-4 and Beyond</source>
          <year>2023</year>
          <month>04</month>
          <day>14</day>
          <publisher-loc>London, UK</publisher-loc>
          <publisher-name>Pearson Education</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>Camille</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <month>02</month>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jeblick</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Schachtner</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dexl</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mittermeier</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Stüber</surname>
              <given-names>Anna Theresa</given-names>
            </name>
            <name name-style="western">
              <surname>Topalis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Weber</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wesp</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sabel</surname>
              <given-names>BO</given-names>
            </name>
            <name name-style="western">
              <surname>Ricke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ingrisch</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT makes medicine easy to swallow: an exploratory case study on simplified radiology reports</article-title>
          <source>Eur Radiol</source>
          <year>2023</year>
          <month>10</month>
          <day>05</day>
          <fpage>1</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1007/s00330-023-10213-1</pub-id>
          <pub-id pub-id-type="medline">37794249</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00330-023-10213-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kanjee</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Crowe</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Rodman</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title>
          <source>JAMA</source>
          <year>2023</year>
          <month>07</month>
          <day>03</day>
          <volume>330</volume>
          <issue>1</issue>
          <fpage>78</fpage>
          <lpage>80</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37318797"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jama.2023.8288</pub-id>
          <pub-id pub-id-type="medline">37318797</pub-id>
          <pub-id pub-id-type="pii">2806457</pub-id>
          <pub-id pub-id-type="pmcid">PMC10273128</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Agbavor</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Predicting dementia from spontaneous speech using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2022</year>
          <month>12</month>
          <volume>1</volume>
          <issue>12</issue>
          <fpage>e0000168</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812634"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000168</pub-id>
          <pub-id pub-id-type="medline">36812634</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00226</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931366</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Palanica</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Flaschner</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Thommandram</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fossat</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Physicians' perceptions of chatbots in health care: cross-sectional web-based survey</article-title>
          <source>J Med Internet Res</source>
          <year>2019</year>
          <month>04</month>
          <day>05</day>
          <volume>21</volume>
          <issue>4</issue>
          <fpage>e12887</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2019/4/e12887/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/12887</pub-id>
          <pub-id pub-id-type="medline">30950796</pub-id>
          <pub-id pub-id-type="pii">v21i4e12887</pub-id>
          <pub-id pub-id-type="pmcid">PMC6473203</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Agrawal</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hegselmann</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sontag</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Large language models are few-shot clinical information extractors</article-title>
          <source>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2022</year>
          <publisher-loc>Abu Dhabi, United Arab Emirates</publisher-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>1998</fpage>
          <lpage>2022</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ameer</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Zuo</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Zero-shot clinical entity recognition using ChatGPT</article-title>
          <source>arXiv. Preprint posted online March 29, 2023</source>
          <pub-id pub-id-type="doi">10.48550/arXiv.2303.16416</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lamichhane</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of ChatGPT for NLP-based mental health applications</article-title>
          <source>arXiv. Preprint posted online March 28, 2023</source>
          <pub-id pub-id-type="doi">10.48550/arXiv.2303.15727</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Hayashi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Neubig</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing</article-title>
          <source>ACM Comput Surv</source>
          <year>2023</year>
          <month>01</month>
          <day>16</day>
          <volume>55</volume>
          <issue>9</issue>
          <fpage>1</fpage>
          <lpage>35</lpage>
          <pub-id pub-id-type="doi">10.1145/3560815</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Smola</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Automatic chain of thought prompting in large language models</article-title>
          <source>arXiv. Preprint posted online October 7, 2023</source>
          <pub-id pub-id-type="doi">10.48550/arXiv.2210.03493</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lester</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Rfou</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Constant</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>The power of scale for parameter-efficient prompt tuning</article-title>
          <source>arXiv. Preprint posted online April 18, 2021</source>
          <pub-id pub-id-type="doi">10.18653/v1/2021.emnlp-main.243</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>White</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Hays</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sandborn</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Olea</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gilbert</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>A prompt pattern catalog to enhance prompt engineering with ChatGPT</article-title>
          <source>arXiv. Preprint posted online February 21, 2023</source>
          <pub-id pub-id-type="doi">10.48550/arXiv.2302.11382</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Schuurmans</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bosma</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ichter</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title>
          <source>Adv Neural Inf Process Syst</source>
          <year>2022</year>
          <volume>35</volume>
          <fpage>24824</fpage>
          <lpage>24837</lpage>
          <pub-id pub-id-type="doi">10.48550/arXiv.2201.11903</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schuurmans</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Narang</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Self-consistency improves chain of thought reasoning in language models</article-title>
          <source>arXiv. Preprint posted online March 21, 2022</source>
          <pub-id pub-id-type="doi">10.48550/arXiv.2203.11171</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Boyle</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kascenas</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lok</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Liakata</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Automated clinical coding using off-the-shelf large language models</article-title>
          <source>arXiv. Preprint posted online October 10, 2023</source>
          <pub-id pub-id-type="doi">10.48550/arXiv.2310.06552</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Du</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Qiu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>GLM: general language model pretraining with autoregressive blank infilling</article-title>
          <source>arXiv. Preprint posted online March 18, 2021</source>
          <pub-id pub-id-type="doi">10.48550/arXiv.2103.10360</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bai</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Dang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Qwen technical report</article-title>
          <source>arXiv. Preprint posted online September 28, 2023</source>
          <pub-id pub-id-type="doi">10.48550/arXiv.2309.16609</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: Pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>Proceedings of the 2019 Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2019</year>
          <conf-name>NAACL-HLT 2019</conf-name>
          <conf-date>January 2019</conf-date>
          <conf-loc>Minneapolis, MN</conf-loc>
          <fpage>4171</fpage>
          <lpage>4187</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Che</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Pre-Training With Whole Word Masking for Chinese BERT</article-title>
          <source>IEEE/ACM Trans. Audio Speech Lang. Process</source>
          <year>2021</year>
          <volume>29</volume>
          <fpage>3504</fpage>
          <lpage>3514</lpage>
          <pub-id pub-id-type="doi">10.1109/TASLP.2021.3124365</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yin</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Hua</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Conceptualized representation learning for Chinese biomedical text mining</article-title>
          <source>arXiv. Preprint posted online August 25, 2020</source>
          <pub-id pub-id-type="doi">10.48550/arXiv.2008.10813</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>A survey on in-context learning</article-title>
          <source>arXiv. Preprint posted online December 31, 2022</source>
          <pub-id pub-id-type="doi">10.48550/arXiv.2301.00234</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Powers</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Evaluation: from precision, recall and F-measure to ROC, informedness, markedness and correlation</article-title>
          <source>arXiv. Preprint posted online October 11, 2020</source>
          <pub-id pub-id-type="doi">10.48550/arXiv.2010.16061</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
