<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e67311</article-id><article-id pub-id-type="doi">10.2196/67311</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Application of the Bidirectional Encoder Representations from Transformers Model for Predicting the Abbreviated Injury Scale in Patients with Trauma: Algorithm Development and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Tang</surname><given-names>Jun</given-names></name><degrees>MEng</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Li</surname><given-names>Yang</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Luo</surname><given-names>Keyu</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lai</surname><given-names>Jiangyuan</given-names></name><degrees>BM</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yin</surname><given-names>Xiang</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Wu</surname><given-names>Dongdong</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Information, Daping Hospital, Army Medical University</institution><addr-line>No.10 Daping Changjiang Branch Road, Yuzhong District</addr-line><addr-line>Chongqing</addr-line><country>China</country></aff><aff id="aff2"><institution>Department of Emergency Medicine, Medical Center of Trauma and War Injury, Daping Hospital, Army Medical University</institution><addr-line>Chongqing</addr-line><country>China</country></aff><aff id="aff3"><institution>National Key Laboratory of Trauma and Chemical Poisoning</institution><addr-line>Chongqing</addr-line><country>China</country></aff><aff id="aff4"><institution>Department of Orthopedics, Daping Hospital, Army Medical University</institution><addr-line>No.10 Daping Changjiang Branch Road, Yuzhong District, Chongqing City</addr-line><addr-line>Chongqing</addr-line><country>China</country></aff><aff id="aff5"><institution>Department of Traumatic Surgery, School of Basic Medicine, Army Medical University</institution><addr-line>Chongqing</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name 
name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Williamson</surname><given-names>Frances</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>Muding</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Hartka</surname><given-names>Thomas</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Feng</surname><given-names>Wei</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Dongdong Wu, PhD, Department of Information, Daping Hospital, Army Medical University, No.10 Daping Changjiang Branch Road, Yuzhong District, Chongqing, 400042, China, 86 18302302369; <email>604269346@qq.com</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>29</day><month>5</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e67311</elocation-id><history><date date-type="received"><day>08</day><month>10</month><year>2024</year></date><date date-type="rev-recd"><day>23</day><month>03</month><year>2025</year></date><date date-type="accepted"><day>07</day><month>04</month><year>2025</year></date></history><copyright-statement>&#x00A9; Jun Tang, Yang Li, Keyu Luo, Jiangyuan Lai, Xiang Yin, Dongdong Wu. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 29.5.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e67311"/><abstract><sec><title>Background</title><p>Deaths related to physical trauma impose a heavy burden on society, and the Abbreviated Injury Scale (AIS) is an important tool for injury research. AIS covers injuries to various parts of the human body and scores them based on the severity of the injury. In practical applications, the complex AIS coding rules require experts to encode by consulting patient medical records, which inevitably increases the difficulty, time, and cost of evaluation of patient and also puts higher demands on the workload of information collection and processing. 
In some cases, the sheer number of patients or the inability to access detailed medical records necessary for coding further complicates independent AIS codes.</p></sec><sec><title>Objective</title><p>This study aims to use advanced deep learning techniques to predict AIS codes based on easily accessible diagnostic information of patients to improve the accuracy of trauma assessment.</p></sec><sec sec-type="methods"><title>Methods</title><p>We used a dataset of patients with trauma (n=26,810) collected by the Chongqing Daping Hospital between October 2013 and June 2024. We mainly selected the patient&#x2019;s diagnostic information, injury description, cause of injury, injury region, injury types, and present illness history as the key feature inputs. We used a robust optimization Bidirectional Encoder Representations from Transformers (BERT) pretraining method to embed these features and constructed a prediction model based on BERT. This model aims to predict AIS codes and comprehensively evaluate its performance through a 5-fold cross-validation. We compared the BERT model with previous research results and current mainstream machine learning methods to verify its advantages in prediction tasks. In addition, we also conducted external validation of the model using 244 external data points from the Chongqing Emergency Center.</p></sec><sec sec-type="results"><title>Results</title><p>The BERT model proposed in this paper performs significantly better than the comparison model on independent test datasets with an accuracy of 0.8971, which surpassed the previous study by 10 % points. In addition, the area under the curve (AUC value of the BERT model is 0.9970, and the <italic>F</italic><sub>1</sub>-score is 0.8434. In the external dataset, the accuracy, AUC, and <italic>F</italic><sub>1</sub>-score results of the model are 0.7131, 0.8586, and 0.6801, respectively. 
These results indicate that our model has high generalization ability and prediction accuracy.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The BERT model we proposed is mainly based on diagnostic information to predict AIS codes, and its prediction accuracy is superior to previous investigations and current mainstream machine learning methods. It has a high generalization ability in external datasets.</p></sec></abstract><kwd-group><kwd>trauma</kwd><kwd>abbreviated injury scale</kwd><kwd>deep learning</kwd><kwd>diagnostic information</kwd><kwd>transformer model</kwd><kwd>validation study</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>With the frequent occurrence of traffic crashes and the intensification of natural disasters, injuries have become the main cause of morbidity and mortality worldwide. According to the World Health Organization&#x2019;s (WHO) 2022 report [<xref ref-type="bibr" rid="ref1">1</xref>], approximately 4.4 million people die, and tens of millions endure from nonfatal injuries every year due to such incidents.</p><p>The Abbreviated Injury Scale (AIS) [<xref ref-type="bibr" rid="ref2">2</xref>] is the most widely used injury severity coding system, developed and periodically refined by the AIS Committee under the Association for the Advancement of Automotive Medicine (AAAM). AIS serves as the foundation for several severity scoring systems, such as the Injury Severity Score (ISS) [<xref ref-type="bibr" rid="ref3">3</xref>], the Maximum Abbreviated Injury Score [<xref ref-type="bibr" rid="ref4">4</xref>], and the New Injury Severity Score [<xref ref-type="bibr" rid="ref5">5</xref>]. 
Since 2008, the AIS score or ISS score has been used as a criterion for evaluating trauma centers in various countries and has now developed into a globally recognized trauma scoring system.</p><p>However, the AIS coding system is a highly refined and complex scoring system that covers injuries to various parts of the human body and scores them based on the severity of the injury. In practical applications, AIS codes often rely on the subjective judgment and rich clinical experience of medical professionals, which may lead to certain coding differences between different medical institutions or personnel. While advances in trauma care have improved overall outcomes, significant disparities persist across sociodemographic groups. Low-income populations experience 38% longer prehospital delays for penetrating injuries compared with high-income counterparts [<xref ref-type="bibr" rid="ref6">6</xref>], potentially biasing AIS severity assessments due to delayed clinical documentation. This dual challenge of subjective variability and systemic bias further increases the difficulty of accurate AIS code prediction.</p><p>The application of artificial intelligence (AI) models in medicine is increasing and many are based on AIS codes to predict mortality and prognosis outcomes [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. While few studies have used diagnostic-related information to predict AIS codes. Although the neural machine translation (NMT) [<xref ref-type="bibr" rid="ref9">9</xref>] model uses International Classification of Diseases (ICD) codes and other relevant information to predict AIS codes, the accurate acquisition of ICD codes itself necessitates substantial coding effort and also depends on detailed medical records and other clinical information during the patient&#x2019;s diagnostic process. Therefore, this also puts higher demands on the workload of information collection and processing. 
To overcome these shortcomings, we hope to use advanced deep learning (DL) techniques to directly predict AIS codes based on easily accessible diagnostic information, thereby improving the accuracy of trauma assessment for patients.</p><p>Therefore, we aim to use patient with trauma data from Chongqing Daping Hospital from October 2013 to June 2024 to construct a Bidirectional Encoder Representations from Transformers (BERT) [<xref ref-type="bibr" rid="ref10">10</xref>] model based on DL for predicting the AIS codes corresponding to specific trauma. In this model, we use the patient&#x2019;s diagnostic information as the main input feature and compare it with the NMT model from previous research.</p></sec><sec id="s1-2"><title>Related Work</title><p>In recent years, AI technology has been frequently used to discover complex correlations between various features in medical applications [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>], such as individual injuries and mortality [<xref ref-type="bibr" rid="ref14">14</xref>]. Lee et al [<xref ref-type="bibr" rid="ref15">15</xref>] developed an ensemble model based on deep neural networks, incorporating the ICD, triage scale, procedure codes, and other clinical features as inputs to predict in-hospital mortality among patients with physical trauma. This model achieved an area under the curve (AUC) of up to 0.9507, outperforming advanced predictive models such as AdaBoost and XGBoost. Kang et al [<xref ref-type="bibr" rid="ref8">8</xref>] created an AI algorithm grounded in DL models, leveraging the AIS codes to predict in-hospital mortality. By comparing their model with conventional ISS and New Injury Severity Score systems, they demonstrated the superior accuracy and AUC value of their proposed model. 
Tran et al [<xref ref-type="bibr" rid="ref16">16</xref>] used ICD-10 codes and machine learning (ML) algorithms to develop a mortality prediction model via the National Trauma Data Bank. A comparison of its performance with that of logistic regression, ISS, and Trauma Mortality Prediction Model (TMPM-ICD10) validated that their XGBoost&#x2013;based ML model exhibited superior performance. In terms of AIS code prediction, Hartka et al [<xref ref-type="bibr" rid="ref9">9</xref>] proposed the use of an NMT model to convert ICD codes into AIS codes and compared its accuracy in assessing injury severity with that of two established conversion methods: the ICD-AIS map [<xref ref-type="bibr" rid="ref17">17</xref>] and the ICD Programs for Injury Classification in R (ICDPIC-R) package [<xref ref-type="bibr" rid="ref18">18</xref>]. Their results demonstrated that the NMT model achieved the highest accuracy across all injury severity classifications.</p><p>In the past few years, advanced pretrained language representation models such as BERT, Robustly Optimized BERT Pretraining Approach (RoBERTa), and HFL (a Chinese BERT pretraining model) have made remarkable breakthroughs in the field of natural language processing, demonstrating significant performance gains in various tasks such as text classification, sentiment analysis, and question answering [<xref ref-type="bibr" rid="ref19">19</xref>]. However, to our current knowledge, despite the increasingly widespread application of DL techniques in the field of medical information processing, DL methods with pretrained language representation models have not yet been widely used for predicting AIS codes. Although the NMT model has shown some accuracy in predicting AIS codes, the AIS coding system, as a standardized tool for assessing the severity of injuries, occupies an important position in trauma medicine and emergency medicine. 
It can provide objective and comparable injury assessment for clinical doctors based on the specific injury situation of patients, which is of great significance for guiding treatment decisions and evaluating prognosis.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Patients and Dataset</title><p>The Chongqing Daping Hospital Trauma Database contains data about patients&#x2019; diagnostic information, injury description, age, sex, place of injury, cause of injury, external cause code 1 (ECode1), external cause code 2 (ECode2), injury region, injury types, present illness history, and AIS codes, where the AIS codes are based on the AIS2015 version [<xref ref-type="bibr" rid="ref20">20</xref>], provided by professionally trained doctors according to the specific injury situation of the patient. To ensure the accuracy of the coding, the hospital has adopted a dual coding system: one doctor is responsible for preliminary coding, while the other doctor conducts follow-up checks.</p><p>The Daping Hospital Trauma Database contains data from 26,810 patients registered between October 1, 2013 and June 30, 2024 with the exclusion criteria of (1) patients transferred to another hospital, (2) patients who died in the emergency department before admission to the ICU or general ward, (3) data in the Daping Hospital Trauma Database with a feature loss rate &#x2265;30%, (4) samples with any missing AIS codes or diagnostic information, and (5) data with less than 30 AIS code categories. According to the exclusion criteria, 13,216 pieces of data met the requirements. In addition, we also used an external dataset of 244 Chongqing Emergency Centers that met the inclusion and exclusion criteria.</p><p>We divided the Daping Hospital Trauma Database dataset into training data and testing data. 
The training dataset includes data from October 1, 2013 to December 31, 2022, which is used to train our model to learn the mapping relationship from input features to target variables. The test dataset includes data from January 1, 2023 to June 30, 2024. This partitioning dataset method has multiple benefits: the test dataset uses the most recent data, which can better evaluate the model&#x2019;s adaptability. By having the model learn historical data during the training phase and then face different but relevant data during the testing phase, it can encourage the model to learn more generalized features, which helps improve the accuracy of predicting future unknown data.</p><p>The number of training, testing, and external datasets is 10,827, 2389, and 244, respectively, with 337 types of AIS codes included in the training dataset, 332 types in the testing dataset, and 83 in the external dataset. In the training dataset, the number of AIS code 853161.3 is the highest, reaching 475 (accounting for 4.38%), while the number of AIS code 854221.2 is the lowest, at 25 (accounting for 0.23%). In the test dataset, the number of AIS code 853161.3 is also the highest, at 119 (accounting for 4.98%), while the number of AIS code 910200.1 is the lowest, at 6 (accounting for 0.25%). In the external dataset, the number of AIS code 853161.3 is also the highest, reaching 10 (accounting for 4.10%), while the number of AIS code 856151.2 is the lowest, at 1 (accounting for 0.41%). The injuries covered by our dataset are mainly concentrated in areas such as the skin, limbs, and head and neck, especially those types of injuries that are most common in practical work and have a direct impact on clinical decision-making and treatment plans.</p><p>The testing dataset (n=2389) was used only to independently test our developed model and not for training or internal validation. We first performed a 5-fold cross-validation using the training data to prevent overfitting. 
The training dataset (n=10,827) was randomly shuffled and stratified into 5 equal groups with 4 groups used for training and 1 group used for validation. This process was repeated 5 times by shifting the internal validation group. Then, the overall performance of the model was evaluated through independent testing data. Finally, the generalization ability of the model was validated through multicenter validation using external data.</p></sec><sec id="s2-2"><title>BERT Prediction Model Development</title><sec id="s2-2-1"><title>BERT Model Architecture</title><p>As shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>, we developed a DL&#x2013;based BERT model for predicting AIS codes. The model uses the masked language modeling technique of the pretrained model RoBERTa [<xref ref-type="bibr" rid="ref10">10</xref>] to learn contextual information about data of patient with trauma. This approach can capture both forward and backward contextual information of the input sequences to achieve a deeper understanding of input textual data.</p><p>To provide further clarity on the data used in our model, we have included a <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> with examples of model inputs and outputs. These examples illustrate the structure of the data and how our model processes it to generate AIS scores.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>The Bidirectional Encoder Representations from Transformers model architecture. 
AIS: Abbreviated Injury Scale.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e67311_fig01.png"/></fig><sec id="s2-2-1-1"><title>BERT Pretraining</title><p>Our BERT model is a pretraining model based on a modified version of the RoBERTa model, and we similarly carry out in-depth optimizations of the BERT model, including the implementation of dynamically tuned masking strategies and enhanced text encoding processing. The significant advantage of these optimizations is that by randomly performing masking operations on the input data, different masking patterns are used for the same training data in different training epochs, thus effectively increasing data diversity during model training without the need to expand the training dataset. In addition, by adopting a larger batch size, more training data, and longer training time, the BERT model can learn richer linguistic features and potential patterns in the data, which significantly improves the prediction performance of the model.</p><p>Specifically, the vocabulary list used for training includes the patient&#x2019;s diagnostic information, injury description, age, sex, place of injury, cause of injury, procedure codes (ECode1 and ECode2), injury region, injury types, and present illness history, as well as the necessary special tokens (eg,&#x003C;CLS_TOKEN&#x003E;,&#x003C;SEP_TOKEN&#x003E;,&#x003C;PAD_TOKEN&#x003E;, and &#x003C;MASK_TOKEN&#x003E;). The entire text sequence is treated as a sentence with the sequence being identified by the start token &#x003C;CLS_TOKEN&#x003E; and the end token &#x003C;SEP_TOKEN&#x003E;;&#x003C;PAD_TOKEN&#x003E; is used to pad the sequence to a uniform length, and &#x003C;MASK_TOKEN&#x003E; is used for masking. We chose the masked language modeling method for training to accommodate the need for contextual understanding in AIS code prediction. 
During training, we randomly perturbed 15% of the trauma diagnostic information elements in the input sequences, similar to the RoBERTa setup, where 80% of the tokens of these 15% selected elements were replaced with &#x003C;MASK_TOKEN&#x003E;, requiring the model to predict the correct AIS codes at these &#x003C;MASK_TOKEN&#x003E; locations during training, 10% of the tokens were replaced with other trauma diagnostic information randomly selected to increase the difficulty of training and generalization, and the remaining 10% remained unchanged and served as positive samples for model learning. During pretraining, the model predicts what kind of residue it is in the masked position. For each batch, the loss is defined as:</p><disp-formula id="E2"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>L</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>c</mml:mi><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:munder><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>q</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>c</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:munder><mml:munder><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:munder><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>g</mml:mi><mml:mspace width="mediummathspace"/><mml:mrow><mml:mover><mml:mi>p</mml:mi><mml:mo 
stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <inline-formula><mml:math id="ieqn1"><mml:mover accent="true"><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mo>\</mml:mo><mml:mi mathvariant="normal">m</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">k</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:math></inline-formula> represents the probability that the model predicts the element <inline-formula><mml:math id="ieqn2"><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> at the <inline-formula><mml:math id="ieqn3"><mml:mi>i</mml:mi></mml:math></inline-formula>th masked position, given all the sequence information except for the masked positions.</p></sec><sec id="s2-2-1-2"><title>BERT Fine-Tuning</title><p>We consider the task of AIS code prediction as a multivariate labeling classification task, where the BERT model is used to predict the AIS codes for each instance in the input sequence. To accomplish this, we map AIS code categories to unique integer labels, which are used as supervised learning objectives. We add a multivariate classification header on top of the pretrained model, whose output dimension matches the number of AIS codes categories. During training, the BERT model receives input sequences and extracts contextually relevant feature representations. 
These features are then passed to the multivariate classification header to generate predicted probabilities corresponding to each AIS code category. We use a cross-entropy loss function to compute the difference between the predicted probability distribution and the true AIS code label. The loss during fine-tuning is defined as:</p><disp-formula id="E3"><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>L</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:mfrac><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>g</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <inline-formula><mml:math id="ieqn4"><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the true category index of the <inline-formula><mml:math id="ieqn5"><mml:mi>i</mml:mi></mml:math></inline-formula>th sample, and <inline-formula><mml:math id="ieqn6"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub></mml:math></inline-formula> is the probability that the <inline-formula><mml:math 
id="ieqn7"><mml:mi>i</mml:mi></mml:math></inline-formula>th sample predicted by the model belongs to category <inline-formula><mml:math id="ieqn8"><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>.</p></sec></sec></sec><sec id="s2-3"><title>BERT Model Configuration</title><p>After a careful hyperparameter search, we determined the optimal model configuration: an 8-layer BERT architecture including an input layer, six 384-unit hidden layers, and an output layer, which together form the encoder-decoder transformer components with 5 transformer blocks. In the process of determining the 8-layer architecture, we conducted subsequent experiments and tried different configurations of BERT models with 4, 8, and 12 layers and comprehensively evaluated their performance. The results showed that the 8-layer architecture demonstrated excellent performance on multiple evaluation metrics, so we ultimately chose this architecture for our research. The model is also configured with 12 attention heads and an embedding size of 128. During optimization, we used the Adam optimizer to adjust the model weights with a learning rate of 0.0001, a batch size of 64, and the Gaussian Error Linear Unit as the activation function. To further reduce the number of parameters and computational costs, we implemented low-rank factorization techniques in the embedding layer.</p><p>The BERT model was trained via early stopping with training concluding after the validation loss did not improve after 10 epochs. The model was trained on a computing cluster with 48 GB of memory and one Graphics Processing Unit (NVIDIA RTX A6000). We implemented the model through the Pytorch framework.</p></sec><sec id="s2-4"><title>Evaluation Methods</title><p>We use accuracy, AUC, and <italic>F</italic><sub>1</sub>-scores as evaluation metrics. 
Because our prediction task is a multiclass problem and all categories of AIS codes are considered equally important, we chose the macro averaging method when calculating AUC and <italic>F</italic><sub>1</sub>-scores to ensure that the model&#x2019;s performance across all AIS code categories is comprehensively reflected.</p></sec><sec id="s2-5"><title>Comparison Method</title><p>To validate the performance of our BERT model, we selected a series of representative methods for comparison. Specifically, we used previous research (NMT) and current mainstream ML methods, including K-nearest neighbors (KNN), multilayer perceptron (MLP), XGBoost, AdaBoost, and decision tree (DT). The following is a detailed introduction to these methods:</p><sec id="s2-5-1"><title>Neural Machine Translation</title><p>The NMT model proposed by Hartka et al [<xref ref-type="bibr" rid="ref9">9</xref>] is a DL technique commonly used for human language translation. The model is implemented using OpenNMT, an open-source toolkit developed by the Harvard NLP team and SYSTRAN for NMT, to convert ICD codes into exact AIS codes. This paper shares the same goals and tasks as their work. However, accurately obtaining ICD codes not only requires a lot of coding work but also relies on detailed medical records and other clinical information during the patient&#x2019;s diagnostic process. In contrast, the BERT model mainly relies on easily accessible diagnostic information to predict specific AIS codes. The NMT model and the BERT model both adopt a similar Transformer architecture. 
In their experimental configuration, the NMT model includes 6 hidden layers with 512 units, 8 attention heads, a dropout rate of 0.1, weights adjusted by the Adam optimizer, learning rate decay determined by Noam decay, and classification cross entropy used as the training loss function.</p></sec><sec id="s2-5-2"><title>Machine Learning</title><p>For ML methods, we use Word2Vec word embedding technology to convert text data into a format suitable for ML algorithm processing.</p><sec id="s2-5-2-1"><title>K-Nearest Neighbor</title><p>KNN is a simple but effective classification algorithm. It is based on distance metrics such as Euclidean distance to find the k samples in the training set that are most similar to the test samples and predicts the category of the test samples based on the categories of these neighbors. In our experiment, the value of k was set to 3.</p></sec><sec id="s2-5-2-2"><title>Multilayer Perceptron</title><p>MLP is a feedforward neural network consisting of an input layer, a hidden layer, and an output layer. It approximates complex functional relationships through multilayer nonlinear transformations. In our experiment, MLP used 2 hidden layers with the first layer having 20 neurons and the second layer having 50 neurons.</p></sec><sec id="s2-5-2-3"><title>XGBoost</title><p>XGBoost is an ensemble learning method based on gradient boosting, which constructs strong classifiers by combining multiple weak classifiers. XGBoost has achieved significant performance improvements in multiple fields, especially when dealing with large-scale datasets and high-dimensional features. In our experiment, we used multiclass log loss as an evaluation metric, which is a commonly used choice in multiclass classification problems.</p></sec><sec id="s2-5-2-4"><title>AdaBoost</title><p>AdaBoost is an adaptive boosting algorithm that constructs strong classifiers by adjusting the weights of each weak classifier. 
AdaBoost has demonstrated strong performance in handling classification tasks, especially when dealing with imbalanced datasets. In our experiment, a DT stump (with a depth of 1) was used as the weak classifier.</p></sec><sec id="s2-5-2-5"><title>Decision Tree</title><p>DT is an intuitive classification and regression method. It generates decision paths through a series of conditional judgments, thereby achieving classification or regression prediction of samples. In our experiment, the default Gini impurity was used as a measure of splitting quality, and the model was constructed by recursively segmenting the feature space.</p></sec></sec></sec><sec id="s2-6"><title>Ethical Considerations</title><p>This study is an observational study; the data used were reviewed and approved by the Internal Review Board of Chongqing Daping Hospital (approval number: 2024_219), and informed consent from patients was exempted.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>BERT Model Training</title><p>The Daping Hospital dataset used in this study contains a total of 13,216 records, including 10,827 in the training dataset and 2389 in the independent testing dataset. In addition, we obtained 244 external data from the Chongqing Emergency Center. <xref ref-type="table" rid="table1">Table 1</xref> provides a detailed list of the demographic characteristics and injury status of these datasets. In both the training and testing datasets, the IQR of age is 33 years. In terms of gender distribution, males accounted for 62.5%, 59.3%, and 63.9% of the training, testing, and external datasets, respectively. In terms of injury causes, the most common were falls (accounting for 57.2%, 57.9%, and 41.0% in the 3 datasets) and traffic crashes (accounting for 14.4%, 16.6%, and 43.9%, respectively). 
To reflect the distribution of severity data for single injuries, we presented the severity distribution of individual injuries based on the data after the decimal point of the AIS code. Our dataset mainly includes data for mild, moderate, and severe injuries. It is worth noting that in all 3 datasets, moderate injuries account for the highest proportion, at 45%, 51.5%, and 49.6%, respectively.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Demographic and injury characteristics of patients with trauma in the datasets (N=13460).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variables</td><td align="left" valign="bottom">Training dataset<break/>(2013/10&#x2010;2022/12)</td><td align="left" valign="bottom">Testing dataset<break/>(2023/01&#x2010;2024/06)</td><td align="left" valign="bottom">External dataset<break/>(2023/09&#x2010;2023/10)</td></tr></thead><tbody><tr><td align="left" valign="top">Total number of patients</td><td align="left" valign="top">10,827</td><td align="left" valign="top">2389</td><td align="left" valign="top">244</td></tr><tr><td align="left" valign="top">Age range (years), IQR</td><td align="left" valign="top">1&#x2010;102 (33)</td><td align="left" valign="top">3&#x2010;98 (33)</td><td align="left" valign="top">14&#x2010;97 (31)</td></tr><tr><td align="left" valign="top">Males (%)</td><td align="left" valign="top">6771 (62.5)</td><td align="left" valign="top">1419 (59.3)</td><td align="left" valign="top">156 (63.9)</td></tr><tr><td align="left" valign="top"><bold>Mechanism of injury, n (%)</bold></td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Traffic crash</td><td align="left" valign="top">1557 (14.4)</td><td align="left" valign="top">397 
(16.6)</td><td align="left" valign="top">107 (43.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Falls</td><td align="left" valign="top">6189 (57.2)</td><td align="left" valign="top">1382 (57.9)</td><td align="left" valign="top">100 (41.0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Blunt</td><td align="left" valign="top">123 (1.1)</td><td align="left" valign="top">80 (3.3)</td><td align="left" valign="top">22 (9.0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sports injury</td><td align="left" valign="top">329 (3.0)</td><td align="left" valign="top">140 (5.9)</td><td align="left" valign="top">10 (4.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other</td><td align="left" valign="top">2629 (24.3)</td><td align="left" valign="top">390 (16.3)</td><td align="left" valign="top">5 (2.0)</td></tr><tr><td align="left" valign="top"><bold>Severity of injury, n (%)</bold></td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mild injury</td><td align="left" valign="top">2468 (22.8)</td><td align="left" valign="top">392 (16.4)</td><td align="left" valign="top">40 (16.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Moderate injury</td><td align="left" valign="top">4870 (45.0)</td><td align="left" valign="top">1231 (51.5)</td><td align="left" valign="top">121 (49.6)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Severe injury</td><td align="left" valign="top">3489 (32.2)</td><td align="left" valign="top">766 (32.1)</td><td align="left" valign="top">83 (34.0)</td></tr></tbody></table></table-wrap></sec><sec id="s3-2"><title>Overall Predictive Performance of the BERT Model</title><p>We first compared the performance of the BERT model with the NMT model and several advanced ML models, including KNN, MLP, XGBoost, AdaBoost, and DT. Comparison results on our independent test dataset are shown in <xref ref-type="table" rid="table2">Table 2</xref>. We used accuracy, AUC, and <italic>F</italic><sub>1</sub>-scores as evaluation metrics. It is worth noting that since the NMT model only provides prediction results for AIS code accuracy, we only present the accuracy of NMT in <xref ref-type="table" rid="table2">Table 2</xref>. For other metrics not provided by the NMT model, we uniformly use &#x201C;NA&#x201D; (not available) for annotation.</p><p>The performance of our proposed BERT model is significantly better than all comparison models across all indicators. Specifically, the accuracy of the BERT model is as high as 0.8971, while the accuracy of the NMT model is only 0.7380, a difference of over 10 percentage points between the two. In addition, the AUC value of the BERT model is 0.9970, and the <italic>F</italic><sub>1</sub>-score is 0.8434. Among all the ML methods compared, the DT method achieved excellent performance in accuracy and AUC of 0.8506 and 0.9945, respectively, and the XGBoost method achieved the best results in the <italic>F</italic><sub>1</sub> index at 0.7586, but they still failed to surpass the performance of our BERT model. 
These comparative experiments fully demonstrate that our BERT model has high prediction accuracy.</p><p><xref ref-type="fig" rid="figure2">Figure 2</xref> shows the training curve of the BERT model with the (<bold>A</bold>) and (<bold>B</bold>) graphs displaying their results on the training and testing datasets, respectively. The x-axis represents epochs and is set to 50, whereas the y-axis represents the values of accuracy, AUC, and <italic>F</italic><sub>1</sub>-score. The figure shows that the BERT model tends to be stable in all the metrics when the training or test dataset reaches epoch 26. The test dataset shows that the model performs well in terms of overall prediction accuracy and accurately classifies samples into the correct categories. For both the training and testing datasets, the AUC is close to 1, and high AUC values further demonstrate the model&#x2019;s strong ability to distinguish between positive and negative samples, maintaining excellent performance at almost all possible classification thresholds.</p><p>To demonstrate the performance advantage of our fine-tuned BERT model, we compared it with the base version of BERT (BERT-base) and the pretrained model HFL/chinese-roberta-wwm-ext [<xref ref-type="bibr" rid="ref21">21</xref>]. The comparison results are shown in <xref ref-type="table" rid="table3">Table 3</xref>. Compared to the BERT-base and HFL models, the BERT model exhibits higher performance in accuracy, reaching 0.8971, while also significantly leading in the <italic>F</italic><sub>1</sub>-score, at 0.8434. The results of these two indicators are 2.99% and 7.54% higher than the HFL model, respectively, indicating that our BERT model achieves a better balance between precision and recall in classification tasks. 
Although the BERT model is not significantly different from HFL and BERT-base on AUC, its advantages in key evaluation metrics highlight BERT&#x2019;s superior overall performance in handling specific tasks in this paper.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Prediction results of the Bidirectional Encoder Representations from Transformers model and comparative model in the test dataset.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-scores</td></tr></thead><tbody><tr><td align="left" valign="top">NMT<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">0.7380</td><td align="left" valign="top">NA<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">NA</td></tr><tr><td align="left" valign="top">KNN<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">0.7935</td><td align="left" valign="top">0.9414</td><td align="left" valign="top">0.6879</td></tr><tr><td align="left" valign="top">MLP<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">0.8064</td><td align="left" valign="top">0.9886</td><td align="left" valign="top">0.6194</td></tr><tr><td align="left" valign="top">XGBoost<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td><td align="left" valign="top">0.8374</td><td align="left" valign="top">0.9937</td><td align="left" valign="top">0.7586</td></tr><tr><td align="left" valign="top">AdaBoost<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="left" valign="top">0.8506</td><td align="left" valign="top">0.9860</td><td align="left" 
valign="top">0.7050</td></tr><tr><td align="left" valign="top">DT<sup><xref ref-type="table-fn" rid="table2fn8">h</xref></sup></td><td align="left" valign="top">0.8506</td><td align="left" valign="top">0.9945</td><td align="left" valign="top">0.7049</td></tr><tr><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table2fn9">i</xref></sup></td><td align="left" valign="top">0.8971</td><td align="left" valign="top">0.9970</td><td align="left" valign="top">0.8434</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn><fn id="table2fn2"><p><sup>b</sup>NMT: neural machine translation.</p></fn><fn id="table2fn3"><p><sup>c</sup>NA: not available.</p></fn><fn id="table2fn4"><p><sup>d</sup>KNN: K-nearest neighbor.</p></fn><fn id="table2fn5"><p><sup>e</sup>MLP: multilayer perceptron.</p></fn><fn id="table2fn6"><p><sup>f</sup>XGBoost: Extreme Gradient Boosting.</p></fn><fn id="table2fn7"><p><sup>g</sup>AdaBoost: adaptive boosting.</p></fn><fn id="table2fn8"><p><sup>h</sup>DT: decision tree.</p></fn><fn id="table2fn9"><p><sup>i</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>The visualization curve of Bidirectional Encoder Representations from Transformers model&#x2019;s predictive performance on training and testing datasets. 
AUC: area under the receiver operating characteristic curve.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e67311_fig02.png"/></fig><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Comparison between our Bidirectional Encoder Representations from Transformers model and other pretrained models in the test dataset.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-scores</td></tr></thead><tbody><tr><td align="left" valign="top">BERT-base<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">0.8559</td><td align="left" valign="top">0.9971</td><td align="left" valign="top">0.7284</td></tr><tr><td align="left" valign="top">HFL<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">0.8672</td><td align="left" valign="top">0.9973</td><td align="left" valign="top">0.7680</td></tr><tr><td align="left" valign="top">BERT</td><td align="left" valign="top">0.8971</td><td align="left" valign="top">0.9970</td><td align="left" valign="top">0.8434</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn><fn id="table3fn2"><p><sup>b</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table3fn3"><p><sup>c</sup>HFL: a Chinese BERT pretraining model.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Ablation Study</title><p>To verify the maximum contribution of specific input data feature combinations to the model, we designed a series of ablation studies. 
The input features of the original experiment include patients&#x2019; diagnostic information, age, sex, injury description, place of injury, cause of injury, external cause codes (ECode1 and ECode2), injury region, injury types, and present illness history.</p><p>First, we conducted a single-factor ablation study to observe the impact of removing each feature item one by one on the performance of the model. The experimental results show that diagnostic information and injury description are the most significant data types that affect the performance of the model. Based on the findings of the single-factor ablation study, we further designed the following multifactor ablation study to explore the importance of diagnostic information and injury description in feature combination. The specific results are shown in <xref ref-type="table" rid="table4">Table 4</xref>.</p><p>As shown in <xref ref-type="table" rid="table4">Table 4</xref>, after removing diagnostic information, the accuracy and <italic>F</italic><sub>1</sub>-score of the model significantly decreased. Although the AUC remained high, it also decreased, indicating that diagnostic information features have a significant impact on the performance of the model. After removing the injury description, the performance of the model also decreased, but the decrease was smaller compared to removing diagnostic information. 
The ablation study demonstrated that diagnostic information contributed the most to the model, followed by the injury description, and the feature combination we used achieved the best results.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Results of ablation study in the test dataset.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-scores</td></tr></thead><tbody><tr><td align="left" valign="top">Diagnostic information removed<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">0.6033</td><td align="left" valign="top">0.9523</td><td align="left" valign="top">0.4668</td></tr><tr><td align="left" valign="top">Injury description removed<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">0.8888</td><td align="left" valign="top">0.9875</td><td align="left" valign="top">0.8047</td></tr><tr><td align="left" valign="top">Diagnostic information and injury description removed<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" valign="top">0.6014</td><td align="left" valign="top">0.9438</td><td align="left" valign="top">0.4699</td></tr><tr><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup></td><td align="left" valign="top">0.8971</td><td align="left" valign="top">0.9970</td><td align="left" valign="top">0.8434</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn><fn id="table4fn2"><p><sup>b</sup>Diagnostic information removed: Remove diagnostic information based on all basic features.</p></fn><fn id="table4fn3"><p><sup>c</sup>Injury description removed: 
Remove the injury description based on all basic features.</p></fn><fn id="table4fn4"><p><sup>d</sup>Diagnostic information and injury description removed (simultaneously remove diagnostic information and injury description): Based on all basic features, simultaneously remove diagnostic information and injury description.</p></fn><fn id="table4fn5"><p><sup>e</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>External Validation</title><p>External validation of the constructed BERT model was conducted using data from 244 patients with trauma at Chongqing Emergency Center. This external dataset has a similar data structure to the training dataset, containing a total of 83 AIS code categories. The experimental results of the external dataset are shown in <xref ref-type="table" rid="table5">Table 5</xref> with accuracy, AUC, and <italic>F</italic><sub>1</sub> of 0.7131, 0.8586, and 0.6801, respectively. Compared with the test dataset, the performance of the BERT model slightly decreases on external datasets, which may be due to differences in data distribution between different medical institutions. 
However, the overall performance is still satisfactory, indicating that the BERT model has strong generalization ability.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Validation results of Bidirectional Encoder Representations from Transformers model on external datasets.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-scores</td></tr></thead><tbody><tr><td align="left" valign="top">Test dataset</td><td align="left" valign="top">0.8971</td><td align="left" valign="top">0.9970</td><td align="left" valign="top">0.8434</td></tr><tr><td align="left" valign="top">External dataset</td><td align="left" valign="top">0.7131</td><td align="left" valign="top">0.8586</td><td align="left" valign="top">0.6801</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In this study, we successfully constructed a BERT-based DL model using data of patients with trauma from Chongqing Daping Hospital to predict AIS codes, achieving an accuracy of 89.71%. This model, leveraging patient diagnostic information as primary input features, demonstrated superior performance compared to existing advanced AIS prediction models, including the previously studied NMT framework [<xref ref-type="bibr" rid="ref9">9</xref>]. 
Additionally, we validated the model&#x2019;s high generalization ability using data from an external center, thereby fulfilling our objective of enhancing trauma assessment through DL.</p><p>Key innovations such as dynamic masking strategies, low-rank embedding decomposition, and bidirectional contextual modeling enabled the model to capture nuanced clinical semantics while maintaining computational efficiency. Notably, the model directly outputs complete AIS codes&#x2014;a critical advancement over rule-based methods like the ICD-AIS map [<xref ref-type="bibr" rid="ref17">17</xref>] and the ICDPIC-R package [<xref ref-type="bibr" rid="ref18">18</xref>] tools, which lack granular code prediction capabilities. These results underscore the potential of transformer-based architectures to enhance trauma assessment workflows, particularly in scenarios requiring rapid, large-scale injury coding.</p><p>Our findings align with emerging evidence supporting transformer models in clinical text processing [<xref ref-type="bibr" rid="ref10">10</xref>], yet extend prior work by addressing the unique challenges of AIS coding. Unlike NMT models that process sequential tokens independently via recurrent mechanisms [<xref ref-type="bibr" rid="ref9">9</xref>], the BERT model&#x2019;s bidirectional attention dynamically links contextual elements of injury descriptions. In addition, the pretrained biomedical embeddings provided higher precision in rare injury terminology recognition compared to NMT&#x2019;s task-specific training.</p><p>The model&#x2019;s external validation performance further reinforces its clinical utility. While annual variations in trauma patterns typically degrade conventional models, our temporal split testing revealed stable predictive accuracy. This robustness suggests the framework could adapt to shifting trauma trends without frequent retraining. 
Moreover, direct AIS code generation eliminates the multistep mapping required by ICD-based tools [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>], reducing error propagation risks in mass casualty scenarios where rapid triage coding is critical.</p><p>These findings suggest that BERT may become a powerful tool for injury research. Although independent coding of AIS injuries by trained medical professionals and comprehensive medical data remains the gold standard in this field, in some cases, such as when the number of patients is large or detailed medical records are difficult to obtain, independent coding becomes impractical. At this point, given input features that satisfy the model, our BERT model can automatically provide prediction results for AIS codes, providing highly accurate AIS code predictions for individual patients.</p></sec><sec id="s4-2"><title>Limitations</title><p>Our research has several limitations. First, during the data collection and processing phase, a large amount of data was excluded due to the lack of information on injury description and present illness history, which had a significant impact on the integrity and representativeness of the final dataset. Second, while low-rank decomposition improved efficiency, BERT&#x2019;s inherent sequence length restrictions (&#x2264;512 tokens) may truncate complex trauma descriptions. Third, the Chinese-language training data raises questions about cross-lingual applicability, given known variations in medical terminologies across languages [<xref ref-type="bibr" rid="ref22">22</xref>]. 
Finally, as with most DL systems, the model&#x2019;s black-box nature limits clinical interpretability.</p><p>Future studies should address these gaps by (1) integrating multimodal data (eg, imaging reports) to compensate for text incompleteness, (2) benchmarking against large language models with superior few-shot learning capacities, and (3) developing hybrid systems that combine BERT&#x2019;s predictive power with large language model&#x2013;driven explainability features.</p></sec><sec id="s4-3"><title>Conclusions</title><p>The BERT model we propose is mainly based on diagnostic information to predict AIS codes, and its prediction accuracy is superior to existing methods. These findings highlight the potential of advanced AI techniques to enhance clinical decision-making processes and improve the efficiency and accuracy of AIS code prediction.</p><p>By automating a task that traditionally requires hours of expert review per case, our framework could democratize high-quality trauma registries in resource-limited settings. Crucially, the model does not seek to replace human coders but provides a scalable adjunct for high-volume scenarios&#x2014;a balance increasingly advocated in AI-augmented health care [<xref ref-type="bibr" rid="ref23">23</xref>]. As trauma systems worldwide adopt electronic health records, such tools may transform retrospective coding into a prospective clinical decision aid, ultimately bridging the gap between injury documentation and precision trauma care. 
Future iterations incorporating multi-institutional data and explainability interfaces could further establish BERT-derived models as indispensable tools in computational trauma epidemiology.</p></sec></sec></body><back><ack><p>This work was supported by the National Natural Science Foundation of China (72374206), the Artificial Intelligence Medical Research Project of Daping Hospital in Chongqing, China (ZXAIYB015), the Basic Research Project of the Army Characteristic Medical Center (ZXJCYJ202414), and the Chongqing Science and Health Joint Medical Research Project (2024MSXM084).</p></ack><notes><sec><title>Data Availability</title><p>The dataset used in this study is not publicly available but is available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="other"><label>Author Note</label><p>The algorithm code is currently not publicly available. Researchers can apply for academic permission to use the model code by contacting the corresponding author via email.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AAAM</term><def><p>Association for the Advancement of Automotive Medicine</p></def></def-item><def-item><term id="abb2">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb3">AIS</term><def><p>Abbreviated Injury Scale</p></def></def-item><def-item><term id="abb4">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb5">BERT</term><def><p>Bidirectional Encoder Representations from Transformers</p></def></def-item><def-item><term id="abb6">DL</term><def><p>deep learning</p></def></def-item><def-item><term id="abb7">DT</term><def><p>decision tree</p></def></def-item><def-item><term id="abb8">ECode</term><def><p>external cause code</p></def></def-item><def-item><term id="abb9">ICD</term><def><p>International Classification of 
Diseases</p></def></def-item><def-item><term id="abb10">ISS</term><def><p>Injury Severity Score</p></def></def-item><def-item><term id="abb11">KNN</term><def><p>K-nearest neighbor</p></def></def-item><def-item><term id="abb12">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb13">MLP</term><def><p>multilayer perceptron</p></def></def-item><def-item><term id="abb14">NMT</term><def><p>neural machine translation</p></def></def-item><def-item><term id="abb15">RoBERTa</term><def><p>Robustly Optimized BERT Pretraining Approach</p></def></def-item><def-item><term id="abb16">WHO</term><def><p>World Health Organization</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>World Health Organization</collab></person-group><article-title>Global Status Report on Violence Prevention 2022</article-title><source>World Health Organization</source><year>2022</year><comment><ext-link ext-link-type="uri" xlink:href="https://www.who.int/news-room/fact-sheets/detail/injuries-and-violence">https://www.who.int/news-room/fact-sheets/detail/injuries-and-violence</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><article-title>Rating the severity of tissue damage: I. 
The abbreviated scale</article-title><source>JAMA</source><year>1971</year><month>01</month><day>11</day><volume>215</volume><issue>2</issue><fpage>277</fpage><lpage>280</lpage><pub-id pub-id-type="doi">10.1001/jama.1971.03180150059012</pub-id><pub-id pub-id-type="medline">5107365</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Baker</surname><given-names>SP</given-names> </name><name name-style="western"><surname>O&#x2019;Neill</surname><given-names>B</given-names> </name><name name-style="western"><surname>Haddon</surname><given-names>W</given-names>  <suffix>Jr</suffix></name><name name-style="western"><surname>Long</surname><given-names>WB</given-names> </name></person-group><article-title>The injury severity score: a method for describing patients with multiple injuries and evaluating emergency care</article-title><source>J Trauma</source><year>1974</year><month>03</month><volume>14</volume><issue>3</issue><fpage>187</fpage><lpage>196</lpage><pub-id pub-id-type="medline">4814394</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Baker</surname><given-names>SP</given-names> </name><name name-style="western"><surname>O&#x2019;Neill</surname><given-names>B</given-names> </name></person-group><article-title>The injury severity score: an update</article-title><source>J Trauma</source><year>1976</year><month>11</month><volume>16</volume><issue>11</issue><fpage>882</fpage><lpage>885</lpage><pub-id pub-id-type="doi">10.1097/00005373-197611000-00006</pub-id><pub-id pub-id-type="medline">994270</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kuo</surname><given-names>SCH</given-names> </name><name 
name-style="western"><surname>Kuo</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>YC</given-names> </name><etal/></person-group><article-title>Comparison of the new exponential injury severity score with the Injury Severity Score and the New Injury Severity Score in trauma patients: a cross-sectional study</article-title><source>PLoS ONE</source><year>2017</year><volume>12</volume><issue>11</issue><fpage>e0187871</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0187871</pub-id><pub-id pub-id-type="medline">29121653</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haider</surname><given-names>AH</given-names> </name><name name-style="western"><surname>Weygandt</surname><given-names>PL</given-names> </name><name name-style="western"><surname>Bentley</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>Disparities in trauma care and outcomes in the United States: a systematic review and meta-analysis</article-title><source>J Trauma Acute Care Surg</source><year>2013</year><month>05</month><volume>74</volume><issue>5</issue><fpage>1195</fpage><lpage>1205</lpage><pub-id pub-id-type="doi">10.1097/TA.0b013e31828c331d</pub-id><pub-id pub-id-type="medline">23609267</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>HT</given-names> </name><name name-style="western"><surname>Siddiqui</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Rhind</surname><given-names>SG</given-names> </name><etal/></person-group><article-title>Artificial intelligence and machine learning for hemorrhagic trauma care</article-title><source>Mil Med 
Res</source><year>2023</year><month>02</month><day>16</day><volume>10</volume><issue>1</issue><fpage>6</fpage><pub-id pub-id-type="doi">10.1186/s40779-023-00444-0</pub-id><pub-id pub-id-type="medline">36793066</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kang</surname><given-names>WS</given-names> </name><name name-style="western"><surname>Chung</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ko</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Artificial intelligence to predict in-hospital mortality using novel anatomical injury score</article-title><source>Sci Rep</source><year>2021</year><month>12</month><day>7</day><volume>11</volume><issue>1</issue><fpage>23534</fpage><pub-id pub-id-type="doi">10.1038/s41598-021-03024-1</pub-id><pub-id pub-id-type="medline">34876644</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hartka</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chernyavskiy</surname><given-names>P</given-names> </name><name name-style="western"><surname>Glass</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Evaluation of neural machine translation for conversion of international classification of disease codes to the Abbreviated Injury Scale</article-title><source>Accid Anal Prev</source><year>2023</year><month>10</month><volume>191</volume><fpage>107183</fpage><pub-id pub-id-type="doi">10.1016/j.aap.2023.107183</pub-id><pub-id pub-id-type="medline">37418869</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> 
</name><name name-style="western"><surname>Ott</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>N</given-names> </name><etal/></person-group><article-title>RoBERTa: a robustly optimized BERT pretraining approach</article-title><source>Arxiv</source><comment>Preprint posted online on  Jul 26, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1907.11692</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Korndorffer</surname><given-names>JR</given-names>  <suffix>Jr</suffix></name><name name-style="western"><surname>Hawn</surname><given-names>MT</given-names> </name><name name-style="western"><surname>Spain</surname><given-names>DA</given-names> </name><etal/></person-group><article-title>Situating artificial intelligence in surgery: a focus on disease severity</article-title><source>Ann Surg</source><year>2020</year><month>09</month><day>1</day><volume>272</volume><issue>3</issue><fpage>523</fpage><lpage>528</lpage><pub-id pub-id-type="doi">10.1097/SLA.0000000000004207</pub-id><pub-id pub-id-type="medline">33759839</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Swinckels</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bennis</surname><given-names>FC</given-names> </name><name name-style="western"><surname>Ziesemer</surname><given-names>KA</given-names> </name><etal/></person-group><article-title>The use of deep learning and machine learning on longitudinal electronic health records for the early detection and prevention of diseases: scoping review</article-title><source>J Med Internet Res</source><year>2024</year><month>08</month><day>20</day><volume>26</volume><fpage>e48320</fpage><pub-id 
pub-id-type="doi">10.2196/48320</pub-id><pub-id pub-id-type="medline">39163096</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lammers</surname><given-names>D</given-names> </name><name name-style="western"><surname>Marenco</surname><given-names>C</given-names> </name><name name-style="western"><surname>Morte</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Machine learning for military trauma: novel massive transfusion predictive models in combat zones</article-title><source>J Surg Res</source><year>2022</year><month>02</month><volume>270</volume><fpage>369</fpage><lpage>375</lpage><pub-id pub-id-type="doi">10.1016/j.jss.2021.09.017</pub-id><pub-id pub-id-type="medline">34736129</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Development and validation of an explainable deep learning model to predict in-hospital mortality for patients with acute myocardial infarction: algorithm development and validation study</article-title><source>J Med Internet Res</source><year>2024</year><month>05</month><day>10</day><volume>26</volume><fpage>e49848</fpage><pub-id pub-id-type="doi">10.2196/49848</pub-id><pub-id pub-id-type="medline">38728685</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Kang</surname><given-names>WS</given-names> </name><name name-style="western"><surname>Seo</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Model for predicting in-hospital mortality of physical trauma patients using artificial intelligence techniques: nationwide population-based study in Korea</article-title><source>J Med Internet Res</source><year>2022</year><month>12</month><day>13</day><volume>24</volume><issue>12</issue><fpage>e43757</fpage><pub-id pub-id-type="doi">10.2196/43757</pub-id><pub-id pub-id-type="medline">36512392</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tran</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Verma</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The derivation of an International classification of diseases, tenth revision&#x2013;based trauma-related mortality model using machine learning</article-title><source>J Trauma Acute Care Surg</source><year>2022</year><month>03</month><day>1</day><volume>92</volume><issue>3</issue><fpage>561</fpage><lpage>566</lpage><pub-id pub-id-type="doi">10.1097/TA.0000000000003416</pub-id><pub-id pub-id-type="medline">34554135</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Loftis</surname><given-names>KL</given-names> </name><name name-style="western"><surname>Price</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Gillich</surname><given-names>PJ</given-names> </name><etal/></person-group><article-title>Development of an expert based ICD-9-CM and ICD-10-CM map to AIS 2005 update 
2008</article-title><source>Traffic Inj Prev</source><year>2016</year><month>09</month><volume>17 Suppl 1</volume><fpage>1</fpage><lpage>5</lpage><pub-id pub-id-type="doi">10.1080/15389588.2016.1191069</pub-id><pub-id pub-id-type="medline">27586094</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clark</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Black</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Skavdahl</surname><given-names>DH</given-names> </name><name name-style="western"><surname>Hallagan</surname><given-names>LD</given-names> </name></person-group><article-title>Open-access programs for injury categorization using ICD-9 or ICD-10</article-title><source>Inj Epidemiol</source><year>2018</year><month>04</month><day>9</day><volume>5</volume><issue>1</issue><fpage>11</fpage><pub-id pub-id-type="doi">10.1186/s40621-018-0149-8</pub-id><pub-id pub-id-type="medline">29629480</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Narasimhan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Salimans</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Improving language understanding with unsupervised learning</article-title><source>OpenAI</source><year>2018</year><month>06</month><day>11</day><access-date>2025-05-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/blog/language-unsupervised">https://openai.com/blog/language-unsupervised</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jihong</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xing</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Analysis of the main revisions of the 2015 version of the simplified injury score</article-title><source>Chin J Traumatol</source><year>2022</year><volume>38</volume><issue>8</issue><fpage>747</fpage><lpage>749</lpage><pub-id pub-id-type="doi">10.3760/cma.j.cn501098-20220729-00263</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cui</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Che</surname><given-names>W</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Pre-training with whole word masking for Chinese BERT</article-title><source>IEEE/ACM Trans Audio Speech Lang Process</source><year>2021</year><volume>29</volume><fpage>3504</fpage><lpage>3514</lpage><pub-id pub-id-type="doi">10.1109/TASLP.2021.3124365</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>N&#x00E9;v&#x00E9;ol</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dalianis</surname><given-names>H</given-names> </name><name name-style="western"><surname>Velupillai</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Clinical natural language processing in languages other than English: opportunities and challenges</article-title><source>J Biomed 
Semantics</source><year>2018</year><month>03</month><day>30</day><volume>9</volume><issue>1</issue><fpage>12</fpage><pub-id pub-id-type="doi">10.1186/s13326-018-0179-8</pub-id><pub-id pub-id-type="medline">29602312</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Topol</surname><given-names>EJ</given-names> </name></person-group><article-title>High-performance medicine: the convergence of human and artificial intelligence</article-title><source>Nat Med</source><year>2019</year><month>01</month><volume>25</volume><issue>1</issue><fpage>44</fpage><lpage>56</lpage><pub-id pub-id-type="doi">10.1038/s41591-018-0300-7</pub-id><pub-id pub-id-type="medline">30617339</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Model input and output examples.</p><media xlink:href="formative_v9i1e67311_app1.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material></app-group></back></article>