<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e75608</article-id><article-id pub-id-type="doi">10.2196/75608</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Automated Data Harmonization in Clinical Research: Natural Language Processing Approach</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Mallya</surname><given-names>Pratheek</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Henao</surname><given-names>Ricardo</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hong</surname><given-names>Chuan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wojdyla</surname><given-names>Daniel</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Schibler</surname><given-names>Tony</given-names></name><degrees>MPA</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Manchanda</surname><given-names>Vihaan</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Pencina</surname><given-names>Michael</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hall</surname><given-names>Jennifer</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Zhao</surname><given-names>Juan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>American Heart Association</institution><addr-line>7272 Greenville Ave</addr-line><addr-line>Dallas</addr-line><addr-line>TX</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Biostatistics and Bioinformatics, Duke University</institution><addr-line>Durham</addr-line><addr-line>NC</addr-line><country>United States</country></aff><aff id="aff3"><institution>Duke Clinical Research Institute</institution><addr-line>Durham</addr-line><addr-line>NC</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Arafat</surname><given-names>Amr A</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Shaffi</surname><given-names>Shamnad 
Mohamed</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Juan Zhao, PhD, American Heart Association, 7272 Greenville Ave, Dallas, TX, 75231, United States, 1 2147061164; <email>Juan.Zhao@heart.org</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>27</day><month>8</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e75608</elocation-id><history><date date-type="received"><day>07</day><month>04</month><year>2025</year></date><date date-type="rev-recd"><day>12</day><month>06</month><year>2025</year></date><date date-type="accepted"><day>14</day><month>06</month><year>2025</year></date></history><copyright-statement>&#x00A9; Pratheek Mallya, Ricardo Henao, Chuan Hong, Daniel Wojdyla, Tony Schibler, Vihaan Manchanda, Michael Pencina, Jennifer Hall, Juan Zhao. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 27.8.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e75608"/><abstract><sec><title>Background</title><p>Integrating data is essential for advancing clinical and epidemiological research. However, because datasets often describe variables (eg, demographic and health conditions) in diverse ways, the process of integrating and harmonizing variables from research studies remains a major bottleneck.</p></sec><sec><title>Objective</title><p>The objective was to assess a natural language processing&#x2013;based method to automate variable harmonization to achieve a scalable approach to integration of multiple datasets.</p></sec><sec sec-type="methods"><title>Methods</title><p>We developed a fully connected neural network (FCN) method, enhanced with contrastive learning, using domain-specific embeddings from the Bidirectional Encoder Representations from Transformers for Biomedical Text Mining language representation model, using 3 cardiovascular datasets: the Atherosclerosis Risk in Communities study, the Framingham Heart Study, and the Multi-Ethnic Study of Atherosclerosis. We used metadata variable descriptions and curated harmonized concepts as ground truth. We framed the problem as a paired sentence classification task. The accuracy of this method was compared with a logistic regression baseline method. 
To assess the generalizability of the trained models, we also evaluated their performance by separating the 3 datasets when preparing the training and validation sets.</p></sec><sec sec-type="results"><title>Results</title><p>The newly developed FCN achieved a top-5 accuracy of 98.95% (95% CI 98.31%&#x2010;99.47%) and an area under the receiver operating characteristic (AUC) of 0.99 (95% CI 0.98&#x2010;0.99), outperforming the standard logistic regression model, which exhibited a top-5 accuracy of 22.23% (95% CI 19.91%&#x2010;24.87%) and an AUC of 0.82 (95% CI 0.81&#x2010;0.83). The contrastive learning enhancement also outperformed the logistic regression model, although slightly below the base FCN model, exhibiting a top-5 accuracy of 89.88% (95% CI 87.88%&#x2010;91.67%) and an AUC of 0.98 (95% CI 0.97&#x2010;0.98).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This novel approach provides a scalable solution for harmonizing metadata across large-scale cohort studies. The proposed method significantly enhances the performance over the baseline method by using learned representations to categorize harmonized concepts more accurately for cohorts in cardiovascular disease and stroke.</p></sec></abstract><kwd-group><kwd>harmonization</kwd><kwd>natural language processing</kwd><kwd>cardiovascular research</kwd><kwd>neural networks</kwd><kwd>multi-cohort studies</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The advent of large language models (LLMs), artificial intelligence, and computational power has the potential to transform our understanding of health and disease. One example is in developing predictive risk models for cardiovascular disease prevention, such as stroke [<xref ref-type="bibr" rid="ref1">1</xref>]. 
Machine learning&#x2013;based stroke risk prediction models enable the inclusion of a wide variety of factors (socioeconomic, behavioral, etc) to assess stroke risk [<xref ref-type="bibr" rid="ref2">2</xref>]. To fully leverage these approaches and technology, datasets need to be integrated [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. However, integration of datasets is challenging, given inconsistent variable names, column headers, and textual descriptions used to denote clinical or demographic measures [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>These metadata variables, which are the textual labels describing data elements, often differ across studies, even when referring to the same underlying concept (eg, &#x201C;Systolic_BP&#x201D; vs &#x201C;SBP_visit1&#x201D;). In cardiovascular research, cohort datasets such as the Framingham Heart Study (FHS), the Multi-Ethnic Study of Atherosclerosis (MESA), and the Atherosclerosis Risk in Communities (ARIC) study include thousands of such variables, each with custom naming conventions and sparse documentation. This lack of standardization poses a major challenge for dataset interoperability, phenotyping, and cross-cohort analyses [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>Data harmonization is the process involving the standardization of disparate variables across multiple datasets into a cohesive and unified format [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. This technique also increases the statistical power of a dataset to solve problems that could not be addressed when using data only from a single study [<xref ref-type="bibr" rid="ref10">10</xref>]. 
Traditional harmonization approaches depend heavily on manual mapping by domain experts to map disparate variable descriptions into unified medical concepts, which is time-consuming, error-prone, and difficult to scale [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. Standard vocabularies like Systematized Nomenclature of Medicine&#x2013;Clinical Terms [<xref ref-type="bibr" rid="ref12">12</xref>], Logical Observation Identifiers Names and Codes [<xref ref-type="bibr" rid="ref13">13</xref>], <italic>ICD</italic> (<italic>International Classification of Diseases</italic>) codes [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>], Current Procedural Terminology [<xref ref-type="bibr" rid="ref16">16</xref>], Clinical Classifications Software [<xref ref-type="bibr" rid="ref17">17</xref>], Normalized Names for Clinical Drugs [<xref ref-type="bibr" rid="ref18">18</xref>], and National Drug Code [<xref ref-type="bibr" rid="ref19">19</xref>] support structured data harmonization in electronic health records, but are not designed for the free-text, loosely formatted metadata descriptions found in cohort datasets. Recent advances in natural language processing (NLP), including the use of Bidirectional Encoder Representations from Transformers (BERT) models [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>], knowledge network [<xref ref-type="bibr" rid="ref22">22</xref>], and other semantic learning methods [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>], offer promising opportunities to automate the process. Pretrained language models like Bidirectional Encoder Representations from Transformers for Biomedical Text Mining (BioBERT) and semantic embedding techniques can be adapted to understand and categorize medical text [<xref ref-type="bibr" rid="ref21">21</xref>]. 
However, these models have not been widely applied to the harmonization of variable-level metadata in observational research settings. Our work addresses this gap.</p><p>The goal is to develop and evaluate an NLP-based method for harmonizing variable-level metadata across multiple biomedical datasets. Specifically, we aim to classify free-text variable names and descriptions into harmonized medical concepts that enable integration and analysis across multiple studies.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>The goal of this approach was to combine different datasets by variable definitions into a harmonized variable defined as a medical concept&#x2014;a term that describes information in a patient&#x2019;s medical record, such as a diagnosis, a prescription, or a measurement.</p><p>To do this, we treated the automation of harmonization as the following steps: (1) to select a list of predefined data harmonization biomedical concepts, and (2) to train a classifier to classify whether a variable belongs to a certain medical concept or not. We used 3 large-scale cardiovascular research cohort studies (ie, FHS, MESA, and ARIC) to harmonize cardiovascular disease risk variables.</p><p>For the second step, we used BioBERT embeddings with a fully connected neural network (FCN). BioBERT, a transformer language representation model pretrained on biomedical corpora, generates embeddings for variable descriptions, capturing their semantic relationships [<xref ref-type="bibr" rid="ref21">21</xref>]. The FCN then classifies these embeddings into predefined harmonized concepts. To address the relatively low number of labeled samples, we also separately augmented the FCN using contrastive learning, a self-supervised representation learning method that is particularly effective in scenarios where training data is limited [<xref ref-type="bibr" rid="ref25">25</xref>]. 
The process workflow for this approach is outlined in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-2"><title>Data Sources</title><p>We used the metadata from 3 research cohort datasets&#x2014;FHS, MESA, and ARIC [<xref ref-type="bibr" rid="ref7">7</xref>]. The metadata includes variable names and descriptions. In total, we extracted 885 variable descriptions categorized into 64 concepts (spread across 7 concept groups) through manual annotation by 3 independent reviewers, who adapted a preselected list of stroke-related concepts that were illustrated in our previous work [<xref ref-type="bibr" rid="ref26">26</xref>]. The breakdown of each cohort dataset across cohorts and concept groups is provided in <xref ref-type="table" rid="table1">Table 1</xref>. The complete list of variable descriptions and their corresponding concepts is detailed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. We used this labeled dataset for training and validation.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Breakdown of the number of variables for each concept group across the 3 study cohorts: Framingham Heart Study, Multi-Ethnic Study of Atherosclerosis, and Atherosclerosis Risk in Communities. 
The 885 variable descriptions are categorized into 64 concepts across 7 concept groups via manual annotation.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2">Study</td><td align="left" valign="bottom" colspan="3">Variables</td></tr><tr><td align="left" valign="bottom">ARIC<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom">MESA<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="bottom">FHS<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Total variable descriptions, n</td><td align="left" valign="top">315</td><td align="left" valign="top">161</td><td align="left" valign="top">409</td></tr><tr><td align="left" valign="top">Variables under each category of concept, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Sociodemographics</td><td align="left" valign="top">12 (3.8)</td><td align="left" valign="top">11 (6.8)</td><td align="left" valign="top">13 (3.2)</td></tr><tr><td align="left" valign="top">&#x2003;Vitals</td><td align="left" valign="top">18 (5.7)</td><td align="left" valign="top">10 (6.2)</td><td align="left" valign="top">63 (15.4)</td></tr><tr><td align="left" valign="top">&#x2003;Comorbidities</td><td align="left" valign="top">59 (18.7)</td><td align="left" valign="top">76 (47.2)</td><td align="left" valign="top">98 (24)</td></tr><tr><td align="left" valign="top">&#x2003;Laboratories</td><td align="left" valign="top">32 (10.2)</td><td align="left" valign="top">16 (9.9)</td><td align="left" valign="top">49 (12)</td></tr><tr><td align="left" valign="top">&#x2003;Medications</td><td align="left" valign="top">131 (41.6)</td><td align="left" valign="top">30 (18.7)</td><td align="left" valign="top">91 (22.2)</td></tr><tr><td align="left" 
valign="top">&#x2003;Diet</td><td align="left" valign="top">42 (13.3)</td><td align="left" valign="top">1 (0.6)</td><td align="left" valign="top">74 (18.1)</td></tr><tr><td align="left" valign="top">&#x2003;Other</td><td align="left" valign="top">21 (6.7)</td><td align="left" valign="top">17 (10.6)</td><td align="left" valign="top">21 (5.1)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>ARIC: Atherosclerosis Risk in Communities.</p></fn><fn id="table1fn2"><p><sup>b</sup>MESA: Multi-Ethnic Study of Atherosclerosis.</p></fn><fn id="table1fn3"><p><sup>c</sup>FHS: Framingham Heart Study.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3"><title>BioBERT Embeddings</title><p>We used a pretrained BioBERT model to convert the variable descriptions into embedding vectors. BioBERT is a transformer-based model specifically pretrained on large-scale biomedical corpora, including PubMed abstracts and PubMed Central articles [<xref ref-type="bibr" rid="ref21">21</xref>]. Derived from a general-purpose model known as BERT [<xref ref-type="bibr" rid="ref20">20</xref>], BioBERT has shown superior performance over BERT for biomedical-related tasks such as Named Entity Recognition [<xref ref-type="bibr" rid="ref27">27</xref>], Relation Extraction [<xref ref-type="bibr" rid="ref28">28</xref>], and Question Answering [<xref ref-type="bibr" rid="ref29">29</xref>]. Particularly, for short-length sequences in the biomedical domain, with pretrained domain knowledge, BioBERT can capture domain-specific semantics and relationships better than a general-purpose model. Given its proven effectiveness in biomedical NLP tasks, BioBERT is an ideal choice for analyzing short-text sequences in the biomedical domain. 
In this study, we converted each variable description using BioBERT into a 768-dimensional embedding vector for downstream classification.</p></sec><sec id="s2-4"><title>Paired Sentences for Classification</title><p>We framed the task as a binary classification problem using pairs of variable descriptions (<italic>x<sub>1</sub></italic>, <italic>x<sub>2</sub></italic>). Each pair was labeled as either belonging to the same concept or not. We calculated cosine similarity for each pair, and these similarity scores were used to train a supervised classifier to distinguish between matched and nonmatched pairs [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>].</p><p>During inference, for a given variable description, the model compared it against all known concepts. The model calculated similarity scores for each pairing and assigned the description to the concept with the highest similarity score.</p></sec><sec id="s2-5"><title>Data Preparation</title><p>The dataset was prepared as (1) matching pairs (for every concept, all combinations of variable descriptions belonging to that concept were generated as matched pairs) and (2) nonmatching pairs (for each variable description in a concept, a random sample of descriptions from other concepts was used to generate nonmatching pairs).</p><p>To balance the training dataset, we maintained a 1:3 ratio of matching to nonmatching pairs. This ensured sufficient representation of both types of data while maximizing training examples.</p></sec><sec id="s2-6"><title>Models</title><p>We used the logistic regression model as a baseline classifier. The input was the cosine similarity between BioBERT embedding vectors of paired descriptions [<xref ref-type="bibr" rid="ref32">32</xref>]. 
The model was trained using the cross-entropy loss function, and the output was a probabilistic score, which indicates whether the pair represented the same concept (eg, a matched pair or nonmatched pair).</p><p>The proposed FCN model consisted of 2 hidden layers, with the first hidden layer having a rectified linear unit activation function [<xref ref-type="bibr" rid="ref33">33</xref>], and the second layer using a cosine similarity function, rescaled with a weight and a bias parameter, followed by a sigmoid activation function. The framework of the FCN model is outlined in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The network was trained using binary cross-entropy loss [<xref ref-type="bibr" rid="ref34">34</xref>]. The Adam optimizer with early stopping on the validation set was used for optimization [<xref ref-type="bibr" rid="ref35">35</xref>]. During inference, given a new input variable description, the model calculated similarity scores between the embedding vectors for an input description and each known concept. The concept with the highest score was assigned to the variable description.</p></sec><sec id="s2-7"><title>Contrastive Learning</title><p>To address the challenges of limited labeled training data, we used a contrastive learning approach [<xref ref-type="bibr" rid="ref36">36</xref>]. The model was trained to minimize Noise-Contrastive Estimation loss, which improves the representation of variable descriptions by learning from matched and nonmatched pairs [<xref ref-type="bibr" rid="ref37">37</xref>]. For each variable description, we applied random permutations of embeddings to create augmented pairs. This method further optimized the FCN by leveraging noisy but informative examples. 
During inference, we used the same methodology as described for the FCN model to categorize an input variable description to a concept.</p></sec><sec id="s2-8"><title>Evaluation</title><p>To assess the performance, applicability, and generalizability of the method, we used 2 strategies&#x2014;a combined cohort approach and a separated cohort approach. In the combined cohort approach, we used data from all 3 cohort datasets and randomly split it into training, validation, and testing with an approximate ratio of 4:1:1. For the separated cohort approach, we trained and validated each model on 2 cohorts and used the remaining cohort for testing to assess generalizability across datasets.</p><p>We used the area under the receiver operating characteristic (AUC) as our primary performance measure distinguishing matched and nonmatched pairs [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. To evaluate how often the correct concept ranks within the top-K predictions, we used top-1 and top-5 accuracy [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. We used bootstrapping to obtain CIs for both the AUC and accuracy scores [<xref ref-type="bibr" rid="ref42">42</xref>].</p><p>All models were developed using Python (v3.11.5) and PyTorch (v2.2.1). The code and trained models used are found on our GitHub repository: duke-harmonization.</p></sec><sec id="s2-9"><title>Ethical Considerations</title><p>This study was approved by the Duke University Health System institutional review board (Pro00106364).</p><p>For the primary data collections, participants in the original studies provided informed consent, which included provisions for data sharing and secondary use. 
The datasets used in this study were accessed in accordance with those provisions, and no additional consent was required for this secondary analysis.</p><p>All datasets used in this study were fully deidentified and contained no direct or indirect identifiers. The analyses relied exclusively on aggregated metadata, with no linkage to individual-level information. Accordingly, participant confidentiality was maintained throughout.</p><p>No participants were directly involved or recruited for this secondary analysis; therefore, no compensation was provided. This paper does not include any images or materials that could lead to the identification of individual participants.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>We extracted a total of 885 variables from 3 datasets, including the FHS, MESA, and ARIC. We precategorized these variables into 64 harmonized concepts and generated 58,890 sentence pairs. In the combined cohort evaluation strategy, we split this dataset into training, validation, and test datasets in a 4:1:1 ratio. The FCN model outperformed the baseline logistic regression model, achieving an AUC of 0.99 (95% CI 0.98&#x2010;0.99), compared with the baseline&#x2019;s AUC of 0.82 (95% CI 0.81&#x2010;0.83). The contrastive learning approach achieved an AUC of 0.98 (95% CI 0.97&#x2010;0.98), which also outperformed the baseline logistic regression model (<xref ref-type="fig" rid="figure1">Figure 1</xref>). For the top-K accuracy, the FCN model achieved a top-1 accuracy of 80.51% (95% CI 78.08%&#x2010;83.03%) and a top-5 accuracy of 98.95% (95% CI 98.31%&#x2010;99.47%), significantly outperforming the baseline model which achieved top-1 accuracy of 12.12% (95% CI 10.22%&#x2010;14.12%) and top-5 accuracy of 22.23% (95% CI 19.91%&#x2010;24.87%). 
The contrastive learning approach achieved a moderate top-1 accuracy score of 63.65% (95% CI 60.59%&#x2010;66.81%) and a top-5 accuracy score of 89.88% (95% CI 87.88%&#x2010;91.67%; <xref ref-type="table" rid="table2">Table 2</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Receiver operating characteristic curves for each of the trained fully connected neural network models and the baseline logistic regression model for the combined cohort approach. In this setting, the variables from all 3 datasets (Atherosclerosis Risk in Communities, Multi-Ethnic Study of Atherosclerosis, and Framingham Heart Study) were precategorized into harmonized concepts. The area under the curve is directly proportional to the model&#x2019;s performance in distinguishing between matches and nonmatches for a given pair of variable descriptions. The data used to generate the receiver operating characteristic curves consisted of 11,880 pairs of variable descriptions that were absent from the training data, when evaluated on all the cohorts. AUC: area under the receiver operating characteristic; FCN: fully connected neural network.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e75608_fig01.png"/></fig><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Top-1 and top-5 accuracy with 95% CIs for the baseline logistic regression model, the fully connected neural network model, and the fully connected neural network model with contrastive learning. 
The evaluation was performed under the combined cohort strategy, where the variables from all 3 cohorts (Atherosclerosis Risk in Communities, Multi-Ethnic Study of Atherosclerosis, and Framingham Heart Study) were precategorized into harmonized concepts.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Top-1 accuracy, % (95% CI)</td><td align="left" valign="bottom">Top-5 accuracy, % (95% CI)</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">Logistic regression</td><td align="left" valign="top">12.12 (10.22&#x2010;14.12)</td><td align="left" valign="top">22.23 (19.91&#x2010;24.87)</td><td align="left" valign="top">0.82 (0.81&#x2010;0.83)</td></tr><tr><td align="left" valign="top">FCN<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> (combined cohort)</td><td align="left" valign="top">80.51 (78.08&#x2010;83.03)</td><td align="left" valign="top">98.95 (98.31&#x2010;99.47)</td><td align="left" valign="top">0.99 (0.98&#x2010;0.99)</td></tr><tr><td align="left" valign="top">Contrastive learning</td><td align="left" valign="top">63.65 (60.59&#x2010;66.81)</td><td align="left" valign="top">89.88 (87.88&#x2010;91.67)</td><td align="left" valign="top">0.98 (0.97&#x2010;0.98)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>AUC: area under the receiver operating characteristic.</p></fn><fn id="table2fn2"><p><sup>b</sup>FCN: fully connected neural network.</p></fn></table-wrap-foot></table-wrap><p>We assessed the robustness of FCN in the separated cohort evaluation. The FCN model trained with the ARIC-Framingham model achieved an AUC of 0.78 (95% CI 0.73&#x2010;0.83) on the MESA dataset. The MESA-ARIC model, evaluated on the Framingham dataset, achieved the highest AUC of 0.85 (95% CI 0.83&#x2010;0.87). 
The Framingham-MESA model, evaluated on the ARIC dataset, achieved an AUC of 0.83 (95% CI 0.81&#x2010;0.85). The ROC curves for the separated cohort models are shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>. For the top-K metrics, the ARIC-Framingham model performed best with a top-1 accuracy of 49.33% (95% CI 43.11%&#x2010;55.11%) and a top-5 accuracy of 64% (95% CI 57.78%&#x2010;69.34%). The MESA-ARIC model performed slightly worse, with a top-1 accuracy of 39.32% (95% CI 35.09%&#x2010;43.76%) and a top-5 accuracy of 59.62% (95% CI 55.39%&#x2010;64.06%). The Framingham-MESA model exhibited the lowest accuracy performance, with a top-1 accuracy of 32.98% (95% CI 28.23%&#x2010;37.47%) and a top-5 accuracy of 48.81% (95% CI 43.79%&#x2010;53.56%), which were likely due to greater variability in the ARIC dataset (<xref ref-type="table" rid="table3">Table 3</xref>).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Receiver operating characteristic curves for each of the trained fully connected neural network models for the separated cohort approach. In this setting, the variables from all 3 datasets (Atherosclerosis Risk in Communities, Multi-Ethnic Study of Atherosclerosis, and Framingham Heart Study) were initially precategorized into harmonized concepts. The models were then trained and validated on 2 of the cohorts and then tested on the remaining cohort to assess generalizability of the model across different datasets. The area under the curve is directly proportional to the model&#x2019;s performance in distinguishing between matches and nonmatches for a given pair of variable descriptions. The receiver operating characteristic curves for each model were obtained by evaluating the model on the subset of the test dataset containing only data from the cohort excluded during training. 
ARIC: Atherosclerosis Risk in Communities; AUC: area under the receiver operating characteristic; FCN: fully connected neural network; MESA: Multi-Ethnic Study of Atherosclerosis.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e75608_fig02.png"/></fig><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Top-1 and top-5 accuracy with 95% CIs for the 3 cohort-specific fully connected neural network models. The evaluation was performed using the separated cohort evaluation strategy, where the variables from all 3 cohorts (Atherosclerosis Risk in Communities, Multi-Ethnic Study of Atherosclerosis, and Framingham Heart Study) were initially precategorized into harmonized concepts, and the models were then trained and validated on 2 cohorts and tested on the remaining cohort to assess generalizability of the model across different datasets.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Top-1 accuracy, % (95% CI)</td><td align="left" valign="bottom">Top-5 accuracy, % (95% CI)</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">FCN<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> (Framingham-MESA<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup>) tested on ARIC</td><td align="left" valign="top">32.98 (28.23&#x2010;37.47)</td><td align="left" valign="top">48.81 (43.79&#x2010;53.56)</td><td align="left" valign="top">0.83 (0.81&#x2010;0.85)</td></tr><tr><td align="left" valign="top">FCN (MESA-ARIC<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>) tested on Framingham</td><td align="left" valign="top">39.32 (35.09&#x2010;43.76)</td><td align="left" valign="top">59.62 (55.39&#x2010;64.06)</td><td align="left" valign="top">0.85 
(0.83&#x2010;0.87)</td></tr><tr><td align="left" valign="top">FCN (ARIC-Framingham) tested on MESA</td><td align="left" valign="top">49.33 (43.11&#x2010;55.11)</td><td align="left" valign="top">64.0 (57.78&#x2010;69.34)</td><td align="left" valign="top">0.78 (0.73&#x2010;0.83)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>AUC: area under the receiver operating characteristic.</p></fn><fn id="table3fn2"><p><sup>b</sup>FCN: fully connected neural network.</p></fn><fn id="table3fn3"><p><sup>c</sup>MESA: Multi-Ethnic Study of Atherosclerosis.</p></fn><fn id="table3fn4"><p><sup>d</sup>ARIC: Atherosclerosis Risk in Communities.</p></fn></table-wrap-foot></table-wrap><p>We plotted the distribution of the predicted score for matches and nonmatches across different concept groups using the baseline method and the FCN model, illustrated in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The results indicated that the FCN model generally demonstrates narrower IQRs and more distinct separation between median probabilities for matches and nonmatches, particularly in the diet and sociodemographics categories, which achieved a perfect AUC of 1.0, indicating superior predictive performance compared with the baseline model. The AUC for each model setting when evaluated on a per-concept level is detailed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>We also computed the positive predictive value, negative predictive value, true positive rate, and false positive rate for each of the concepts when using the top-1 predicted concept from the FCN model on the test dataset. The mean positive predictive value across all concepts was 0.78 (SD 0.25), the mean negative predictive value was 0.99 (SD 0.01), the mean true positive rate was 0.85 (SD 0.21), and the mean false positive rate was 0.01 (SD 0.01). 
The metrics for all concepts are detailed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Overview</title><p>Harmonizing multiple diverse research cohort datasets can enlarge the data power for training and validating risk prediction models. However, traditional data harmonization techniques need manual comparison, which is time-consuming and barely scalable. This study presents an automated and scalable approach for variable harmonization by leveraging domain-specific NLP and machine learning applied to metadata. We implemented and evaluated the method using the metadata-level variable descriptions from the 3 National Institutes of Health research cohort studies. By reframing variable harmonization as a sentence-pair classification problem, our approach achieves accurate mapping between free-text variable descriptions and standardized concepts, even in the absence of patient-level data. This methodology addresses the common challenges of short text length, sparse annotation, and class imbalance in harmonization tasks.</p></sec><sec id="s4-2"><title>Principal Results and Comparison With Previous Work</title><p>Our results showed that the FCN model trained on sentence pairs significantly outperformed the baseline logistic regression model. Specifically, both the basic FCN method and the enhanced version using contrastive learning achieved high AUC, top-1, and top-5 accuracy scores, surpassing the logistic regression method. The basic FCN model performed slightly better than the contrastive learning approach. We further assessed the generalizability of our model by separating cohorts for evaluation. Model performance was generally lower and varied, which is expected, due to different variable distributions across different research cohort datasets. 
The ARIC-Framingham model performed the best in terms of top-K accuracy, suggesting that the MESA dataset shared the most common metadata features with the other 2. The Framingham-MESA model performed the worst, possibly because the ARIC metadata has more unique characteristics and models could not effectively learn due to its absence from the training data.</p><p>Similar to earlier manual harmonization efforts, our approach began with expert-curated categorization of variables into predefined concepts, which is a foundational step that was essential for the success of the automated classification process, as described in our previous work [<xref ref-type="bibr" rid="ref26">26</xref>]. However, unlike traditional methods that rely heavily on manual effort throughout, our system automates the subsequent classification, significantly reducing the time and human effort required. While manual harmonization provides expert-driven accuracy, our findings suggest that the automated method can achieve comparable mapping quality with substantially less human input. This framework aligns with practices seen in other harmonization studies, where domain experts played a key role in defining variable concepts [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref43">43</xref>], and other automated harmonization studies [<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref45">45</xref>].</p><p>Our proposed approach for automated variable harmonization used pretrained embeddings to learn the representations from variable descriptions. Similarly, Yang et al [<xref ref-type="bibr" rid="ref45">45</xref>] used semantic embeddings and patient-level data to harmonize continuous variables. However, their approach excluded categorical variables and those with missing data. In contrast, our approach uses only variable metadata, thus enabling harmonization of a broader spectrum of variables, including both continuous and categorical variables. 
Since we use only metadata, this approach also allows harmonization of datasets with or without missing data&#x2014;thus offering wider applicability for real-world cohort integration.</p><p>With the recent advancements in LLMs, Li et al [<xref ref-type="bibr" rid="ref44">44</xref>] introduced a framework for variable matching using embeddings from general-purpose LLMs. We acknowledge this emerging direction in the field; however, the use of large models often requires fine-tuning on domain-specific data and incurs substantial computational costs, which may limit their practical applicability in resource-constrained settings [<xref ref-type="bibr" rid="ref46">46</xref>]. By leveraging embeddings from domain-specific LLMs, such as BioBERT, we present a cost-effective approach, requiring fewer computational resources for training and implementation [<xref ref-type="bibr" rid="ref47">47</xref>].</p></sec><sec id="s4-3"><title>Implications for Research and Practice</title><p>Our experimental results suggest that our method achieves accurate harmonization for variables across different cardiovascular cohort studies by evaluating contextual similarity across disparate variable descriptions. For example, the descriptions of &#x201C;diabetes mellitus status&#x201D; are inconsistent across ARIC, MESA, and Framingham datasets. In the ARIC study, the description varies by visit or exam, such as &#x201C;diabetes with fasting glucose cutpoint&#x003C;126&#x201D; or &#x201C;diabetes using lower cutpoint 126 mg/dL.&#x201D; In contrast, Framingham and MESA use descriptions like &#x201C;diabetes mellitus status, exam 1.&#x201D; Traditionally, aligning these variables to a SNOMED concept for the condition &#x201C;diabetes mellitus&#x201D; requires manual effort and domain expertise, which is difficult to scale across multiple cohorts. Our automated framework significantly reduces this burden, achieving consistent, accurate mapping in a fraction of the time. 
In practical settings, this approach enables researchers to integrate datasets for cross-cohort analyses, which are essential for predictive modeling and other data-driven applications.</p></sec><sec id="s4-4"><title>Limitations</title><p>Despite these advancements, we acknowledge that several limitations and challenges remain. First, our proposed framework focused on metadata and did not include patient-level data. However, incorporating patient-level data could help resolve ambiguities in variable definitions. A hybrid approach that leverages patient-level data alongside learned representations from the metadata may help in verifying the automated harmonization results [<xref ref-type="bibr" rid="ref22">22</xref>]. Another limitation is that we did not address the challenges remaining in the harmonization of different units for laboratory values given that our focus was on metadata and variable descriptions. Incorporating comparisons of variable distributions from patient-level data, in addition to the semantic representations of the variable descriptions, could help alleviate this problem [<xref ref-type="bibr" rid="ref48">48</xref>]. Future work should explore hybrid methods to combine harmonized variable descriptions with patient-level data to create a more comprehensive and robust framework for cohort integration.</p><p>While our study focused on cardiovascular datasets, we acknowledge that the generalizability of the proposed harmonization method to other disease domains or datasets with differing data structures remains unproven. BioBERT is pretrained on large-scale biomedical corpora and thus has potential applicability beyond cardiovascular disease [<xref ref-type="bibr" rid="ref49">49</xref>], but we recommend validating this approach in other domains, such as oncology, infectious disease, and mental health, where vocabulary, annotation practices, and data sparsity may vary. 
To improve robustness and portability, we recommend curating preharmonized benchmark datasets for external validation. In addition, future work could explore the integration of lightweight transformers, few-shot learning, or domain-adaptive transfer learning to handle limited labeled data and further extend the applicability of contrastive learning in diverse biomedical settings [<xref ref-type="bibr" rid="ref50">50</xref>-<xref ref-type="bibr" rid="ref52">52</xref>].</p><p>Third, we did not use more sophisticated models for sequential data such as recurrent neural networks [<xref ref-type="bibr" rid="ref53">53</xref>], or Long Short-Term Memory networks [<xref ref-type="bibr" rid="ref54">54</xref>], nor LLMs such as Generative Pre-trained Transformers [<xref ref-type="bibr" rid="ref55">55</xref>], Pathways Language Model (Google AI) [<xref ref-type="bibr" rid="ref56">56</xref>], or Large Language Model Meta AI [<xref ref-type="bibr" rid="ref57">57</xref>], due to the sparse number of labeled examples present in our training data. Application of the contrastive learning approach in tandem with the advanced language models may prove effective in the scalability of the automated harmonization process. Using more complex batch selection methods may also lead to better results via contrastive learning [<xref ref-type="bibr" rid="ref58">58</xref>].</p></sec><sec id="s4-5"><title>Conclusions</title><p>In this study, we developed a scalable and automated method for variable harmonization using only metadata from research cohorts. By applying domain-specific language models and framing the task as a sentence-pair classification problem, our approach can accurately map variable descriptions to standardized concepts without needing patient-level data. This reduces the time and effort required for harmonization and is especially useful when access to detailed data is limited. 
Although we tested the method on cardiovascular datasets, it can potentially be used in other areas like cancer or mental health research. This work provides a foundation for faster and more efficient data integration, which is important for large-scale studies and real-world health research.</p></sec></sec></body><back><ack><p>This work was funded by the National Institute of Neurological Disorders and Stroke (NINDS; grant R61/R33NS120246).</p><p>The Framingham Heart Study (FHS) is conducted and supported by the National Heart, Lung, and Blood Institute (NHLBI) in collaboration with Boston University (Contract N01-HC-25195 and HHSN268201500001I). This manuscript was not prepared in collaboration with investigators of the FHS and does not necessarily reflect the opinions or views of the FHS, Boston University, or NHLBI. The original metadata used in this work can be found at dbGaP, using the dbGaP accession number phs000007.v32.</p><p>Multi-Ethnic Study of Atherosclerosis (MESA) and the MESA SHARe project are conducted and supported by the NHLBI in collaboration with MESA investigators. Support for MESA is provided by contracts HHSN268201500003I, N01-HC-95159, N01-HC-95160, N01-HC-95161, N01-HC-95162, N01-HC-95163, N01-HC-95164, N01-HC-95165, N01-HC-95166, N01-HC-95167, N01-HC-95168, N01-HC-95169, UL1-TR-001079, UL1-TR-000040, UL1-TR-001420, UL1-TR-001881, DK063491, and CTSA UL1-RR-024156. The original metadata used in this work can be found at dbGaP using the dbGaP accession phs000209.v13.</p><p>The Atherosclerosis Risk in Communities (ARIC) study has been funded in whole or in part with Federal funds from the NHLBI, National Institutes of Health, Department of Health and Human Services, under contracts (HHSN268201700001I, HHSN268201700002I, HHSN268201700003I, HHSN268201700004I, and HHSN268201700005I). The authors thank the staff members and participants of the ARIC study for their important contributions. 
The original metadata used in this work can be found at the database of Genotypes and Phenotypes (dbGaP) using the dbGaP accession phs000280.v7.</p><p>The metadata for FHS, MESA, and ARIC can also be obtained from the NHLBI Biologic Specimen and Data Repository Information Coordinating Center (BioLINCC). BioLINCC does not necessarily reflect the opinions or views of the FHS, MESA, ARIC, or NHLBI. This work uses only the metadata from the FHS, MESA, and ARIC studies.</p></ack><notes><sec><title>Data Availability</title><p>The datasets generated are not publicly available because we do not have the right to grant access to individual-level data and are therefore unable to share the harmonized data. This requires a signed data-use agreement from the data cohorts (FHS, MESA, and ARIC) via the database of Genotypes and Phenotypes (dbGaP) or via the Biologic Specimen and Data Repository Information Coordinating Center (BioLINCC). If you are interested in acquiring the data, please contact dbGaP or BioLINCC. The code used to create the concepts for manual harmonization of the cohorts can be found on our GitHub repository. The training metadata dataset containing the variable descriptions and their assigned concepts can be found in the supplementary materials (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). 
The code and the trained models used for our proposed automated harmonization method are also found on our GitHub repository.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ARIC</term><def><p>Atherosclerosis Risk in Communities</p></def></def-item><def-item><term id="abb2">AUC</term><def><p>area under the receiver operating characteristic</p></def></def-item><def-item><term id="abb3">BERT</term><def><p>Bidirectional Encoder Representations from Transformers</p></def></def-item><def-item><term id="abb4">BioBERT</term><def><p>Bidirectional Encoder Representations from Transformers for Biomedical Text Mining</p></def></def-item><def-item><term id="abb5">FCN</term><def><p>fully connected neural network</p></def></def-item><def-item><term id="abb6">FHS</term><def><p>Framingham Heart Study</p></def></def-item><def-item><term id="abb7"><italic>ICD</italic></term><def><p><italic>International Classification of Diseases</italic></p></def></def-item><def-item><term id="abb8">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb9">MESA</term><def><p>Multi-Ethnic Study of Atherosclerosis</p></def></def-item><def-item><term id="abb10">NLP</term><def><p>natural language processing</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tsao</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Aday</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Almarzooq</surname><given-names>ZI</given-names> </name><etal/></person-group><article-title>Heart disease and stroke statistics-2023 update: a report from the American Heart 
Association</article-title><source>Circulation</source><year>2023</year><month>02</month><day>21</day><volume>147</volume><issue>8</issue><fpage>e93</fpage><lpage>e621</lpage><pub-id pub-id-type="doi">10.1161/CIR.0000000000001123</pub-id><pub-id pub-id-type="medline">36695182</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jamthikar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>D</given-names> </name><name name-style="western"><surname>Saba</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Cardiovascular/stroke risk predictive calculators: a comparison between statistical and machine learning models</article-title><source>Cardiovasc Diagn Ther</source><year>2020</year><month>08</month><volume>10</volume><issue>4</issue><fpage>919</fpage><lpage>938</lpage><pub-id pub-id-type="doi">10.21037/cdt.2020.01.07</pub-id><pub-id pub-id-type="medline">32968651</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>WQ</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Bai</surname><given-names>JPF</given-names> </name><name name-style="western"><surname>Hur</surname><given-names>J</given-names> </name></person-group><article-title>Integration of omics and phenotypic data for precision medicine</article-title><source>Methods Mol Biol</source><year>2022</year><volume>2486</volume><fpage>19</fpage><lpage>35</lpage><pub-id pub-id-type="doi">10.1007/978-1-0716-2265-0_2</pub-id><pub-id 
pub-id-type="medline">35437716</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>KB</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>W</given-names> </name><name name-style="western"><surname>Weeraratne</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Precision medicine, AI, and the future of personalized health care</article-title><source>Clinical Translational Sci</source><year>2021</year><month>01</month><access-date>2025-08-18</access-date><volume>14</volume><issue>1</issue><fpage>86</fpage><lpage>93</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://ascpt.onlinelibrary.wiley.com/toc/17528062/14/1">https://ascpt.onlinelibrary.wiley.com/toc/17528062/14/1</ext-link></comment><pub-id pub-id-type="doi">10.1111/cts.12884</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gurugubelli</surname><given-names>VS</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Shikany</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>A review of harmonization methods for studying dietary patterns</article-title><source>Smart Health (2014)</source><year>2022</year><month>03</month><volume>23</volume><fpage>100263</fpage><pub-id pub-id-type="doi">10.1016/j.smhl.2021.100263</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bathelt</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Gebler</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Use of metadata-driven approaches for data harmonization in the medical domain: scoping review</article-title><source>JMIR Med Inform</source><year>2024</year><month>02</month><day>14</day><volume>12</volume><fpage>e52967</fpage><pub-id pub-id-type="doi">10.2196/52967</pub-id><pub-id pub-id-type="medline">38354027</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mallya</surname><given-names>P</given-names> </name><name name-style="western"><surname>Stevens</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Facilitating harmonization of variables in Framingham, MESA, ARIC, and REGARDS studies through a metadata repository</article-title><source>Circ: Cardiovascular Quality and Outcomes</source><year>2023</year><month>11</month><volume>16</volume><issue>11</issue><fpage>11</fpage><pub-id pub-id-type="doi">10.1161/CIRCOUTCOMES.123.009938</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cheng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Messerschmidt</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bravo</surname><given-names>I</given-names> </name><etal/></person-group><article-title>A general primer for data harmonization</article-title><source>Sci Data</source><year>2024</year><month>01</month><day>31</day><volume>11</volume><issue>1</issue><fpage>152</fpage><pub-id pub-id-type="doi">10.1038/s41597-024-02956-3</pub-id><pub-id pub-id-type="medline">38297013</pub-id></nlm-citation></ref><ref 
id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Bazzano</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Betha</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Large-scale data harmonization across prospective studies</article-title><source>Am J Epidemiol</source><year>2023</year><month>11</month><day>10</day><volume>192</volume><issue>12</issue><fpage>2033</fpage><lpage>2049</lpage><pub-id pub-id-type="doi">10.1093/aje/kwad153</pub-id><pub-id pub-id-type="medline">37403415</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adhikari</surname><given-names>K</given-names> </name><name name-style="western"><surname>Patten</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Patel</surname><given-names>AB</given-names> </name><etal/></person-group><article-title>Data harmonization and data pooling from cohort studies: a practical approach for data management</article-title><source>Int J Popul Data Sci</source><year>2021</year><volume>6</volume><issue>1</issue><fpage>1680</fpage><pub-id pub-id-type="doi">10.23889/ijpds.v6i1.1680</pub-id><pub-id pub-id-type="medline">34888420</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Sony</surname><given-names>P</given-names> </name></person-group><article-title>Concept-based electronic health record retrieval system in healthcare IOT</article-title><source>Cognitive Informatics and Soft 
Computing</source><year>2019</year><volume>768</volume><publisher-name>Springer</publisher-name><fpage>175</fpage><lpage>188</lpage><pub-id pub-id-type="doi">10.1007/978-981-13-0617-4_17</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ruch</surname><given-names>P</given-names> </name><name name-style="western"><surname>Gobeill</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lovis</surname><given-names>C</given-names> </name><name name-style="western"><surname>Geissb&#x00FC;hler</surname><given-names>A</given-names> </name></person-group><article-title>Automatic medical encoding with SNOMED categories</article-title><source>BMC Med Inform Decis Mak</source><year>2008</year><month>10</month><day>27</day><volume>8 Suppl 1</volume><issue>Suppl 1</issue><fpage>S6</fpage><pub-id pub-id-type="doi">10.1186/1472-6947-8-S1-S6</pub-id><pub-id pub-id-type="medline">19007443</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McDonald</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Huff</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Suico</surname><given-names>JG</given-names> </name><etal/></person-group><article-title>LOINC, a universal standard for identifying laboratory observations: a 5-year update</article-title><source>Clin Chem</source><year>2003</year><month>04</month><volume>49</volume><issue>4</issue><fpage>624</fpage><lpage>633</lpage><pub-id pub-id-type="doi">10.1373/49.4.624</pub-id><pub-id pub-id-type="medline">12651816</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="book"><person-group person-group-type="author"><collab>W. H. 
Organization and others, International classification of diseases</collab></person-group><source>Basic Tabulation List with Alphabetic Index</source><year>1978</year><publisher-name>World Health Organization</publisher-name></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="book"><person-group person-group-type="author"><collab>W. H. Organization, International Statistical Classification of Diseases and related health problems</collab></person-group><source>Alphabetical Index</source><year>2004</year><volume>3</volume><publisher-name>World Health Organization</publisher-name></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Abraham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ahlman</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Boudreau</surname><given-names>AJ</given-names> </name><etal/></person-group><source>CPT 2011 Standard Edition</source><year>2010</year><edition>4</edition><publisher-name>American Medical Association</publisher-name><pub-id pub-id-type="other">1603592164, 9781603592161</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><source>Clinical Classifications Software (CCS) for ICD-9-CM</source><access-date>2024-12-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.oit.va.gov/Services/TRM/ToolPage.aspx?tid=7602">https://www.oit.va.gov/Services/TRM/ToolPage.aspx?tid=7602</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bennett</surname><given-names>CC</given-names> </name></person-group><article-title>Utilizing RxNorm to support practical computing applications: capturing medication history in live electronic health 
records</article-title><source>J Biomed Inform</source><year>2012</year><month>08</month><volume>45</volume><issue>4</issue><fpage>634</fpage><lpage>641</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2012.02.011</pub-id><pub-id pub-id-type="medline">22426081</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><article-title>National Drug Code Directory</article-title><source>U.S. Food and Drug Association</source><access-date>2024-12-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.fda.gov/drugs/drug-approvals-and-databases/national-drug-code-directory">https://www.fda.gov/drugs/drug-approvals-and-databases/national-drug-code-directory</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><source>arXiv</source><access-date>2025-08-18</access-date><comment>Preprint posted online on  May 24, 2019</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1810.04805">http://arxiv.org/abs/1810.04805</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>W</given-names> </name><name 
name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><etal/></person-group><article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title><source>Bioinformatics</source><year>2020</year><month>02</month><day>15</day><volume>36</volume><issue>4</issue><fpage>1234</fpage><lpage>1240</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id><pub-id pub-id-type="medline">31501885</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hong</surname><given-names>C</given-names> </name><name name-style="western"><surname>Rush</surname><given-names>E</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Clinical knowledge extraction via sparse embedding regression (KESER) with multi-center large scale electronic health record data</article-title><source>NPJ Digit Med</source><year>2021</year><month>10</month><day>27</day><volume>4</volume><issue>1</issue><fpage>151</fpage><pub-id pub-id-type="doi">10.1038/s41746-021-00519-z</pub-id><pub-id pub-id-type="medline">34707226</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kartchner</surname><given-names>D</given-names> </name><name name-style="western"><surname>Christensen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Humpherys</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wade</surname><given-names>S</given-names> </name></person-group><article-title>Code2Vec: embedding and clustering medical diagnosis data</article-title><conf-name>2017 IEEE International Conference on Healthcare Informatics (ICHI)</conf-name><conf-date>Aug 
23-26, 2017</conf-date><conf-loc>Park City, UT, USA</conf-loc><fpage>386</fpage><lpage>390</lpage><pub-id pub-id-type="doi">10.1109/ICHI.2017.94</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Bahadori</surname><given-names>MT</given-names> </name><name name-style="western"><surname>Searles</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Multi-layer representation learning for medical concepts</article-title><conf-name>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name><conf-date>Aug 13, 2016</conf-date><conf-loc>San Francisco California USA</conf-loc><publisher-name>ACM</publisher-name><fpage>1495</fpage><lpage>1504</lpage><pub-id pub-id-type="doi">10.1145/2939672.2939823</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>H&#x00E9;naff</surname><given-names>OJ</given-names> </name><etal/></person-group><article-title>Data-efficient image recognition with contrastive predictive coding</article-title><comment>Preprint posted online on 2019</comment><pub-id pub-id-type="doi">10.48550/ARXIV.1905.09272</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hong</surname><given-names>C</given-names> </name><name name-style="western"><surname>Pencina</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Wojdyla</surname><given-names>DM</given-names> </name><etal/></person-group><article-title>Predictive accuracy of stroke risk prediction models across Black and White race, sex, and age 
groups</article-title><source>JAMA</source><year>2023</year><month>01</month><day>24</day><volume>329</volume><issue>4</issue><fpage>306</fpage><lpage>317</lpage><pub-id pub-id-type="doi">10.1001/jama.2022.24683</pub-id><pub-id pub-id-type="medline">36692561</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Habibi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Weber</surname><given-names>L</given-names> </name><name name-style="western"><surname>Neves</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wiegandt</surname><given-names>DL</given-names> </name><name name-style="western"><surname>Leser</surname><given-names>U</given-names> </name></person-group><article-title>Deep learning with word embeddings improves biomedical named entity recognition</article-title><source>Bioinformatics</source><year>2017</year><month>07</month><day>15</day><volume>33</volume><issue>14</issue><fpage>i37</fpage><lpage>i48</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btx228</pub-id><pub-id pub-id-type="medline">28881963</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>C</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>T</given-names> </name><name name-style="western"><surname>Dligach</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>G</given-names> </name></person-group><article-title>A BERT-based universal model for both within-and cross-sentence clinical temporal relation extraction</article-title><conf-name>Proceedings of the 2nd Clinical Natural 
Language Processing Workshop</conf-name><conf-date>Jun 2019</conf-date><conf-loc>Minneapolis, Minnesota, USA</conf-loc><fpage>65</fpage><lpage>71</lpage><pub-id pub-id-type="doi">10.18653/v1/W19-1908</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wiese</surname><given-names>G</given-names> </name><name name-style="western"><surname>Weissenborn</surname><given-names>D</given-names> </name><name name-style="western"><surname>Neves</surname><given-names>M</given-names> </name></person-group><article-title>Neural domain adaptation for biomedical question answering</article-title><conf-name>Proceedings of the 21st Conference on Computational Natural Language Learning (CoNLL 2017)</conf-name><conf-date>2017</conf-date><conf-loc>Vancouver, Canada</conf-loc><pub-id pub-id-type="doi">10.18653/v1/K17-1029</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Sutton</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cristianini</surname><given-names>N</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Maglogiannis</surname><given-names>I</given-names> </name><name name-style="western"><surname>Iliadis</surname><given-names>L</given-names> </name><name name-style="western"><surname>Pimenidis</surname><given-names>E</given-names> </name></person-group><article-title>On the learnability of concepts: with applications to comparing word embedding algorithms</article-title><source>Artificial Intelligence Applications and Innovations</source><year>2020</year><volume>584</volume><publisher-name>Springer International Publishing</publisher-name><fpage>420</fpage><lpage>432</lpage><pub-id 
pub-id-type="doi">10.1007/978-3-030-49186-4_35</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Mueller</surname><given-names>J</given-names> </name><name name-style="western"><surname>Thyagarajan</surname><given-names>A</given-names> </name></person-group><article-title>Siamese recurrent architectures for learning sentence similarity</article-title><conf-name>Proceedings of the AAAI conference on artificial intelligence</conf-name><conf-date>2016</conf-date><pub-id pub-id-type="doi">10.1609/aaai.v30i1.10350</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Corley</surname><given-names>C</given-names> </name><name name-style="western"><surname>Mihalcea</surname><given-names>R</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Dolan</surname><given-names>B</given-names> </name><name name-style="western"><surname>Dagan</surname><given-names>I</given-names> </name></person-group><article-title>Measuring the semantic similarity of texts</article-title><conf-name>Proceedings of the ACL Workshop on Empirical Modeling of Semantic Equivalence and Entailment</conf-name><conf-date>Jun 18, 2005</conf-date><conf-loc>Ann Arbor, Michigan</conf-loc><pub-id pub-id-type="doi">10.3115/1631862.1631865</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Nair</surname><given-names>V</given-names> </name><name name-style="western"><surname>Hinton</surname><given-names>GE</given-names> </name></person-group><article-title>Rectified linear units improve restricted boltzmann 
machines</article-title><access-date>2025-08-18</access-date><conf-name>Proceedings of the 27th International Conference on Machine Learning (ICML-10)</conf-name><conf-date>2010</conf-date><conf-loc>Madison, Wisconsin, USA</conf-loc><fpage>807</fpage><lpage>814</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/10.5555/3104322.3104425">https://dl.acm.org/doi/10.5555/3104322.3104425</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Sabuncu</surname><given-names>MR</given-names> </name></person-group><article-title>Generalized cross entropy loss for training deep neural networks with noisy labels</article-title><source>Adv Neural Inf Process Syst</source><year>2018</year><month>12</month><volume>32</volume><fpage>8792</fpage><lpage>8802</lpage><pub-id pub-id-type="medline">39839708</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kingma</surname><given-names>DP</given-names> </name><name name-style="western"><surname>Ba</surname><given-names>J</given-names> </name></person-group><article-title>Adam: a method for stochastic optimization</article-title><source>arXiv</source><comment>Preprint posted online on 2014</comment></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kornblith</surname><given-names>S</given-names> </name><name name-style="western"><surname>Norouzi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hinton</surname><given-names>G</given-names> 
</name></person-group><article-title>A simple framework for contrastive learning of visual representations</article-title><source>arXiv</source><access-date>2025-08-18</access-date><comment>Preprint posted online on Feb 13, 2020</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2002.05709">http://arxiv.org/abs/2002.05709</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2002.05709</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Vinyals</surname><given-names>O</given-names> </name></person-group><article-title>Representation learning with contrastive predictive coding</article-title><comment>Preprint posted online on 2018</comment><pub-id pub-id-type="doi">10.48550/ARXIV.1807.03748</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bradley</surname><given-names>AP</given-names> </name></person-group><article-title>The use of the area under the ROC curve in the evaluation of machine learning algorithms</article-title><source>Pattern Recognit</source><year>1997</year><month>07</month><volume>30</volume><issue>7</issue><fpage>1145</fpage><lpage>1159</lpage><pub-id pub-id-type="doi">10.1016/S0031-3203(96)00142-2</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hanley</surname><given-names>JA</given-names> </name><name name-style="western"><surname>McNeil</surname><given-names>BJ</given-names> </name></person-group><article-title>The meaning and use of the area under a receiver operating characteristic (ROC) 
curve</article-title><source>Radiology</source><year>1982</year><month>04</month><volume>143</volume><issue>1</issue><fpage>29</fpage><lpage>36</lpage><pub-id pub-id-type="doi">10.1148/radiology.143.1.7063747</pub-id><pub-id pub-id-type="medline">7063747</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pedregosa</surname><given-names>F</given-names> </name><name name-style="western"><surname>Varoquaux</surname><given-names>G</given-names> </name><name name-style="western"><surname>Gramfort</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Scikit-learn: machine learning in python</article-title><source>J Mach Learn Res</source><year>2011</year><access-date>2025-08-18</access-date><volume>12</volume><fpage>2825</fpage><lpage>2830</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://jmlr.org/papers/volume12/pedregosa11a/pedregosa11a.pdf">https://jmlr.org/papers/volume12/pedregosa11a/pedregosa11a.pdf</ext-link></comment></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Krizhevsky</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name><name name-style="western"><surname>Hinton</surname><given-names>GE</given-names> </name></person-group><article-title>Imagenet classification with deep convolutional neural networks</article-title><source>Adv Neural Inf Process Syst</source><year>2012</year><volume>25</volume></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Efron</surname><given-names>B</given-names> </name><name name-style="western"><surname>Tibshirani</surname><given-names>RJ</given-names> 
</name></person-group><article-title>An introduction to the bootstrap</article-title><source>Chapman and Hall/CRC</source><year>1994</year><pub-id pub-id-type="doi">10.1201/9780429246593</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Spjuth</surname><given-names>O</given-names> </name><name name-style="western"><surname>Krestyaninova</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hastings</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Harmonising and linking biomedical and clinical data across disparate data archives to enable integrative cross-biobank research</article-title><source>Eur J Hum Genet</source><year>2016</year><month>04</month><volume>24</volume><issue>4</issue><fpage>521</fpage><lpage>528</lpage><pub-id pub-id-type="doi">10.1038/ejhg.2015.165</pub-id><pub-id pub-id-type="medline">26306643</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Prabhu</surname><given-names>SP</given-names> </name><name name-style="western"><surname>Popp</surname><given-names>ZT</given-names> </name><etal/></person-group><article-title>A natural language processing approach to support biomedical data harmonization: leveraging large language models</article-title><source>PLoS ONE</source><year>2025</year><volume>20</volume><issue>7</issue><fpage>e0328262</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0328262</pub-id><pub-id pub-id-type="medline">40705832</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Yang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>D</given-names> </name><name name-style="western"><surname>Cai</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Robust automated harmonization of heterogeneous data through ensemble machine learning: algorithm development and validation study</article-title><source>JMIR Med Inform</source><year>2025</year><month>01</month><day>22</day><volume>13</volume><fpage>e54133</fpage><pub-id pub-id-type="doi">10.2196/54133</pub-id><pub-id pub-id-type="medline">39844378</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tinn</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Domain-specific language model pretraining for biomedical natural language processing</article-title><source>ACM Trans Comput Healthcare</source><year>2022</year><month>01</month><day>31</day><volume>3</volume><issue>1</issue><fpage>1</fpage><lpage>23</lpage><pub-id pub-id-type="doi">10.1145/3458754</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Z</given-names> </name></person-group><article-title>Transfer learning in biomedical natural language processing: an evaluation of BERT and elmo on ten benchmarking datasets</article-title><conf-name>Proceedings of the 18th BioNLP Workshop and 
Shared Task</conf-name><conf-date>2019</conf-date><conf-loc>Florence, Italy</conf-loc><publisher-name>Association for Computational Linguistics</publisher-name><fpage>58</fpage><lpage>65</lpage><pub-id pub-id-type="doi">10.18653/v1/W19-5006</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bradwell</surname><given-names>KR</given-names> </name><name name-style="western"><surname>Wooldridge</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Amor</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Harmonizing units and values of quantitative data elements in a very large nationally pooled electronic health record (EHR) dataset</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>06</month><day>14</day><volume>29</volume><issue>7</issue><fpage>1172</fpage><lpage>1182</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocac054</pub-id><pub-id pub-id-type="medline">35435957</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Song</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Cai</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mondal</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Sahoo</surname><given-names>JP</given-names> </name></person-group><article-title>A comprehensive survey of few-shot learning: evolution, applications, challenges, and opportunities</article-title><source>ACM Comput Surv</source><year>2023</year><month>12</month><day>31</day><volume>55</volume><issue>13s</issue><fpage>1</fpage><lpage>40</lpage><pub-id pub-id-type="doi">10.1145/3582688</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhuang</surname><given-names>F</given-names> </name><name name-style="western"><surname>Qi</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Duan</surname><given-names>K</given-names> </name><etal/></person-group><article-title>A comprehensive survey on transfer learning</article-title><source>Proc IEEE</source><year>2021</year><month>01</month><volume>109</volume><issue>1</issue><fpage>43</fpage><lpage>76</lpage><pub-id pub-id-type="doi">10.1109/JPROC.2020.3004555</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rohanian</surname><given-names>O</given-names> </name><name name-style="western"><surname>Nouriborji</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kouchaki</surname><given-names>S</given-names> </name><name name-style="western"><surname>Clifton</surname><given-names>DA</given-names> </name></person-group><article-title>On the effectiveness of 
compact biomedical transformers</article-title><source>Bioinformatics</source><year>2023</year><month>03</month><day>1</day><volume>39</volume><issue>3</issue><fpage>btad103</fpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btad103</pub-id><pub-id pub-id-type="medline">36825820</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Graves</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mohamed</surname><given-names>A r</given-names> </name><name name-style="western"><surname>Hinton</surname><given-names>G</given-names> </name><name name-style="western"><surname>Mohamed</surname><given-names>A r</given-names> </name></person-group><article-title>Speech recognition with deep recurrent neural networks</article-title><conf-name>2013 IEEE International Conference on Acoustics, Speech and Signal Processing</conf-name><conf-date>May 26-31, 2013</conf-date><conf-loc>Vancouver, BC, Canada</conf-loc><fpage>6645</fpage><lpage>6649</lpage><pub-id pub-id-type="doi">10.1109/ICASSP.2013.6638947</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hochreiter</surname><given-names>S</given-names> </name><name name-style="western"><surname>Schmidhuber</surname><given-names>J</given-names> </name></person-group><article-title>Long short-term memory</article-title><source>Neural Comput</source><year>1997</year><month>11</month><day>15</day><volume>9</volume><issue>8</issue><fpage>1735</fpage><lpage>1780</lpage><pub-id pub-id-type="doi">10.1162/neco.1997.9.8.1735</pub-id><pub-id pub-id-type="medline">9377276</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><source>arXiv</source><comment>Preprint posted online on 2020</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2005.14165</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chowdhery</surname><given-names>A</given-names> </name><name name-style="western"><surname>Narang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><etal/></person-group><article-title>PaLM: scaling language modeling with pathways</article-title><source>J Mach Learn Res</source><year>2023</year><month>08</month><access-date>2025-08-18</access-date><fpage>1</fpage><lpage>113</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.jmlr.org/papers/volume24/22-1144/22-1144.pdf">https://www.jmlr.org/papers/volume24/22-1144/22-1144.pdf</ext-link></comment></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lavril</surname><given-names>T</given-names> </name><name name-style="western"><surname>Izacard</surname><given-names>G</given-names> </name><etal/></person-group><article-title>LLaMA: open and efficient foundation language models</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 27, 2023</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2302.13971</pub-id></nlm-citation></ref><ref 
id="ref58"><label>58</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kanakarajan</surname><given-names>K raj</given-names> </name><name name-style="western"><surname>Kundumani</surname><given-names>B</given-names> </name><name name-style="western"><surname>Abraham</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sankarasubbu</surname><given-names>M</given-names> </name></person-group><article-title>BioSimCSE: biomedical sentence embeddings using contrastive learning</article-title><conf-name>Proceedings of the 13th International Workshop on Health Text Mining and Information Analysis (LOUHI)</conf-name><conf-date>2022</conf-date><conf-loc>Abu Dhabi, United Arab Emirates (Hybrid)</conf-loc><fpage>81</fpage><lpage>86</lpage><pub-id pub-id-type="doi">10.18653/v1/2022.louhi-1.10</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Additional material.</p><media xlink:href="formative_v9i1e75608_app1.docx" xlink:title="DOCX File, 505 KB"/></supplementary-material></app-group></back></article>