<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i1e52200</article-id>
      <article-id pub-id-type="pmid">38277207</article-id>
      <article-id pub-id-type="doi">10.2196/52200</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Patient Phenotyping for Atopic Dermatitis With Transformers and Machine Learning: Algorithm Development and Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zaghir</surname>
            <given-names>Jamil</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chapman</surname>
            <given-names>Alec</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Andrew</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0003-1310-0107</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Fulton</surname>
            <given-names>Rachel</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9229-7259</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Hwang</surname>
            <given-names>Sy</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3851-9521</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Margolis</surname>
            <given-names>David J</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0506-8085</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Mowery</surname>
            <given-names>Danielle</given-names>
          </name>
          <degrees>MS, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>University of Pennsylvania</institution>
            <addr-line>A206 Richards Building</addr-line>
            <addr-line>3700 Hamilton Walk</addr-line>
            <addr-line>Philadelphia, PA, 19104</addr-line>
            <country>United States</country>
            <phone>1 2157466677</phone>
            <email>dlmowery@pennmedicine.upenn.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3802-4457</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>University of Pennsylvania</institution>
        <addr-line>Philadelphia, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Lankenau Medical Center</institution>
        <addr-line>Wynnewood, PA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Danielle Mowery <email>dlmowery@pennmedicine.upenn.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>26</day>
        <month>1</month>
        <year>2024</year>
      </pub-date>
      <volume>8</volume>
      <elocation-id>e52200</elocation-id>
      <history>
        <date date-type="received">
          <day>31</day>
          <month>8</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>6</day>
          <month>10</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>30</day>
          <month>11</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>4</day>
          <month>12</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Andrew Wang, Rachel Fulton, Sy Hwang, David J Margolis, Danielle Mowery. Originally published in JMIR Formative Research (https://formative.jmir.org), 26.01.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2024/1/e52200" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Atopic dermatitis (AD) is a chronic skin condition that millions of people around the world live with each day. Performing research into identifying the causes and treatment for this disease has great potential to provide benefits for these individuals. However, AD clinical trial recruitment is not a trivial task due to the variance in diagnostic precision and phenotypic definitions leveraged by different clinicians, as well as the time spent finding, recruiting, and enrolling patients by clinicians to become study participants. Thus, there is a need for automatic and effective patient phenotyping for cohort recruitment.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to present an approach for identifying patients whose electronic health records suggest that they may have AD.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We created a vectorized representation of each patient and trained various supervised machine learning methods to classify when a patient has AD. Each patient is represented by a vector of either probabilities or binary values, where each value indicates whether they meet a different criterion for AD diagnosis.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The most accurate AD classifier performed with a class-balanced accuracy of 0.8036, a precision of 0.8400, and a recall of 0.7500 when using XGBoost (Extreme Gradient Boosting).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Creating an automated approach for identifying patient cohorts has the potential to accelerate, standardize, and automate the process of patient recruitment for AD studies, thereby reducing clinician burden and informing the discovery of better treatment options for AD.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>atopic dermatitis</kwd>
        <kwd>classification</kwd>
        <kwd>classifier</kwd>
        <kwd>dermatitis</kwd>
        <kwd>dermatology</kwd>
        <kwd>EHR</kwd>
        <kwd>electronic health record</kwd>
        <kwd>health records</kwd>
        <kwd>health</kwd>
        <kwd>informatics</kwd>
        <kwd>machine learning</kwd>
        <kwd>natural language processing</kwd>
        <kwd>NLP</kwd>
        <kwd>patient phenotyping</kwd>
        <kwd>phenotype</kwd>
        <kwd>skin</kwd>
        <kwd>transformer</kwd>
        <kwd>transformers</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Atopic dermatitis (AD) is a common skin disease with a population prevalence of approximately 30% [<xref ref-type="bibr" rid="ref1">1</xref>]. It is often diagnosed in early childhood, but onset can occur at any age [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. Symptoms of AD include inflamed, red, irritated, and itchy skin and can cause significant physical and emotional distress. AD is often associated with other allergic illnesses, including asthma, seasonal allergies, and food allergies [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref7">7</xref>].</p>
        <p>AD is thought to be associated with skin barrier dysfunction and immune dysregulation [<xref ref-type="bibr" rid="ref5">5</xref>]. AD has also been associated with genetic variation as well as environmental factors [<xref ref-type="bibr" rid="ref5">5</xref>]. Classic treatment for AD has included the use of moisturizers, topical steroids, and other topical anti-inflammatory agents [<xref ref-type="bibr" rid="ref8">8</xref>]. However, in the past few years, there have been significant treatment advances, which include systemic agents that alter immune function, such as dupilumab. Therefore, due to the widespread nature of AD, the need for improved knowledge of the natural history of AD, the need to understand the efficacy of new treatments, and the need to develop new treatments, there is an urgent need to understand the clinical course of individuals with AD. However, identifying appropriate cohorts of patients for medical studies can be difficult and time-consuming. Because AD is so common as well as being diagnosed and managed by many different clinicians in varying health care settings, a potential source population would be patients from a health system’s electronic health records (EHRs) [<xref ref-type="bibr" rid="ref9">9</xref>]. Investigators often ascertain a patient’s illness using International Classification of Disease (ICD) hospital billing codes as recorded during routine office visits. However, it has been previously demonstrated that reliance on ICD codes is not an accurate method for the ascertainment of study cohorts with AD [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. Furthermore, epidemiologic studies have used different methods and algorithms, including the UK Working Party (UKWP) diagnostic criteria and the Hanifin and Rajka (HR) criteria [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. 
Investigators attempting to conduct clinical trials and observational studies have also relied on manual, large-scale chart review, a process that is inefficient, slow, and tedious [<xref ref-type="bibr" rid="ref9">9</xref>]. This motivates the need for a standard method to accurately, automatically, and efficiently identify potential patient cohorts from their text medical records by using natural language processing (NLP) and machine learning (ML) techniques.</p>
      </sec>
      <sec>
        <title>Previous Work</title>
        <p>Previously, researchers aimed to phenotype patients with AD using EHR data. In particular, Gustafson et al [<xref ref-type="bibr" rid="ref10">10</xref>] trained a logistic regression model with lasso regularization to identify cases of AD from the Northwestern Medical Enterprise Data Warehouse, which contained both structured data (ICD Ninth and Tenth Revision codes, medication prescriptions, and laboratory results) as well as unstructured data (clinician notes from patient encounters). A gold standard diagnosis was assigned to each patient in their data set by 2 rheumatologists following a chart review when using the UKWP criteria and (alternatively) when using the HR criteria.</p>
        <p>Although similar, this study differs in the following ways: (1) we survey a wide range of supervised ML algorithms as opposed to only using lasso regularized logistic regression, (2) we use transformer embeddings of sentences to represent information in each patient’s records and aggregate these embeddings with multilayer perceptron (MLP) networks to create a patient vector representation for patient phenotyping, and (3) we performed an ablation study of processing methods to compare the impact on performance in using a probability-based versus binary label of whether each patient meets various AD diagnostic criteria when creating a vector to represent each patient for input to our final patient phenotyping algorithms.</p>
      </sec>
      <sec>
        <title>Contributions</title>
        <p>The primary contributions of this study are as follows:</p>
        <list list-type="bullet">
          <list-item>
            <p>We introduce and validate a rules-based approach for aggregating information from patient EHR data to generate binary-valued patient vectors that are used with standard ML algorithms for patient phenotyping.</p>
          </list-item>
          <list-item>
            <p>We introduce and validate a transformer-based approach for aggregating information and patient phenotyping by using “Bidirectional Encoder Representations from Transformers” (BERT) models (ie, BERT Base Uncased and BioClinical BERT) to generate patient vectors of probabilities, which are used with standard ML algorithms for patient phenotyping.</p>
          </list-item>
          <list-item>
            <p>We compare the aforementioned approaches to (1) discern whether a transformer model pretrained on clinical text can provide performance benefits over a transformer model not pretrained on clinical text, and (2) discern whether a transformer-based approach for aggregating information could outperform a rules-based approach for aggregating information.</p>
          </list-item>
          <list-item>
            <p>We demonstrate that MLP networks can be used with BERT sentence embeddings to identify which sentences in patient records are relevant to the diagnosis of AD. These MLP networks can then be used during clinician chart review to highlight sentences that are relevant to diagnosis and therefore accelerate the process of chart review during clinical trial recruitment.</p>
          </list-item>
        </list>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>To predict whether a patient may qualify as a participant for an AD study based on their EHR, we first assigned patients in our data set to either the training or testing sets. Then, for each patient, we aggregated the text from their EHR and constructed a vector representation of clinical features indicative of AD according to the UKWP criteria. Lastly, we leveraged our vectorized patient representations to train several ML classifiers to predict whether each patient has AD. In the following sections, we detail this process.</p>
      </sec>
      <sec>
        <title>Data Set Creation</title>
        <p>We initially sampled 2000 patients and their clinical records from Epic Clarity, Penn Medicine’s EHR database. We selected Penn Medicine patients who were diagnosed with a subset of AD-related ICD codes [<xref ref-type="bibr" rid="ref9">9</xref>]. As shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>, of the 2000 sampled patients, we identified 1926 patients who had clinical notes for processing. We then deidentified these patient records according to the Safe Harbor method using the “Protected Health Information filter” (Philter) [<xref ref-type="bibr" rid="ref13">13</xref>]. Each patient in the data set was also manually reviewed and labeled according to the UKWP diagnostic criteria for AD. According to the UKWP criteria, in order to qualify as having AD, a patient must have an itchy skin condition along with 3 or more of the following: a history of flexural involvement, a history of asthma or hay fever, a history of dry skin, an onset of rash when aged 2 years or younger, or a visible flexural dermatitis. Our data set was validated by 2 clinicians (a board-certified dermatologist [DJM] and a medical fellow [RF]), resulting in 137 patients with AD and 1789 patients without AD.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Waterfall diagram of cohort. AD: atopic dermatitis.</p>
          </caption>
          <graphic xlink:href="formative_v8i1e52200_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Training and Testing Split</title>
        <p>We first created our training set. Due to the heavy class imbalance in our data set, we decided to create a balanced training set to prevent biasing the model toward either patients with AD or patients without AD. In particular, we created the training set by assigning 80% (109/137) of the 137 patients with AD to our training set and undersampling the patients without AD to match the number of patients with AD. The remaining 20% (28/137) of the 137 patients were assigned to both of our testing sets. This resulted in a training set that had 109 patients with AD and 109 patients without AD.</p>
        <p>Next, we created 2 testing sets. The first testing set was class-balanced and was intended to show how our patient classification model can generalize to unseen samples if the class distribution is kept the same. The second testing set was class-imbalanced (28/91, 30% of patients with AD and 63/91, 70% of patients without AD) and was intended to show how our patient classification model can perform when the class-distribution of the data set matches the prevalence of AD in the United States.</p>
        <p>We created the first (balanced) testing set by including the 20% (28/137; previously reserved for testing) of the 137 patients with AD and combining them with an equal number of patients without AD who have not been used during training. This resulted in a balanced testing set that had 28 patients with AD and 28 patients without AD.</p>
        <p>Furthermore, we created the second (unbalanced) testing set by including the same 20% (28/137) patients with AD but instead combining them with a greater number of patients without AD to match the 30% prevalence rate of AD found in the United States [<xref ref-type="bibr" rid="ref1">1</xref>]. This resulted in an unbalanced testing set with 28 patients who have AD and 63 patients without AD.</p>
        <p>We chose not to create a separate hyperparameter tuning set and instead applied cross-validation for hyperparameter tuning on the training set due to the data-scarce setting of our experiments.</p>
      </sec>
      <sec>
        <title>Vector Representation for AD Classification</title>
        <p>Next, we created a vector representation for each patient. We performed 3 experiments to compare different methods of creating each patient’s vector representation (<xref rid="figure2" ref-type="fig">Figure 2</xref>).</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Atopic dermatitis (AD) phenotyping pipeline across all 3 experiments. BERT: Bidirectional Encoder Representations from Transformers; MLP: multilayer perceptron.</p>
          </caption>
          <graphic xlink:href="formative_v8i1e52200_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <sec>
          <title>Description of Patient Vector Representation</title>
          <p>Each patient’s vector representation is 8 elements long, where each element of the vector is representative of whether the patient fulfills a different AD diagnostic criterion based on the UKWP criteria as well as clinician feedback (<xref ref-type="table" rid="table1">Table 1</xref>). Across all 3 experiments, each element in the patient vector corresponds to a distinct classification task; however, in experiments 1 and 2, each element is a probability, and in experiment 3, each element is a binary value.</p>
          <p>In experiments 1 and 2, elements 1-8 of each patient’s vector represent the highest probability that any sentence in the patient’s EHR mentions (1) AD or synonyms of AD, (2) keywords that suggest hay fever allergies, (3) keywords that suggest atopic allergies, (4) keywords that suggest eczema or rashes, (5) keywords that indicate dry or itchy skin, (6) keywords denoting nonasthma medications, (7) keywords suggesting the presence of asthma, and (8) keywords indicating the use of asthma medications.</p>
          <p>In experiment 3, instead of each element representing a probability, each element represents a binary value of whether there was at least 1 sentence in the corresponding patient record suggesting the presence of the corresponding AD indicator.</p>
          <p>In the first 2 experiments, each patient’s vector elements represent probabilities (ranging from 0 to 1). Each probability value is derived from a distinct MLP classifier. Experiments 1 and 2 were performed to compare the use of 2 BERT models (BERT Base Uncased [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>] in experiment 1 and BioClinical BERT [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>] in experiment 2) for creating sentence embeddings used to train MLP networks (or alternatively, sentence classifiers). A separate MLP network is trained for each element of the patient vector. Each MLP network is trained to distinguish sentences in 1 of the 8 AD indicator categories from sentences in all other categories. Furthermore, <italic>medSpacy</italic> (Eyre et al [<xref ref-type="bibr" rid="ref18">18</xref>]) was used to split documents into sentences and label each sentence with different categories. After each sentence classifier is trained, embeddings of all sentences in each patient’s full EHR are passed through each sentence classifier, and an aggregation function (max operator) is used to assign a value to each element of each patient’s vector. Our goal in experiments 1 and 2 was to test the hypothesis that a BERT model pretrained on clinical text (BioClinical BERT) could outperform a BERT model trained on nonclinical text (BERT Base Uncased).</p>
          <p>In experiment 3, each patient’s vector elements are binary (either 0 or 1). Each element corresponds to a diagnostic criterion and represents whether <italic>medSpacy</italic> was able to identify at least 1 sentence in the patient’s record with a keyword and affirming context that suggests the patient meets the corresponding diagnostic criterion. Our goal was to conduct an ablation study to test the hypothesis that an AD phenotyping classifier leveraging BERT embeddings to create the patient vector representation will better discern whether a patient has AD than an AD phenotyping classifier without BERT embeddings.</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Meaning of each patient vector element.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="130"/>
              <col width="870"/>
              <thead>
                <tr valign="top">
                  <td>Element</td>
                  <td>AD<sup>a</sup> indicator (diagnostic criteria)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>1</td>
                  <td>EHR<sup>b</sup> directly mentions patient has AD</td>
                </tr>
                <tr valign="top">
                  <td>2</td>
                  <td>Patient has hay fever allergies</td>
                </tr>
                <tr valign="top">
                  <td>3</td>
                  <td>Patient has atopic allergies</td>
                </tr>
                <tr valign="top">
                  <td>4</td>
                  <td>Patient has eczema or rashes</td>
                </tr>
                <tr valign="top">
                  <td>5</td>
                  <td>Patient has dry or itchy skin</td>
                </tr>
                <tr valign="top">
                  <td>6</td>
                  <td>Patient uses nonasthma medications related to treating AD</td>
                </tr>
                <tr valign="top">
                  <td>7</td>
                  <td>Patient has asthma</td>
                </tr>
                <tr valign="top">
                  <td>8</td>
                  <td>Patient uses asthma medications</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table1fn1">
                <p><sup>a</sup>AD: atopic dermatitis.</p>
              </fn>
              <fn id="table1fn2">
                <p><sup>b</sup>EHR: electronic health record.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Preprocessing for Experiments 1-3</title>
          <p>Before each experiment, we applied the same preprocessing steps to assign 1 or more labels to each sentence in our corpus of documents in both our training and testing sets. Each sentence can be labeled as applying to 1, multiple, or none of the 8 AD indicators previously defined.</p>
          <p>For each of the 8 diagnostic criteria, we first created a list of keywords and phrases (for each vector element) that suggested the presence of the corresponding diagnostic criterion. Next, we used <italic>medSpacy</italic> with the ConText (Harkema et al [<xref ref-type="bibr" rid="ref20">20</xref>]) algorithm to split each document into sentences and categorize each sentence [<xref ref-type="bibr" rid="ref18">18</xref>]. Using <italic>medSpacy</italic> allows us to obtain sentences that suggest the presence of each of the 8 diagnostic criteria due to <italic>medSpacy</italic>’s use of regex and rules-based keyword matching. Furthermore, <italic>medSpacy</italic>’s implementation of the ConText algorithm allows us to distinguish sentences that affirm an indicator from negated assertions. We define negated sentences for each AD indicator as sentences where the indicator is ruled out, sentences where the indicator is experienced by someone other than the patient, and sentences where the existence of the indicator is hypothetical [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref22">22</xref>].</p>
          <p>After assigning 1 or more categorical labels to each sentence with <italic>medSpacy</italic>, we then performed 3 different experiments to create a vectorized representation of each patient.</p>
          <p>In <xref ref-type="table" rid="table2">Tables 2</xref> and <xref ref-type="table" rid="table3">3</xref>, we include some statistics on the data set obtained after preprocessing.</p>
          <p>As shown in <xref ref-type="table" rid="table2">Table 2</xref>, patients with AD have approximately twice as many sentences as patients without AD. The average number of documents and sentences is the same (within patients with AD and similarly within patients without AD) between BERT Base Uncased and BioClinical BERT experiments because these values are only dependent on <italic>medSpacy</italic>’s preprocessing of documents. Furthermore, using BioClinical BERT to tokenize sentences tends to yield more tokens (on average) per patient and per document. We hypothesize that this is because the BioClinical BERT tokenizer is able to recognize more clinical terms and therefore yields more tokens for the same sentence than using the tokenizer from BERT Base Uncased.</p>
          <p>As shown in <xref ref-type="table" rid="table3">Table 3</xref>, sentences in category 5 (relating to dry or itchy skin) tend to have the most tokens, whereas sentences in category 6 (relating to the use of nonasthma medications related to treating AD) tend to have the least number of tokens. We hypothesize that this is because categories where the average number of tokens per sentence is greater tend to correspond to more general categories where many terms and sentences could apply, whereas categories where the average number of tokens per sentence is lower tend to correspond to more specific categories, thus yielding a lower average number of tokens per sentence. Additionally, similarly to before, we can see that using BioClinical BERT tends to result in a greater number of tokens per sentence than using BERT Base Uncased for the same sentence.</p>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Differences in the number of documents, sentences, and tokens between patients with atopic dermatitis (AD) and those without AD.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="430"/>
              <col width="140"/>
              <col width="150"/>
              <col width="130"/>
              <col width="150"/>
              <thead>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td colspan="2">Patients with AD</td>
                  <td colspan="2">Patients without AD</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>BERT<sup>a</sup> Uncased</td>
                  <td>BioClinical BERT</td>
                  <td>BERT Uncased</td>
                  <td>BioClinical BERT</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Average number of documents (per patient)</td>
                  <td>23.44</td>
                  <td>23.44</td>
                  <td>7.99</td>
                  <td>7.99</td>
                </tr>
                <tr valign="top">
                  <td>Average number of sentences (per patient)</td>
                  <td>392.99</td>
                  <td>392.99</td>
                  <td>193.69</td>
                  <td>193.69</td>
                </tr>
                <tr valign="top">
                  <td>Average number of tokens (per patient)</td>
                  <td>16035.39</td>
                  <td>17054.11</td>
                  <td>7241.02</td>
                  <td>7674.35</td>
                </tr>
                <tr valign="top">
                  <td>Average number of sentences (per document)</td>
                  <td>16.77</td>
                  <td>16.77</td>
                  <td>24.25</td>
                  <td>24.25</td>
                </tr>
                <tr valign="top">
                  <td>Average number of tokens (per document)</td>
                  <td>684.16</td>
                  <td>727.63</td>
                  <td>906.45</td>
                  <td>960.69</td>
                </tr>
                <tr valign="top">
                  <td>Average number of tokens (per sentence)</td>
                  <td>40.80</td>
                  <td>43.40</td>
                  <td>37.38</td>
                  <td>39.62</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table2fn1">
                <p><sup>a</sup>BERT: Bidirectional Encoder Representations from Transformers.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <table-wrap position="float" id="table3">
            <label>Table 3</label>
            <caption>
              <p>Mean number of tokens for sentences identified in each category.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="330"/>
              <col width="340"/>
              <col width="330"/>
              <thead>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>BERT<sup>a</sup> Uncased (tokens per sentence), mean</td>
                  <td>BioClinical BERT (tokens per sentence), mean</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Category 1</td>
                  <td>99.49</td>
                  <td>106.16</td>
                </tr>
                <tr valign="top">
                  <td>Category 2</td>
                  <td>81.18</td>
                  <td>92.41</td>
                </tr>
                <tr valign="top">
                  <td>Category 3</td>
                  <td>79.20</td>
                  <td>82.07</td>
                </tr>
                <tr valign="top">
                  <td>Category 4</td>
                  <td>83.74</td>
                  <td>92.55</td>
                </tr>
                <tr valign="top">
                  <td>Category 5</td>
                  <td>106.64</td>
                  <td>112.58</td>
                </tr>
                <tr valign="top">
                  <td>Category 6</td>
                  <td>74.93</td>
                  <td>80.17</td>
                </tr>
                <tr valign="top">
                  <td>Category 7</td>
                  <td>92.85</td>
                  <td>109.40</td>
                </tr>
                <tr valign="top">
                  <td>Category 8</td>
                  <td>76.13</td>
                  <td>83.57</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table3fn1">
                <p><sup>a</sup>BERT: Bidirectional Encoder Representations from Transformers.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Experiments 1 and 2: Patient Vector Construction With BERT Embeddings</title>
          <p>In experiments 1 and 2, we first used the sentences <italic>medSpacy</italic> identified in each category to create class-balanced training and testing sets for each MLP network classifier, as shown in <xref ref-type="table" rid="table4">Table 4</xref>. The same training and testing set was used for both experiment 1 (BioClinical BERT) and experiment 2 (BERT Base Uncased).</p>
          <p>Next, we used pretrained BERT models to generate embeddings of the sentences in each classifier’s training and testing set. We incorporated pretrained BERT models because these models have been trained on a much larger corpus than our existing data set, and BERT provides a context-sensitive embedding of text that other techniques, such as bag of words, do not provide. Furthermore, we used BERT Base Uncased in experiment 1 and Alsentzer et al’s [<xref ref-type="bibr" rid="ref16">16</xref>] BioClinical BERT in experiment 2 because we wanted to quantify how much of a difference in performance using a model pretrained on clinical text can provide over a model that has not been pretrained on clinical text.</p>
          <p>Using these embeddings, we trained an MLP network to distinguish sentence embeddings in each category from sentence embeddings that are not in the corresponding category. Each of our MLPs was trained with the following architecture: a fully connected input layer of shape 768 × 100, followed by a Rectified Linear Unit (ReLU) activation, further followed by a fully connected output layer of shape 100 × 2. We trained each of our MLPs for 10 epochs with the cross-entropy loss function, the stochastic gradient descent (SGD) optimizer, a learning rate of 0.001, and a momentum value of 0.9. The final layer of each MLP can then be used to obtain the probability that any given sentence embedding comes from the category for which the MLP is being trained by passing the logits of the final layer to the softmax function.</p>
          <p>We used the ReLU activation function as defined below, where x is the input to the ReLU function:</p>
          <disp-formula>
            <graphic xlink:href="formative_v8i1e52200_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>We also used the softmax function as defined below, where e is the standard exponential function and <inline-graphic xlink:href="formative_v8i1e52200_fig6.png" xlink:type="simple" mimetype="image"/> is the element at index i of the K-element-long input vector <inline-graphic xlink:href="formative_v8i1e52200_fig7.png" xlink:type="simple" mimetype="image"/>.</p>
          <disp-formula>
            <graphic xlink:href="formative_v8i1e52200_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>We chose to embed our sentences once with pretrained BERT models and then feed these saved embeddings to our MLP networks as opposed to adding a classification head (a linear layer) to the end of our pretrained BERT models. Although doing so only allows us to fine-tune the weights in our MLP network (as opposed to also fine-tuning the weights BERT uses to embed the sentences), doing so allows us to iterate over different experiments more quickly and with less computational power. In particular, we are able to (1) avoid the large computational expense of gradient calculations during backpropagation for all 12 layers of transformers used by BERT when fine-tuning the model, (2) avoid the computational expense of repeatedly generating the same embeddings from BERT multiple times (if we choose to freeze the weights of BERT and only fine-tune an added classification head or linear layer), and (3) iterate more efficiently over different hyperparameter combinations across different experiments with our MLP networks.</p>
          <p>After training a separate MLP network for each of the 8 categories, we generated a vector representation for each patient, where each of the 8 vector elements represents the highest probability that any given sentence in the patient record affirms the presence of the corresponding AD indicator (<xref rid="figure3" ref-type="fig">Figure 3</xref>). We accomplished this by iterating through all sentences in each patient’s full EHR and passing the sentence embedding through each of our 8 trained MLP networks to obtain 8 probabilities for each sentence corresponding to the probability that the sentence affirms each of the 8 AD indicators we previously selected. Then, for each patient and for each AD indicator, we kept the highest probability that any given sentence in the patient’s record affirms the presence of the AD indicator.</p>
          <table-wrap position="float" id="table4">
            <label>Table 4</label>
            <caption>
              <p>Training and testing data set size for each classifier.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="120"/>
              <col width="450"/>
              <col width="430"/>
              <thead>
                <tr valign="top">
                  <td>Classifier</td>
                  <td>Number of training samples, n</td>
                  <td>Number of testing samples, n</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>1</td>
                  <td>2766</td>
                  <td>862</td>
                </tr>
                <tr valign="top">
                  <td>2</td>
                  <td>1302</td>
                  <td>392</td>
                </tr>
                <tr valign="top">
                  <td>3</td>
                  <td>532</td>
                  <td>168</td>
                </tr>
                <tr valign="top">
                  <td>4</td>
                  <td>9822</td>
                  <td>2454</td>
                </tr>
                <tr valign="top">
                  <td>5</td>
                  <td>1466</td>
                  <td>354</td>
                </tr>
                <tr valign="top">
                  <td>6</td>
                  <td>9114</td>
                  <td>2316</td>
                </tr>
                <tr valign="top">
                  <td>7</td>
                  <td>1596</td>
                  <td>520</td>
                </tr>
                <tr valign="top">
                  <td>8</td>
                  <td>4764</td>
                  <td>1070</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Patient vector representations of atopic dermatitis indicators in experiments 1 and 2. BERT: Bidirectional Encoder Representations from Transformers; MLP: multilayer perceptron.</p>
            </caption>
            <graphic xlink:href="formative_v8i1e52200_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Experiment 3: Patient Vector Construction Without BERT Embeddings</title>
          <p>In experiment 3, we generated each patient’s vector representation by assigning a 1 to each element of the patient vector if <italic>medSpacy</italic> with the ConText algorithm identified at least 1 sentence in the patient’s record that affirms or suggests the presence of the AD indicator for which the vector element corresponds (<xref rid="figure4" ref-type="fig">Figure 4</xref>). Experiment 3 was conducted as an ablation study to quantify the performance benefit (if at all) of using contextual BERT text embeddings to generate probability scores that the patient meets various AD indicators.</p>
          <fig id="figure4" position="float">
            <label>Figure 4</label>
            <caption>
              <p>Patient vector representations of atopic dermatitis (AD) indicators in experiment 3.</p>
            </caption>
            <graphic xlink:href="formative_v8i1e52200_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
      </sec>
      <sec>
        <title>AD Phenotyping With Vector Representations</title>
        <p>In all 3 experiments, after generating a vector representation for each patient, we collated each patient’s vector representation with the corresponding label our clinicians assigned the patient when validating the data set. Then, we fed the patient vector representation and corresponding patient label through a variety of classification algorithms. These include logistic regression, support vector machines (SVM), decision trees, random forests, k-nearest neighbor (KNN), Extreme Gradient Boosting (XGBoost), and Adaptive Boosting (AdaBoost). During training for each of the previously mentioned classifiers, we used 5-fold cross validation to determine the best set of hyperparameters to use (as opposed to creating a separate validation set) due to the data-scarce setting of our experiments. We then used the selected hyperparameters to train each algorithm on the entire training set and evaluated performance on the unbalanced and balanced testing sets. In addition to using the previously mentioned classifiers, we also used the stacking algorithm provided by scikit-learn to obtain an ensemble prediction from the different classifiers [<xref ref-type="bibr" rid="ref23">23</xref>]. To quantify performance, we calculated the accuracy, precision, recall, <italic>F</italic><sub>1</sub>-score, negative predictive value (NPV), and specificity of each algorithm on both testing sets.</p>
        <p>We define accuracy, precision, and recall as follows, where TP is the number of true positives, TN is the number of true negatives, FP is the number of false positives, and FN is the number of false negatives:</p>
        <disp-formula>
          <graphic xlink:href="formative_v8i1e52200_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <disp-formula>
          <graphic xlink:href="formative_v8i1e52200_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <disp-formula>
          <graphic xlink:href="formative_v8i1e52200_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>Additionally, we define the <italic>F</italic><sub>1</sub>-score, NPV, and specificity as follows:</p>
        <disp-formula>
          <graphic xlink:href="formative_v8i1e52200_fig12.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <disp-formula>
          <graphic xlink:href="formative_v8i1e52200_fig13.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <disp-formula>
          <graphic xlink:href="formative_v8i1e52200_fig14.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This research protocol was reviewed and approved by the University of Pennsylvania Institute Review Board and determined to be exempt (IRB#843922).</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Performance of MLP Networks</title>
        <p>In this section, we compare the performance of several MLP classifiers in distinguishing sentences relevant to the diagnosis of AD. This corresponds to the “Train separate MLP network (sentence classifier) for each of 8 AD indicators” box in <xref rid="figure2" ref-type="fig">Figure 2</xref>.</p>
        <p>As part of our AD phenotyping pipeline, we trained various MLP networks to classify when a given sentence embedding indicates the presence of an AD indicator, and we compared the performance of BioClinical BERT embeddings to BERT Base Uncased embeddings when training these MLP networks. In both cases, the classifier with the highest accuracy was the classifier for category 1 (sentences with direct mentions of AD). The classifiers with the 2 lowest accuracies were either the classifier for category 5 (sentences with mentions of dry or itchy skin) or the classifier for category 7 (sentences with mentions of asthma) for both the use of BioClinical BERT embeddings and the use of BERT Base Uncased embeddings. However, the accuracy in classifier 7 was lower when using BERT Base Uncased embeddings than when using BioClinical BERT embeddings.</p>
        <p>In experiment 1, the accuracies across AD indicator classifiers ranged from 0.7373 (classifier 5) to 0.9002 (classifier 1), as shown in <xref ref-type="table" rid="table5">Table 5</xref>.</p>
        <p>In experiment 2, the accuracies across AD indicator classifiers ranged from 0.7269 (classifier 7) to 0.9153 (classifier 1), as shown in <xref ref-type="table" rid="table6">Table 6</xref>.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Accuracy of different multilayer perceptron networks in discerning sentences by atopic dermatitis (AD) indicator categories using “BioClinical Bidirectional Encoder Representations from Transformers” sentence embeddings.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="190"/>
            <col width="630"/>
            <col width="180"/>
            <thead>
              <tr valign="top">
                <td>Classifier</td>
                <td>AD indicator</td>
                <td>Accuracy</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>Direct mention of AD</td>
                <td>0.9002</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>Mention of hay fever allergies</td>
                <td>0.8954</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>Mention of atopic allergies</td>
                <td>0.8214</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>Mention of eczema or rash</td>
                <td>0.8284</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>Mention of dry or itchy skin</td>
                <td>0.7373</td>
              </tr>
              <tr valign="top">
                <td>6</td>
                <td>Mention of nonasthma medications</td>
                <td>0.8204</td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td>Mention of asthma</td>
                <td>0.7712</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td>Mention of asthma medications</td>
                <td>0.8299</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Accuracy of different multilayer perceptron networks in discerning sentences by atopic dermatitis (AD) indicator categories using “Bidirectional Encoder Representations from Transformers Base Uncased” sentence embeddings.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="190"/>
            <col width="620"/>
            <col width="190"/>
            <thead>
              <tr valign="top">
                <td>Classifier</td>
                <td>AD indicator</td>
                <td>Accuracy</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>Direct mention of AD</td>
                <td>0.9153</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>Mention of hay fever allergies</td>
                <td>0.7730</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>Mention of atopic allergies</td>
                <td>0.7976</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>Mention of eczema or rash</td>
                <td>0.8439</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>Mention of dry or itchy skin</td>
                <td>0.7288</td>
              </tr>
              <tr valign="top">
                <td>6</td>
                <td>Mention of nonasthma medications</td>
                <td>0.8096</td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td>Mention of asthma</td>
                <td>0.7269</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td>Mention of asthma medications</td>
                <td>0.8738</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>AD Phenotyping With Patient Vector Representations</title>
        <p>In this section, we compare performance in patient classification when using different methods for creating patient vector representations. This encompasses all 3 experiments and corresponds to the “Use vector patient representations to classify whether patient has AD” box in <xref rid="figure2" ref-type="fig">Figure 2</xref>.</p>
        <p>In experiment 1, we leveraged BioClinical BERT sentence embeddings to train various MLP networks to discern sentence embeddings in different AD indicator categories. Then, we applied these trained MLP networks (sentence classifiers) along with an aggregation function (max operator) to assign values to each element of each patient’s vector representation. Lastly, we used each patient’s vector representation with their validated label to train various ML algorithms. We evaluated these on both a balanced and unbalanced testing set.</p>
        <p>As shown in <xref ref-type="table" rid="table7">Table 7</xref>, the accuracy on the balanced testing set ranges from 0.5893 (decision tree) to 0.7321 (logistic regression and SVM).</p>
        <p>As shown in <xref ref-type="table" rid="table8">Table 8</xref>, the range of accuracies on the unbalanced testing set is slightly lower, ranging from 0.5824 (decision tree) to 0.7253 (stacking classifier).</p>
        <p>In experiment 2, we followed the same process as in experiment 1; however, we used BERT Base Uncased instead of BioClinical BERT. As shown in <xref ref-type="table" rid="table9">Table 9</xref>, the accuracy of our AD classifiers on the balanced testing set ranges from 0.5179 (AdaBoost) to 0.6250 (random forest).</p>
        <p>As shown in <xref ref-type="table" rid="table10">Table 10</xref>, the range of accuracies of our AD classifiers on the unbalanced testing set is slightly higher, ranging from 0.5714 (logistic regression and SVM) to 0.6703 (random forest).</p>
        <p>In experiment 3, we performed an ablation study and assigned binary labels to the elements of each patient’s vector based on whether <italic>medSpacy</italic> was able to identify at least 1 sentence in each of the AD indicator categories that each vector element corresponds to. As shown in <xref ref-type="table" rid="table11">Table 11</xref>, the accuracy across our AD classifiers on the balanced testing set ranges from 0.6964 (KNN) to 0.8036 (XGBoost).</p>
        <p>As shown in <xref ref-type="table" rid="table12">Table 12</xref>, the lower bound of the range of accuracies across our AD classifiers on the unbalanced testing set is higher, and the upper bound of the accuracies is lower. The accuracies on the unbalanced testing set range from 0.7143 (stacking classifier) to 0.7582 (random forest and stacking classifier).</p>
        <table-wrap position="float" id="table7">
          <label>Table 7</label>
          <caption>
            <p>Atopic dermatitis phenotyping performance on balanced testing set in experiment 1 (BioClinical Bidirectional Encoder Representations from Transformers).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="260"/>
            <col width="130"/>
            <col width="130"/>
            <col width="100"/>
            <col width="130"/>
            <col width="100"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>Accuracy</td>
                <td>Precision</td>
                <td>Recall</td>
                <td><italic>F</italic><sub>1</sub>-score</td>
                <td>NPV<sup>a</sup></td>
                <td>Specificity</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Logistic regression</td>
                <td>0.7321</td>
                <td>0.7241</td>
                <td>0.7500</td>
                <td>0.7368</td>
                <td>0.7407</td>
                <td>0.7500</td>
              </tr>
              <tr valign="top">
                <td>SVM<sup>b</sup></td>
                <td>0.7321</td>
                <td>0.7826</td>
                <td>0.6429</td>
                <td>0.7059</td>
                <td>0.6970</td>
                <td>0.7857</td>
              </tr>
              <tr valign="top">
                <td>Decision tree</td>
                <td>0.5893</td>
                <td>0.6316</td>
                <td>0.4286</td>
                <td>0.5106</td>
                <td>0.5676</td>
                <td>0.7500</td>
              </tr>
              <tr valign="top">
                <td>Random forest</td>
                <td>0.6964</td>
                <td>0.7037</td>
                <td>0.6786</td>
                <td>0.6909</td>
                <td>0.6897</td>
                <td>0.8214</td>
              </tr>
              <tr valign="top">
                <td>KNN<sup>c</sup></td>
                <td>0.6786</td>
                <td>0.7273</td>
                <td>0.5714</td>
                <td>0.6400</td>
                <td>0.6471</td>
                <td>0.7857</td>
              </tr>
              <tr valign="top">
                <td>XGBoost<sup>d</sup></td>
                <td>0.6071</td>
                <td>0.6154</td>
                <td>0.5714</td>
                <td>0.5926</td>
                <td>0.6000</td>
                <td>0.8571</td>
              </tr>
              <tr valign="top">
                <td>AdaBoost<sup>e</sup></td>
                <td>0.6429</td>
                <td>0.6538</td>
                <td>0.6071</td>
                <td>0.6296</td>
                <td>0.6333</td>
                <td>0.7857</td>
              </tr>
              <tr valign="top">
                <td>Stacking classifier</td>
                <td>0.6964</td>
                <td>0.7391</td>
                <td>0.6071</td>
                <td>0.6667</td>
                <td>0.6667</td>
                <td>0.7500</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table7fn1">
              <p><sup>a</sup>NPV: negative predictive value.</p>
            </fn>
            <fn id="table7fn2">
              <p><sup>b</sup>SVM: support vector machines.</p>
            </fn>
            <fn id="table7fn3">
              <p><sup>c</sup>KNN: k-nearest neighbor.</p>
            </fn>
            <fn id="table7fn4">
              <p><sup>d</sup>XGBoost: Extreme Gradient Boosting.</p>
            </fn>
            <fn id="table7fn5">
              <p><sup>e</sup>AdaBoost: Adaptive Boosting.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table8">
          <label>Table 8</label>
          <caption>
            <p>Atopic dermatitis phenotyping performance on unbalanced testing set in experiment 1 (BioClinical Bidirectional Encoder Representations from Transformers).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="260"/>
            <col width="130"/>
            <col width="130"/>
            <col width="100"/>
            <col width="130"/>
            <col width="100"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>Accuracy</td>
                <td>Precision</td>
                <td>Recall</td>
                <td><italic>F</italic><sub>1</sub>-score</td>
                <td>NPV<sup>a</sup></td>
                <td>Specificity</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Logistic regression</td>
                <td>0.6813</td>
                <td>0.4884</td>
                <td>0.7500</td>
                <td>0.5915</td>
                <td>0.8542</td>
                <td>0.6984</td>
              </tr>
              <tr valign="top">
                <td>SVM<sup>b</sup></td>
                <td>0.6923</td>
                <td>0.5000</td>
                <td>0.6429</td>
                <td>0.5625</td>
                <td>0.8181</td>
                <td>0.7302</td>
              </tr>
              <tr valign="top">
                <td>Decision tree</td>
                <td>0.5824</td>
                <td>0.3438</td>
                <td>0.3929</td>
                <td>0.3667</td>
                <td>0.7119</td>
                <td>0.7143</td>
              </tr>
              <tr valign="top">
                <td>Random forest</td>
                <td>0.7143</td>
                <td>0.5313</td>
                <td>0.6071</td>
                <td>0.5667</td>
                <td>0.6845</td>
                <td>0.7619</td>
              </tr>
              <tr valign="top">
                <td>KNN<sup>c</sup></td>
                <td>0.6593</td>
                <td>0.4571</td>
                <td>0.5714</td>
                <td>0.5079</td>
                <td>0.7857</td>
                <td>0.7937</td>
              </tr>
              <tr valign="top">
                <td>XGBoost<sup>d</sup></td>
                <td>0.6264</td>
                <td>0.4211</td>
                <td>0.5714</td>
                <td>0.4848</td>
                <td>0.7736</td>
                <td>0.7619</td>
              </tr>
              <tr valign="top">
                <td>AdaBoost<sup>e</sup></td>
                <td>0.6044</td>
                <td>0.4048</td>
                <td>0.6071</td>
                <td>0.4857</td>
                <td>0.7755</td>
                <td>0.7302</td>
              </tr>
              <tr valign="top">
                <td>Stacking classifier</td>
                <td>0.7253</td>
                <td>0.5429</td>
                <td>0.6786</td>
                <td>0.6032</td>
                <td>0.8393</td>
                <td>0.6984</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table8fn1">
              <p><sup>a</sup>NPV: negative predictive value.</p>
            </fn>
            <fn id="table8fn2">
              <p><sup>b</sup>SVM: support vector machines.</p>
            </fn>
            <fn id="table8fn3">
              <p><sup>c</sup>KNN: k-nearest neighbor.</p>
            </fn>
            <fn id="table8fn4">
              <p><sup>d</sup>XGBoost: Extreme Gradient Boosting.</p>
            </fn>
            <fn id="table8fn5">
              <p><sup>e</sup>AdaBoost: Adaptive Boosting.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table9">
          <label>Table 9</label>
          <caption>
            <p>Atopic dermatitis phenotyping performance on balanced testing set in experiment 2 (Bidirectional Encoder Representations from Transformers Base Uncased).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="240"/>
            <col width="140"/>
            <col width="130"/>
            <col width="100"/>
            <col width="140"/>
            <col width="100"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>Accuracy</td>
                <td>Precision</td>
                <td>Recall</td>
                <td><italic>F</italic><sub>1</sub>-score</td>
                <td>NPV<sup>a</sup></td>
                <td>Specificity</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Logistic regression</td>
                <td>0.5893</td>
                <td>0.5758</td>
                <td>0.6786</td>
                <td>0.6230</td>
                <td>0.6087</td>
                <td>0.5000</td>
              </tr>
              <tr valign="top">
                <td>SVM<sup>b</sup></td>
                <td>0.6071</td>
                <td>0.5938</td>
                <td>0.6786</td>
                <td>0.6333</td>
                <td>0.6250</td>
                <td>0.5357</td>
              </tr>
              <tr valign="top">
                <td>Decision tree</td>
                <td>0.6071</td>
                <td>0.6071</td>
                <td>0.6071</td>
                <td>0.6071</td>
                <td>0.6071</td>
                <td>0.6071</td>
              </tr>
              <tr valign="top">
                <td>Random forest</td>
                <td>0.6250</td>
                <td>0.6522</td>
                <td>0.5357</td>
                <td>0.5882</td>
                <td>0.6061</td>
                <td>0.7143</td>
              </tr>
              <tr valign="top">
                <td>KNN<sup>c</sup></td>
                <td>0.5536</td>
                <td>0.5714</td>
                <td>0.4286</td>
                <td>0.4898</td>
                <td>0.5429</td>
                <td>0.6786</td>
              </tr>
              <tr valign="top">
                <td>XGBoost<sup>d</sup></td>
                <td>0.5536</td>
                <td>0.5556</td>
                <td>0.5357</td>
                <td>0.5455</td>
                <td>0.5517</td>
                <td>0.5714</td>
              </tr>
              <tr valign="top">
                <td>AdaBoost<sup>e</sup></td>
                <td>0.5179</td>
                <td>0.5185</td>
                <td>0.5000</td>
                <td>0.5091</td>
                <td>0.5172</td>
                <td>0.5357</td>
              </tr>
              <tr valign="top">
                <td>Stacking classifier</td>
                <td>0.6071</td>
                <td>0.6071</td>
                <td>0.6071</td>
                <td>0.6071</td>
                <td>0.6071</td>
                <td>0.6071</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table9fn1">
              <p><sup>a</sup>NPV: negative predictive value.</p>
            </fn>
            <fn id="table9fn2">
              <p><sup>b</sup>SVM: support vector machines.</p>
            </fn>
            <fn id="table9fn3">
              <p><sup>c</sup>KNN: k-nearest neighbor.</p>
            </fn>
            <fn id="table9fn4">
              <p><sup>d</sup>XGBoost: Extreme Gradient Boosting.</p>
            </fn>
            <fn id="table9fn5">
              <p><sup>e</sup>AdaBoost: Adaptive Boosting.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table10">
          <label>Table 10</label>
          <caption>
            <p>Atopic dermatitis phenotyping performance on unbalanced testing set in experiment 2 (Bidirectional Encoder Representations from Transformers Base Uncased).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="230"/>
            <col width="130"/>
            <col width="130"/>
            <col width="110"/>
            <col width="130"/>
            <col width="120"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>Accuracy</td>
                <td>Precision</td>
                <td>Recall</td>
                <td><italic>F</italic><sub>1</sub>-score</td>
                <td>NPV<sup>a</sup></td>
                <td>Specificity</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Logistic regression</td>
                <td>0.5714</td>
                <td>0.3878</td>
                <td>0.6786</td>
                <td>0.4935</td>
                <td>0.7857</td>
                <td>0.5238</td>
              </tr>
              <tr valign="top">
                <td>SVM<sup>b</sup></td>
                <td>0.5714</td>
                <td>0.3878</td>
                <td>0.6786</td>
                <td>0.4935</td>
                <td>0.7857</td>
                <td>0.5238</td>
              </tr>
              <tr valign="top">
                <td>Decision tree</td>
                <td>0.6484</td>
                <td>0.4474</td>
                <td>0.6071</td>
                <td>0.5152</td>
                <td>0.7925</td>
                <td>0.6667</td>
              </tr>
              <tr valign="top">
                <td>Random forest</td>
                <td>0.6703</td>
                <td>0.4737</td>
                <td>0.6429</td>
                <td>0.5455</td>
                <td>0.8113</td>
                <td>0.6825</td>
              </tr>
              <tr valign="top">
                <td>KNN<sup>c</sup></td>
                <td>0.6264</td>
                <td>0.4000</td>
                <td>0.4286</td>
                <td>0.4138</td>
                <td>0.7377</td>
                <td>0.7143</td>
              </tr>
              <tr valign="top">
                <td>XGBoost<sup>d</sup></td>
                <td>0.6374</td>
                <td>0.4286</td>
                <td>0.5357</td>
                <td>0.4762</td>
                <td>0.7679</td>
                <td>0.6825</td>
              </tr>
              <tr valign="top">
                <td>AdaBoost<sup>e</sup></td>
                <td>0.5934</td>
                <td>0.3784</td>
                <td>0.5000</td>
                <td>0.4308</td>
                <td>0.7407</td>
                <td>0.6349</td>
              </tr>
              <tr valign="top">
                <td>Stacking classifier</td>
                <td>0.6484</td>
                <td>0.4474</td>
                <td>0.6071</td>
                <td>0.5152</td>
                <td>0.7925</td>
                <td>0.6667</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table10fn1">
              <p><sup>a</sup>NPV: negative predictive value.</p>
            </fn>
            <fn id="table10fn2">
              <p><sup>b</sup>SVM: support vector machines.</p>
            </fn>
            <fn id="table10fn3">
              <p><sup>c</sup>KNN: k-nearest neighbor.</p>
            </fn>
            <fn id="table10fn4">
              <p><sup>d</sup>XGBoost: Extreme Gradient Boosting.</p>
            </fn>
            <fn id="table10fn5">
              <p><sup>e</sup>AdaBoost: Adaptive Boosting.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table11">
          <label>Table 11</label>
          <caption>
            <p>Atopic dermatitis phenotyping performance on balanced testing set in experiment 3 (binary vector encoding).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="260"/>
            <col width="130"/>
            <col width="130"/>
            <col width="110"/>
            <col width="120"/>
            <col width="100"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>Accuracy</td>
                <td>Precision</td>
                <td>Recall</td>
                <td><italic>F</italic><sub>1</sub>-score</td>
                <td>NPV<sup>a</sup></td>
                <td>Specificity</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Logistic regression</td>
                <td>0.7679</td>
                <td>0.7586</td>
                <td>0.7857</td>
                <td>0.7719</td>
                <td>0.7778</td>
                <td>0.7500</td>
              </tr>
              <tr valign="top">
                <td>SVM<sup>b</sup></td>
                <td>0.7857</td>
                <td>0.7857</td>
                <td>0.7857</td>
                <td>0.7857</td>
                <td>0.7857</td>
                <td>0.7857</td>
              </tr>
              <tr valign="top">
                <td>Decision tree</td>
                <td>0.7857</td>
                <td>0.7667</td>
                <td>0.8214</td>
                <td>0.7931</td>
                <td>0.8077</td>
                <td>0.7500</td>
              </tr>
              <tr valign="top">
                <td>Random forest</td>
                <td>0.7857</td>
                <td>0.8077</td>
                <td>0.7500</td>
                <td>0.7778</td>
                <td>0.7667</td>
                <td>0.8214</td>
              </tr>
              <tr valign="top">
                <td>KNN<sup>c</sup></td>
                <td>0.6964</td>
                <td>0.7391</td>
                <td>0.6071</td>
                <td>0.6667</td>
                <td>0.6667</td>
                <td>0.7857</td>
              </tr>
              <tr valign="top">
                <td>XGBoost<sup>d</sup></td>
                <td>0.8036</td>
                <td>0.8400</td>
                <td>0.7500</td>
                <td>0.7925</td>
                <td>0.7742</td>
                <td>0.8571</td>
              </tr>
              <tr valign="top">
                <td>AdaBoost<sup>e</sup></td>
                <td>0.7857</td>
                <td>0.7857</td>
                <td>0.7857</td>
                <td>0.7857</td>
                <td>0.7857</td>
                <td>0.7857</td>
              </tr>
              <tr valign="top">
                <td>Stacking classifier</td>
                <td>0.7500</td>
                <td>0.7500</td>
                <td>0.7500</td>
                <td>0.7500</td>
                <td>0.7500</td>
                <td>0.7500</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table11fn1">
              <p><sup>a</sup>NPV: negative predictive value.</p>
            </fn>
            <fn id="table11fn2">
              <p><sup>b</sup>SVM: support vector machines.</p>
            </fn>
            <fn id="table11fn3">
              <p><sup>c</sup>KNN: k-nearest neighbor.</p>
            </fn>
            <fn id="table11fn4">
              <p><sup>d</sup>XGBoost: Extreme Gradient Boosting.</p>
            </fn>
            <fn id="table11fn5">
              <p><sup>e</sup>AdaBoost: Adaptive Boosting.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table12">
          <label>Table 12</label>
          <caption>
            <p>Atopic dermatitis phenotyping performance on unbalanced testing set in experiment 3 (binary vector encoding).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="258"/>
            <col width="134"/>
            <col width="134"/>
            <col width="109"/>
            <col width="122"/>
            <col width="97"/>
            <col width="146"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>Accuracy</td>
                <td>Precision</td>
                <td>Recall</td>
                <td><italic>F</italic><sub>1</sub>-score</td>
                <td>NPV<sup>a</sup></td>
                <td>Specificity</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Logistic regression</td>
                <td>0.7253</td>
                <td>0.5366</td>
                <td>0.7857</td>
                <td>0.6377</td>
                <td>0.8800</td>
                <td>0.6984</td>
              </tr>
              <tr valign="top">
                <td>SVM<sup>b</sup></td>
                <td>0.7473</td>
                <td>0.5641</td>
                <td>0.7857</td>
                <td>0.6567</td>
                <td>0.8846</td>
                <td>0.7302</td>
              </tr>
              <tr valign="top">
                <td>Decision tree</td>
                <td>0.7473</td>
                <td>0.5610</td>
                <td>0.8214</td>
                <td>0.6667</td>
                <td>0.9000</td>
                <td>0.7143</td>
              </tr>
              <tr valign="top">
                <td>Random forest</td>
                <td>0.7582</td>
                <td>0.5833</td>
                <td>0.7500</td>
                <td>0.6563</td>
                <td>0.8727</td>
                <td>0.7619</td>
              </tr>
              <tr valign="top">
                <td>KNN<sup>c</sup></td>
                <td>0.7363</td>
                <td>0.5667</td>
                <td>0.6071</td>
                <td>0.5862</td>
                <td>0.8197</td>
                <td>0.7937</td>
              </tr>
              <tr valign="top">
                <td>XGBoost<sup>d</sup></td>
                <td>0.7582</td>
                <td>0.5833</td>
                <td>0.7500</td>
                <td>0.6563</td>
                <td>0.8727</td>
                <td>0.7619</td>
              </tr>
              <tr valign="top">
                <td>AdaBoost<sup>e</sup></td>
                <td>0.7473</td>
                <td>0.5641</td>
                <td>0.7857</td>
                <td>0.6567</td>
                <td>0.8846</td>
                <td>0.7302</td>
              </tr>
              <tr valign="top">
                <td>Stacking classifier</td>
                <td>0.7143</td>
                <td>0.5250</td>
                <td>0.7500</td>
                <td>0.6176</td>
                <td>0.8627</td>
                <td>0.6984</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table12fn1">
              <p><sup>a</sup>NPV: negative predictive value.</p>
            </fn>
            <fn id="table12fn2">
              <p><sup>b</sup>SVM: support vector machines.</p>
            </fn>
            <fn id="table12fn3">
              <p><sup>c</sup>KNN: k-nearest neighbor.</p>
            </fn>
            <fn id="table12fn4">
              <p><sup>d</sup>XGBoost: Extreme Gradient Boosting.</p>
            </fn>
            <fn id="table12fn5">
              <p><sup>e</sup>AdaBoost: Adaptive Boosting.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Sentence Classification Results</title>
        <p>We hypothesized that using BioClinical BERT sentence embeddings to train sentence classifiers would provide better performance than using BERT Base Uncased sentence embeddings due to the clinical setting of our data. Given the results in <xref ref-type="table" rid="table5">Tables 5</xref> and <xref ref-type="table" rid="table6">6</xref>, we observed that this was most often true in the context of sentence classification because we were able to achieve better performance in the majority (5 out of 8) of the sentence classification tasks when using BioClinical BERT embeddings as opposed to BERT Base Uncased embeddings.</p>
        <p>Using BioClinical BERT sentence embeddings yielded stronger performance when distinguishing sentences in 5 of the 8 sentence categories: category 2 (mentions of hay fever allergies), category 3 (mentions of atopic allergies), category 5 (mentions of dry or itchy skin), category 6 (mentions of nonasthma medications), and category 7 (mentions of asthma). More specifically, we observed higher accuracies when using BioClinical BERT sentence embeddings for classifiers 2 (0.8954), 3 (0.8214), 5 (0.7373), 6 (0.8204), and 7 (0.7712) than their corresponding counterparts when using BERT Base Uncased embeddings for classifiers 2 (0.7730), 3 (0.7976), 5 (0.7288), 6 (0.8096), and 7 (0.7269). We observed that the differences in performance between using BioClinical BERT embeddings and BERT Base Uncased embeddings are most pronounced for classifiers 2 and 7, which correspond to mentions of hay fever allergies and asthma mentions, respectively. We hypothesize this is because hay fever allergies and asthma (and their synonyms) may be very common terms in clinical notes; therefore, models trained on clinical data (BioClinical BERT) may be able to provide stronger performance than models trained on nonclinical text (BERT Base Uncased), which may not have as many mentions of hay fever allergies or asthma.</p>
        <p>Conversely, using BERT Base Uncased embeddings yielded stronger performance when distinguishing sentences in the other 3 of 8 sentence categories: category 1 (direct mentions of AD), category 4 (mentions of eczema or rashes), and category 8 (mentions of asthma medications). More specifically, we observed higher accuracies when using BERT Base Uncased sentence embeddings for classifiers 1 (0.9153), 4 (0.8439), and 8 (0.8738) than their corresponding counterparts when using BioClinical BERT embeddings for classifiers 1 (0.9002), 4 (0.8284), and 8 (0.8299). We observed differences in performance between using BERT Base Uncased embeddings and BioClinical BERT embeddings, which are most evident for classifier 8, which corresponds to mentions of asthma medications. Although this is counterintuitive at first (we would expect a classifier using embeddings generated from BioClinical BERT to be able to better recognize allergy medicines), we believe that the performance benefit from using BERT Base Uncased can be attributed to the list of terms we gave to <italic>medSpacy</italic> when asking it to identify sentences in category 8. Many of the asthma medications in category 8 sentences are either monoclonal antibody medications ending in -mab (benralizumab, mepolizumab, omalizumab, etc) or hydrofluoroalkanes (hfa; atrovent hfa, flovent hfa, xopenex hfa, etc). Because monoclonal antibodies are very specialized types of medication, they may not occur as frequently as other terms in the corpus used to train BioClinical BERT, so a more general model such as BERT Base Uncased may provide more robust performance. 
Additionally, because the hydrofluoroalkane allergy medications in category 8 sentences are often abbreviated with “hfa,” which can have alternate medical meanings such as high-functioning autism or health facility administrator, the BioClinical BERT embeddings might not be representative of the presence of allergy medications in the sentence, so a more general model such as BERT Base Uncased may be able to provide better performance.</p>
        <p>More broadly, looking at the results in <xref ref-type="table" rid="table5">Tables 5</xref> and <xref ref-type="table" rid="table6">6</xref>, we can see that the least accurate classifier has an accuracy of 0.7288, while the most accurate classifier is able to achieve an accuracy of 0.9153. Furthermore, when aggregating the most accurate classifiers from both tables we can see that we are able to achieve accuracies of 0.9153 (classifier 1) for identifying sentences that directly suggest the patient has AD, 0.8954 (classifier 2) for identifying sentences that mention hay fever allergies, 0.8214 (classifier 3) for identifying sentences that mention atopic allergies, 0.8439 (classifier 4) for identifying sentences that mention eczema or skin rashes, 0.7373 (classifier 5) for identifying sentences that mention dry or itchy skin, 0.8204 (classifier 6) for identifying sentences that mention nonasthma medications related to diagnosis of AD, 0.7712 (classifier 7) for identifying sentences that mention asthma, and 0.8738 (classifier 8) for identifying sentences that mention asthma medications. Because our training and testing sets were both class-balanced and the majority (6 of the 8) of the most accurate classifiers previously mentioned achieved accuracies between 0.8204 and 0.9153, we believe these results are promising and indicate that our sentence classifiers could potentially be used to save time in a clinical setting during chart review by identifying (and highlighting for review) sentences relevant to the diagnosis of AD when recruiting for clinical trials.</p>
      </sec>
      <sec>
        <title>AD Phenotyping Results</title>
        <p>As per <xref ref-type="table" rid="table7">Tables 7</xref>-<xref ref-type="table" rid="table10">10</xref>, our earlier hypothesis holds: using clinical embeddings (BioClinical BERT) to generate the patient vector representation does provide better performance in patient phenotyping than using nonclinical embeddings (BERT Base Uncased). Comparing evaluations on the balanced testing set in <xref ref-type="table" rid="table7">Tables 7</xref> and <xref ref-type="table" rid="table9">9</xref>, we observe that using BioClinical BERT embeddings provides higher accuracy in almost all models, with the exception of Decision Trees where BERT Base Uncased provides better performance (accuracy of 0.6071) as compared with BioClinical BERT (accuracy of 0.5893). Comparing evaluations on the unbalanced testing set in <xref ref-type="table" rid="table8">Tables 8</xref> and <xref ref-type="table" rid="table10">10</xref>, we observed that the same trend follows: using BioClinical BERT embeddings provides higher accuracy in almost all models, with the exception of Decision Trees and XGBoost, where using BERT Base Uncased embeddings provides better performance (accuracy of 0.6484 for Decision Trees and 0.6374 for XGBoost) as compared with their counterparts with BioClinical BERT embeddings (accuracy of 0.5824 for Decision Trees and 0.6264 for XGBoost).</p>
        <p>As part of our experimental design, we included an ablation study in experiment 3 so we could compare the difference in performance during patient phenotyping when removing the use of BERT models to create each patient’s vector representations. On the class-balanced testing set, we observed that accuracies range from 0.6071 to 0.7321 when using BioClinical BERT embeddings in <xref ref-type="table" rid="table7">Table 7</xref>, accuracies range from 0.5179 to 0.6250 when using BERT Base Uncased embeddings in <xref ref-type="table" rid="table9">Table 9</xref>, and accuracies range from 0.6964 to 0.8036 when removing the use of BERT models in <xref ref-type="table" rid="table11">Table 11</xref> (experiment 3). On the unbalanced testing set, we observed that accuracies range from 0.5824 to 0.7253 when using BioClinical BERT embeddings in <xref ref-type="table" rid="table8">Table 8</xref>, accuracies range from 0.5714 to 0.6703 when using BERT Base Uncased embeddings in <xref ref-type="table" rid="table10">Table 10</xref>, and accuracies range from 0.7143 to 0.7582 when removing the use of BERT models in <xref ref-type="table" rid="table12">Table 12</xref> (experiment 3).</p>
        <p>In both cases (evaluation on the balanced testing set and evaluation on the unbalanced testing set), we found that models in experiment 3 (ablation study) generally outperform (or are as good as) their corresponding counterparts in experiments 1 and 2 (BERT experiments) across all metrics (accuracy, precision, recall, <italic>F</italic><sub>1</sub>-score, NPV, and specificity), with the exception that the stacking classifier in experiment 1 (BioClinical BERT) has marginally stronger accuracy and precision than the stacking classifier in experiment 3. This shows that traditional rules-based approaches (experiment 3) can outperform BERT-based approaches for generating a patient vector representation for downstream patient phenotyping.</p>
        <p>We hypothesize that models in experiments 1 and 2 showed lower performance because errors from our sentence classifiers in earlier stages of the pipeline could have propagated to later stages of the pipeline during patient phenotyping. Because we leveraged the max operator to aggregate probabilities that any given sentence in the patient record applies to each category, more sentences in each patient record would lead to a greater chance that an erroneous prediction with a high probability would lead to a false positive error in the creation of each patient’s vector representation in experiments 1 and 2.</p>
        <p>Although there is a wide range in performance for our AD phenotyping algorithms, we believe that we have reached our goal of developing a system capable of phenotyping patients with AD for clinical trial recruitment because <xref ref-type="table" rid="table11">Tables 11</xref> and <xref ref-type="table" rid="table12">12</xref> show promising results. Furthermore, our system can be used as a first step during AD clinical trial recruitment to filter out most patients who may not qualify for AD trials and therefore save valuable clinician time. We believe our pipeline is important and valuable because, unlike other diseases, such as influenza, COVID-19, and cancer, there is no gold-standard test result that can be used to determine when a patient has AD. Instead, clinicians must spend large amounts of time undergoing chart reviews to individually determine whether each patient has AD.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>One limitation of this study was the small size of our data set. Although we had a total of 1926 patients in our data set, only 137 of them were validated as having AD. During training, we leveraged 109 of the 137 patients with AD and sampled another 109 patients without AD to create a class-balanced training set. The small size of the training set could lead to overfitting and therefore result in reduced performance on the testing set. Future work could involve obtaining more data from patients with AD as well as exploring the use of an imbalanced data set but using a class-weighted loss function to counteract the class imbalance.</p>
        <p>A second limitation of this study was the input-limit size of the large language models that were used. Both BERT Base Uncased and BioClinical BERT had an input limit of 512 tokens. This meant that any input text that was longer than 512 tokens would be ignored when training BERT. Consequently, we could not simply directly concatenate all documents from each patient’s EHR and feed the tokenized documents of each patient into BERT with an added classification head for training as well as direct prediction of whether the patient has AD. Instead, we designed a pipeline around distilling information from all documents in each patient’s EHR into a patient vector representation and then using this patient vector representation to train various classical ML algorithms for phenotyping the patient. Future work could involve exploring the use of other large language models that are suited for long inputs, such as Longformer or Doc2Vec, for predicting when a patient should be labeled as having AD.</p>
        <p>A third limitation of this study was the list of AD indicators we selected. We did not consider additional AD indicators, and we also did not consider the use of different combinations (or subsets) of the AD indicators selected. This is particularly relevant in considering that (1) our pipeline is intended to be used for identifying patients with AD, and (2) one of our AD indicators (category 1) directly targets whether there is any given sentence in the patient’s record that mentions AD, which could be in the context of a family history of AD, a potential (but not confirmed) diagnosis of AD, as well as a confirmed diagnosis of AD, among other possibilities. If this AD indicator is removed, then 1 interesting research question could be whether our pipeline is still able to maintain performance similarly to what it is currently able to achieve. Future work could involve assessing the performance impact of removing or adding the use of various AD indicators. We could then determine if our pipeline is relying too much on or overfitting 1 or more indicators. Furthermore, we could also redesign our patient vector and separate the feature for category 1 (any sentence that mentions AD) into 3 separate indicators: whether there is (1) a family history of AD, (2) an affirmed diagnosis that the patient has AD, and (3) uncertainty of whether the patient has AD. Doing so could potentially improve precision.</p>
      </sec>
      <sec>
        <title>Potential Applications</title>
        <p>Given the aforementioned results, we believe our AD classifier could be operationalized to facilitate reliable and efficient EHR chart review. For example, sentence classifiers could visually indicate AD indicators inline within the text, therefore reducing information foraging efforts by clinicians. Additionally, AD phenotyping classifiers could indicate the strength of a patient match to UKWP criteria, exact or partial, based on AD indicator sentence classifications. Furthermore, ranking patient cases by match strength could reduce the number of cases reviewed to generate both case and matched controls.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In conclusion, we present and validate a promising pipeline for phenotyping patients with AD during clinical trial recruitment. To do so, we compare a rules-based and transformer-based approach for creating a vector representation of each patient and compare downstream performance in patient phenotyping with various standard ML algorithms. We find that a traditional rules-based approach outperforms using a transformer-based approach (experiment 3). We hope that our pipeline can be deployed in hospital settings during clinical trial recruitment as an initial step to automatically filter candidates before manual review. Additionally, we show that MLP networks can identify whether sentences are relevant to AD diagnosis. These MLP networks can later be deployed in clinical settings to highlight which sentences are relevant for physicians during manual chart review, therefore reducing physician burden. Future work can involve extending our patient phenotyping pipeline to other data sets and other diseases.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AD</term>
          <def>
            <p>atopic dermatitis</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AdaBoost</term>
          <def>
            <p>Adaptive Boosting</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">BERT</term>
          <def>
            <p>Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">FN</term>
          <def>
            <p>false negatives</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">FP</term>
          <def>
            <p>false positives</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">Hfa</term>
          <def>
            <p>hydrofluoroalkanes</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">HR</term>
          <def>
            <p>Hanifin and Rajka</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">ICD</term>
          <def>
            <p>International Classification of Diseases</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">KNN</term>
          <def>
            <p>k-nearest neighbor</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">ML</term>
          <def>
            <p>machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">MLP</term>
          <def>
            <p>multilayer perceptron</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">NPV</term>
          <def>
            <p>negative predictive value</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">Philter</term>
          <def>
            <p>Protected Health Information filter</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">ReLU</term>
          <def>
            <p>Rectified Linear Unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">SGD</term>
          <def>
            <p>stochastic gradient descent</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">SVM</term>
          <def>
            <p>support vector machines</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb19">TN</term>
          <def>
            <p>true negatives</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb20">TP</term>
          <def>
            <p>true positives</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb21">UKWP</term>
          <def>
            <p>UK Working Party</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb22">XGBoost</term>
          <def>
            <p>Extreme Gradient Boosting</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study was partially funded by the National Institutes of Health (NIH) and the National Institute of Arthritis and Musculoskeletal and Skin Diseases (NIAMS) P30-AR069589 as part of the Penn Skin Biology and Diseases Resource-Based Center (core: DJM and DM).</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>AW designed the experiments, wrote the code, performed the experiments, wrote the first draft of the manuscript, and revised the manuscript. DJM conceptualized and implemented the chart abstraction study, annotated the data set, interpreted the results, and revised the manuscript. RF annotated the data set and revised the manuscript. SH queried and deidentified the data set as well as revised the manuscript. DM conceptualized the study and experiment design, interpreted results, wrote and revised the manuscript, and provided secure storage and computer resources.</p>
      </fn>
      <fn fn-type="conflict">
        <p>DJM is or recently has been a consultant for Pfizer, Leo, and Sanofi with respect to studies of atopic dermatitis and served on an advisory board for the National Eczema Association.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Collins-Williams</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Eczema (atopic dermatitis)</article-title>
          <source>Paediatric Allergy and Clinical Immunology (as Applied to Atopic Disease): A Manual for Students and Practitioners of Medicine</source>
          <year>1973</year>
          <publisher-loc>Toronto</publisher-loc>
          <publisher-name>University of Toronto Press</publisher-name>
          <fpage>32</fpage>
          <lpage>37</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lyons</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Milner</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Stone</surname>
              <given-names>KD</given-names>
            </name>
          </person-group>
          <article-title>Atopic dermatitis in children: clinical features, pathophysiology, and treatment</article-title>
          <source>Immunol Allergy Clin North Am</source>
          <year>2015</year>
          <volume>35</volume>
          <issue>1</issue>
          <fpage>161</fpage>
          <lpage>183</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/25459583"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.iac.2014.09.008</pub-id>
          <pub-id pub-id-type="medline">25459583</pub-id>
          <pub-id pub-id-type="pii">S0889-8561(14)00108-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC4254569</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eichenfield</surname>
              <given-names>LF</given-names>
            </name>
            <name name-style="western">
              <surname>Tom</surname>
              <given-names>WL</given-names>
            </name>
            <name name-style="western">
              <surname>Chamlin</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Feldman</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Hanifin</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Simpson</surname>
              <given-names>EL</given-names>
            </name>
            <name name-style="western">
              <surname>Berger</surname>
              <given-names>TG</given-names>
            </name>
            <name name-style="western">
              <surname>Bergman</surname>
              <given-names>JN</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>DE</given-names>
            </name>
            <name name-style="western">
              <surname>Cooper</surname>
              <given-names>KD</given-names>
            </name>
            <name name-style="western">
              <surname>Cordoro</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Krol</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Margolis</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Paller</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Schwarzenberger</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Silverman</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>HC</given-names>
            </name>
            <name name-style="western">
              <surname>Elmets</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Block</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Harrod</surname>
              <given-names>CG</given-names>
            </name>
            <name name-style="western">
              <surname>Begolka</surname>
              <given-names>WS</given-names>
            </name>
            <name name-style="western">
              <surname>Sidbury</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Guidelines of care for the management of atopic dermatitis: section 1. Diagnosis and assessment of atopic dermatitis</article-title>
          <source>J Am Acad Dermatol</source>
          <year>2014</year>
          <volume>70</volume>
          <issue>2</issue>
          <fpage>338</fpage>
          <lpage>351</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/24290431"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jaad.2013.10.010</pub-id>
          <pub-id pub-id-type="medline">24290431</pub-id>
          <pub-id pub-id-type="pii">S0190-9622(13)01095-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC4410183</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abramovits</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Atopic dermatitis</article-title>
          <source>J Am Acad Dermatol</source>
          <year>2005</year>
          <volume>53</volume>
          <issue>1 Suppl 1</issue>
          <fpage>S86</fpage>
          <lpage>S93</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jaad.2005.04.034</pub-id>
          <pub-id pub-id-type="medline">15968268</pub-id>
          <pub-id pub-id-type="pii">S0190962205013125</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weidinger</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Beck</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Bieber</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kabashima</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Irvine</surname>
              <given-names>AD</given-names>
            </name>
          </person-group>
          <article-title>Atopic dermatitis</article-title>
          <source>Nat Rev Dis Primers</source>
          <year>2018</year>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <pub-id pub-id-type="doi">10.1038/s41572-018-0001-z</pub-id>
          <pub-id pub-id-type="medline">29930242</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41572-018-0001-z</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schneider</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hanifin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boguniewicz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Eichenfield</surname>
              <given-names>LF</given-names>
            </name>
            <name name-style="western">
              <surname>Spergel</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Dakovic</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Paller</surname>
              <given-names>AS</given-names>
            </name>
          </person-group>
          <article-title>Study of the atopic march: development of atopic comorbidities</article-title>
          <source>Pediatr Dermatol</source>
          <year>2016</year>
          <volume>33</volume>
          <issue>4</issue>
          <fpage>388</fpage>
          <lpage>398</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/27273433"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/pde.12867</pub-id>
          <pub-id pub-id-type="medline">27273433</pub-id>
          <pub-id pub-id-type="pmcid">PMC5649252</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Del Pozo</surname>
              <given-names>DV</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mitra</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hoffstad</surname>
              <given-names>OJ</given-names>
            </name>
            <name name-style="western">
              <surname>Margolis</surname>
              <given-names>DJ</given-names>
            </name>
          </person-group>
          <article-title>The risk of atopic comorbidities and atopic march progression among Black and White children with mild-to-moderate atopic dermatitis: a cross-sectional study</article-title>
          <source>J Am Acad Dermatol</source>
          <year>2022</year>
          <volume>87</volume>
          <issue>5</issue>
          <fpage>1145</fpage>
          <lpage>1147</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jaad.2022.02.023</pub-id>
          <pub-id pub-id-type="medline">35192898</pub-id>
          <pub-id pub-id-type="pii">S0190-9622(22)00334-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eichenfield</surname>
              <given-names>LF</given-names>
            </name>
            <name name-style="western">
              <surname>Tom</surname>
              <given-names>WL</given-names>
            </name>
            <name name-style="western">
              <surname>Berger</surname>
              <given-names>TG</given-names>
            </name>
            <name name-style="western">
              <surname>Krol</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Paller</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Schwarzenberger</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bergman</surname>
              <given-names>JN</given-names>
            </name>
            <name name-style="western">
              <surname>Chamlin</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>DE</given-names>
            </name>
            <name name-style="western">
              <surname>Cooper</surname>
              <given-names>KD</given-names>
            </name>
            <name name-style="western">
              <surname>Cordoro</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Feldman</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Hanifin</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Margolis</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Silverman</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Simpson</surname>
              <given-names>EL</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>HC</given-names>
            </name>
            <name name-style="western">
              <surname>Elmets</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Block</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Harrod</surname>
              <given-names>CG</given-names>
            </name>
            <name name-style="western">
              <surname>Begolka</surname>
              <given-names>WS</given-names>
            </name>
            <name name-style="western">
              <surname>Sidbury</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Guidelines of care for the management of atopic dermatitis: section 2. Management and treatment of atopic dermatitis with topical therapies</article-title>
          <source>J Am Acad Dermatol</source>
          <year>2014</year>
          <volume>71</volume>
          <issue>1</issue>
          <fpage>116</fpage>
          <lpage>132</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/24813302"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jaad.2014.03.023</pub-id>
          <pub-id pub-id-type="medline">24813302</pub-id>
          <pub-id pub-id-type="pii">S0190-9622(14)01257-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC4326095</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fulton</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>Mitra</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chiesa-Fuxench</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Sockler</surname>
              <given-names>PG</given-names>
            </name>
            <name name-style="western">
              <surname>Margolis</surname>
              <given-names>DJ</given-names>
            </name>
          </person-group>
          <article-title>Untapping the potential of utilizing electronic medical records to identify patients with atopic dermatitis: an algorithm using ICD-10 codes</article-title>
          <source>Arch Dermatol Res</source>
          <year>2022</year>
          <volume>314</volume>
          <issue>5</issue>
          <fpage>439</fpage>
          <lpage>444</lpage>
          <pub-id pub-id-type="doi">10.1007/s00403-021-02251-w</pub-id>
          <pub-id pub-id-type="medline">34081192</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00403-021-02251-w</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gustafson</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Pacheco</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wehbe</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Silverberg</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Thompson</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>A machine learning algorithm for identifying atopic dermatitis in adults from electronic health records</article-title>
          <source>IEEE Int Conf Healthc Inform</source>
          <year>2017</year>
          <volume>2017</volume>
          <fpage>83</fpage>
          <lpage>90</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29104964"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/ICHI.2017.31</pub-id>
          <pub-id pub-id-type="medline">29104964</pub-id>
          <pub-id pub-id-type="pmcid">PMC5664951</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hanifin</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Rajka</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic features of atopic dermatitis</article-title>
          <source>Acta Derm Venereol</source>
          <year>1980</year>
          <volume>60</volume>
          <fpage>44</fpage>
          <lpage>47</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medicaljournalssweden.se/actadv/article/view/10725"/>
          </comment>
          <pub-id pub-id-type="doi">10.2340/00015555924447</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>HC</given-names>
            </name>
            <name name-style="western">
              <surname>Burney</surname>
              <given-names>PG</given-names>
            </name>
            <name name-style="western">
              <surname>Hay</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Archer</surname>
              <given-names>CB</given-names>
            </name>
            <name name-style="western">
              <surname>Shipley</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hunter</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Bingham</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Finlay</surname>
              <given-names>AY</given-names>
            </name>
            <name name-style="western">
              <surname>Pembroke</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Graham-Brown</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Atherton</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis-Jones</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Holden</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Harper</surname>
              <given-names>JI</given-names>
            </name>
            <name name-style="western">
              <surname>Champion</surname>
              <given-names>RH</given-names>
            </name>
            <name name-style="western">
              <surname>Poyner</surname>
              <given-names>TF</given-names>
            </name>
            <name name-style="western">
              <surname>Launer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>David</surname>
              <given-names>TJ</given-names>
            </name>
          </person-group>
          <article-title>The U.K. Working Party's diagnostic criteria for atopic dermatitis. I. Derivation of a minimum set of discriminators for atopic dermatitis</article-title>
          <source>Br J Dermatol</source>
          <year>1994</year>
          <volume>131</volume>
          <issue>3</issue>
          <fpage>383</fpage>
          <lpage>396</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1365-2133.1994.tb08530.x</pub-id>
          <pub-id pub-id-type="medline">7918015</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Norgeot</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Muenzen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Peterson</surname>
              <given-names>TA</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Glicksberg</surname>
              <given-names>BS</given-names>
            </name>
            <name name-style="western">
              <surname>Schenk</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Rutenberg</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Oskotsky</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Sirota</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yazdany</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schmajuk</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ludwig</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Goldstein</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Butte</surname>
              <given-names>AJ</given-names>
            </name>
          </person-group>
          <article-title>Protected Health Information filter (Philter): accurately and securely de-identifying free-text clinical notes</article-title>
          <source>NPJ Digit Med</source>
          <year>2020</year>
          <volume>3</volume>
          <fpage>57</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-020-0258-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-020-0258-y</pub-id>
          <pub-id pub-id-type="medline">32337372</pub-id>
          <pub-id pub-id-type="pii">258</pub-id>
          <pub-id pub-id-type="pmcid">PMC7156708</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
          <article-title>Bert-base-uncased</article-title>
          <source>Hugging Face</source>
          <access-date>2023-11-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/bert-base-uncased">https://huggingface.co/bert-base-uncased</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>MW</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>ArXiv. Preprint posted online on May 24 2019</source>
          <year>2018</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1810.04805"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alsentzer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Boag</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>WH</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Naumann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>McDermott</surname>
              <given-names>MBA</given-names>
            </name>
          </person-group>
          <article-title>Publicly available clinical BERT embeddings</article-title>
          <source>ArXiv. Preprint posted online on June 20 2019</source>
          <year>2019</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1904.03323"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/w19-1909</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
          <article-title>emilyalsentzer/Bio_ClinicalBERT</article-title>
          <source>Hugging Face</source>
          <access-date>2023-11-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT">https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eyre</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Peterson</surname>
              <given-names>KS</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Alba</surname>
              <given-names>PR</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Box</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>DuVall</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Patterson</surname>
              <given-names>OV</given-names>
            </name>
          </person-group>
          <article-title>Launching into clinical space with medspaCy: a new clinical text processing toolkit in Python</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2021</year>
          <volume>2021</volume>
          <fpage>438</fpage>
          <lpage>447</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35308962"/>
          </comment>
          <pub-id pub-id-type="medline">35308962</pub-id>
          <pub-id pub-id-type="pii">3576697</pub-id>
          <pub-id pub-id-type="pmcid">PMC8861690</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>BE</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>HP</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>WW</given-names>
            </name>
          </person-group>
          <article-title>Document-level classification of CT pulmonary angiography reports based on an extension of the ConText algorithm</article-title>
          <source>J Biomed Inform</source>
          <year>2011</year>
          <volume>44</volume>
          <issue>5</issue>
          <fpage>728</fpage>
          <lpage>737</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(11)00062-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2011.03.011</pub-id>
          <pub-id pub-id-type="medline">21459155</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(11)00062-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC3164892</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Harkema</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Dowling</surname>
              <given-names>JN</given-names>
            </name>
            <name name-style="western">
              <surname>Thornblade</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>WW</given-names>
            </name>
          </person-group>
          <article-title>ConText: an algorithm for determining negation, experiencer, and temporal status from clinical reports</article-title>
          <source>J Biomed Inform</source>
          <year>2009</year>
          <volume>42</volume>
          <issue>5</issue>
          <fpage>839</fpage>
          <lpage>851</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(09)00074-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2009.05.002</pub-id>
          <pub-id pub-id-type="medline">19435614</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(09)00074-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC2757457</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mowery</surname>
              <given-names>DL</given-names>
            </name>
            <name name-style="western">
              <surname>Kawamoto</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bradshaw</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kohlmann</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Schiffman</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Weir</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Borbolla</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>WW</given-names>
            </name>
            <name name-style="western">
              <surname>Del Fiol</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Determining onset for familial breast and colorectal cancer from family history comments in the electronic health record</article-title>
          <source>AMIA Jt Summits Transl Sci Proc</source>
          <year>2019</year>
          <volume>2019</volume>
          <fpage>173</fpage>
          <lpage>181</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31258969"/>
          </comment>
          <pub-id pub-id-type="medline">31258969</pub-id>
          <pub-id pub-id-type="pmcid">PMC6568127</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mowery</surname>
              <given-names>DL</given-names>
            </name>
            <name name-style="western">
              <surname>Velupillai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>WW</given-names>
            </name>
          </person-group>
          <article-title>Medical diagnosis lost in translation-analysis of uncertainty and negation expressions in English and Swedish clinical texts</article-title>
          <year>2012</year>
          <conf-name>Proceedings of the 2012 Workshop on Biomedical Natural Language Processing (BioNLP 2012)</conf-name>
          <conf-date>June 8, 2012</conf-date>
          <conf-loc>Montreal, Canada</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/W12-2407.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pedregosa</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Varoquaux</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gramfort</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Michel</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Thirion</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Grisel</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Blondel</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Prettenhofer</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dubourg</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Vanderplas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Passos</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cournapeau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Brucher</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Perrot</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Duchesnay</surname>
              <given-names>É</given-names>
            </name>
          </person-group>
          <article-title>Scikit-learn: machine learning in Python</article-title>
          <source>J Mach Learn Res</source>
          <year>2011</year>
          <volume>12</volume>
          <issue>85</issue>
          <fpage>2825</fpage>
          <lpage>2830</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmlr.org/papers/volume12/pedregosa11a/pedregosa11a.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
