<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v7i1e44876</article-id>
      <article-id pub-id-type="pmid">37347514</article-id>
      <article-id pub-id-type="doi">10.2196/44876</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Identifying Patient Populations in Texts Describing Drug Approvals Through Deep Learning–Based Information Extraction: Development of a Natural Language Processing Algorithm</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Behera</surname>
            <given-names>Tapan</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Wilson</surname>
            <given-names>Ian</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Li</surname>
            <given-names>Yikuan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Gendrin</surname>
            <given-names>Aline</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>AstraZeneca</institution>
            <addr-line>City house</addr-line>
            <addr-line>126-130 Hills Rd</addr-line>
            <addr-line>Cambridge, CB2 1RY</addr-line>
            <country>United Kingdom</country>
            <phone>44 7814585004</phone>
            <email>aline.gendrinbrokmann@astrazeneca.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6722-9369</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Souliotis</surname>
            <given-names>Leonidas</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0001-4540-7344</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Loudon-Griffiths</surname>
            <given-names>James</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0002-1501-2031</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Aggarwal</surname>
            <given-names>Ravisha</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0002-6701-6691</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Amoako</surname>
            <given-names>Daniel</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0005-4062-9174</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Desouza</surname>
            <given-names>Gregory</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0009-0372-3136</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Dimitrievska</surname>
            <given-names>Sashka</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0002-0296-9344</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Metcalfe</surname>
            <given-names>Paul</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0001-0644-4724</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>Louvet</surname>
            <given-names>Emilie</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0006-1396-4265</ext-link>
        </contrib>
        <contrib id="contrib10" contrib-type="author">
          <name name-style="western">
            <surname>Sahni</surname>
            <given-names>Harpreet</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0008-8305-5889</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>AstraZeneca</institution>
        <addr-line>Cambridge</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>AstraZeneca</institution>
        <addr-line>Bangalore</addr-line>
        <country>India</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>AstraZeneca</institution>
        <addr-line>Wilmington, DE</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>AstraZeneca</institution>
        <addr-line>Gaithersburg, MD</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Aline Gendrin <email>aline.gendrinbrokmann@astrazeneca.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>22</day>
        <month>6</month>
        <year>2023</year>
      </pub-date>
      <volume>7</volume>
      <elocation-id>e44876</elocation-id>
      <history>
        <date date-type="received">
          <day>7</day>
          <month>12</month>
          <year>2022</year>
        </date>
        <date date-type="rev-request">
          <day>10</day>
          <month>2</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>30</day>
          <month>3</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>17</day>
          <month>4</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Aline Gendrin, Leonidas Souliotis, James Loudon-Griffiths, Ravisha Aggarwal, Daniel Amoako, Gregory Desouza, Sashka Dimitrievska, Paul Metcalfe, Emilie Louvet, Harpreet Sahni. Originally published in JMIR Formative Research (https://formative.jmir.org), 22.06.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2023/1/e44876" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>New drug treatments are regularly approved, and it is challenging to remain up-to-date in this rapidly changing environment. Fast and accurate visualization is important to allow a global understanding of the drug market. Automation of this information extraction provides a helpful starting point for the subject matter expert, helps to mitigate human errors, and saves time.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We aimed to semiautomate disease population extraction from the free text of oncology drug approval descriptions from the BioMedTracker database for 6 selected drug targets. More specifically, we intended to extract (1) line of therapy, (2) stage of cancer of the patient population described in the approval, and (3) the clinical trials that provide evidence for the approval. We aimed to use these results in downstream applications, aiding the searchability of relevant content against related drug project sources.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We fine-tuned a state-of-the-art deep learning model, Bidirectional Encoder Representations from Transformers, for each of the 3 desired outputs. We independently applied rule-based text mining approaches. We compared the performances of deep learning and rule-based approaches and selected the best method, which was then applied to new entries. The results were manually curated by a subject matter expert and then used to train new models.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The training data set is currently small (433 entries) and will enlarge over time when new approval descriptions become available or if a choice is made to take another drug target into account. The deep learning models achieved 61% and 56% 5-fold cross-validated accuracies for line of therapy and stage of cancer, respectively, which were treated as classification tasks. Trial identification is treated as a named entity recognition task, and the 5-fold cross-validated <italic>F</italic><sub>1</sub>-score is currently 87%. Although the scores of the classification tasks could seem low, the models comprise 5 classes each, and such scores are a marked improvement when compared to random classification. Moreover, we expect improved performance as the input data set grows, since deep learning models need to be trained on a large enough amount of data to be able to learn the task they are taught. The rule-based approach achieved 60% and 74% 5-fold cross-validated accuracies for line of therapy and stage of cancer, respectively. No attempt was made to define a rule-based approach for trial identification.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>We developed a natural language processing algorithm that is currently assisting subject matter experts in disease population extraction, which supports health authority approvals. This algorithm achieves semiautomation, enabling subject matter experts to leverage the results for deeper analysis and to accelerate information retrieval in a crowded clinical environment such as oncology.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>algorithm</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>BERT</kwd>
        <kwd>cancer</kwd>
        <kwd>classification</kwd>
        <kwd>data extraction</kwd>
        <kwd>data mining</kwd>
        <kwd>deep-learning</kwd>
        <kwd>development</kwd>
        <kwd>drug approval</kwd>
        <kwd>free text</kwd>
        <kwd>information retrieval</kwd>
        <kwd>line of therapy</kwd>
        <kwd>machine learning</kwd>
        <kwd>natural language processing</kwd>
        <kwd>NLP</kwd>
        <kwd>oncology</kwd>
        <kwd>pharmaceutic</kwd>
        <kwd>pharmacology</kwd>
        <kwd>pharmacy</kwd>
        <kwd>stage of cancer</kwd>
        <kwd>text extraction</kwd>
        <kwd>text mining</kwd>
        <kwd>unstructured data</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Recent developments in deep learning–based [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>] natural language processing (NLP) have enabled transfer learning [<xref ref-type="bibr" rid="ref3">3</xref>] to be used in automated or semiautomated information extraction using data sets as small as thousands or even sometimes hundreds of entries [<xref ref-type="bibr" rid="ref4">4</xref>]. While a data set containing billions of words (the full Wikipedia and BooksCorpus content) is necessary to train models such as Bidirectional Encoder Representations from Transformers (BERT) [<xref ref-type="bibr" rid="ref1">1</xref>], a state-of-the-art deep learning NLP model, fine-tuning this model can be successfully applied to much smaller data sets [<xref ref-type="bibr" rid="ref4">4</xref>]. Small input data sets are often encountered in practice, and such methods allow applicability to a larger number of problems. Moreover, BERT has demonstrated state-of-the-art performances on a wide variety of tasks, including binary and multiclass classification on balanced and unbalanced data sets or question-answering data sets [<xref ref-type="bibr" rid="ref1">1</xref>]. When data drift has to be expected, such stability is a strong differentiator.</p>
      <p>Besides the fine-tuned BERT deep learning model, we developed a fit-for-purpose rule-based approach. We then compared the results of both approaches, and the algorithm that performed best was applied to new data. The results were sent for review and curation to subject matter experts.</p>
      <p>In the case study presented in this paper, the goal was to categorize and extract entities from descriptions of drug approvals that would allow us to link a particular patient population and clinical trials to a specific drug approval event. This linkage supports our aim of streamlining information extraction and aiding visualization of the competitive drug approval landscape.</p>
      <p>We selected 6 drug targets of relevance to AstraZeneca’s Oncology portfolio and investigated the capability of NLP tools to extract an overview of the competitive landscape for these drug targets. The aim was to retrieve information defining the patient profile—specifically the approved line of therapy and stage of cancer—and references to the clinical trial or trials that support each drug approval.</p>
      <p>Machine-learning and rule-based approaches, or their combination, have been used to extract cancer stage automatically from electronic medical records.</p>
      <p>Shivade et al [<xref ref-type="bibr" rid="ref5">5</xref>] and Meng et al [<xref ref-type="bibr" rid="ref6">6</xref>] have reviewed automatic systems, rule- or machine-learning–based, applied to automatically identify patient phenotype, including but not limited to cancer stage and line of therapy.</p>
      <p>A carefully crafted sequence of rule-based approaches and machine learning algorithms allowed cancer stage identification in McCowen et al’s [<xref ref-type="bibr" rid="ref7">7</xref>] and Yim et al’s [<xref ref-type="bibr" rid="ref8">8</xref>] studies. In Nguyen et al’s [<xref ref-type="bibr" rid="ref9">9</xref>] study, a rule-based algorithm was compared to a machine learning approach based on support vector machines, and performances were found to be equivalent. A recent example is described by Hu et al [<xref ref-type="bibr" rid="ref10">10</xref>], where fine-tuned BERT models were used to identify 14 different named entities and relations among entities. These were then fed to a rule-based postprocessing workflow that answers a list of 22 questions indicative of cancer stage. Most recently, CancerBERT [<xref ref-type="bibr" rid="ref11">11</xref>] is a fine-tuned BERT-based deep learning model trained to extract 10 types of named entity recognition (NER) entities, including cancer stage.</p>
      <p>Example applications of rule-based approaches used to extract line of therapy automatically have been described previously [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref15">15</xref>]. In these studies, cancer stage and line of therapy are often expressed through several indicators that need to be identified individually and then combined. The nature of the documents in our case is less detailed, and a new methodology is needed. A single paragraph of text is available, which sometimes consists of 2 or 3 lines of text only, sometimes more (<xref rid="figure1" ref-type="fig">Figure 1</xref>). Stage can be mentioned explicitly, or information can be provided indirectly through words such as “advanced” or “metastatic.” A text describing an approval can cover only 1 cancer stage or a wide range of stages. As for line of therapy, a previous treatment or intervention (eg, resection) is sometimes mentioned, which helps narrow down the possibilities.</p>
      <p>Finally, automatic information extraction of clinical trial characteristics has also been published using carefully crafted combinations of machine learning and rule-based approaches. The extracted information includes trial names as well as relevant information about patient populations enrolled in the trial [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. In our case, we identified, among several trial names, those that lead to compound approval, hence the need for a specially crafted model.</p>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>Example approval description from BioMedTracker [<xref ref-type="bibr" rid="ref18">18</xref>], the data source for this project. The BioMedTracker [<xref ref-type="bibr" rid="ref18">18</xref>] database contains a repository of standardized drug approval events, reported across several indications and markets. Each event has a number of structured metadata associated with it (eg, disease, approval date, and approval region), as shown in the top half of this figure. Information relating to a more granular description of the patient population is constrained to the unstructured free-text section that is written by an analyst, shown in the lower half of this figure. Texts describing approvals are accessed programmatically using a Representational State Transfer application programming interface (REST API) query. Image reused with permission by Informa Pharma Intelligence.</p>
        </caption>
        <graphic xlink:href="formative_v7i1e44876_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Set and Labeling Process</title>
        <p>The BioMedTracker [<xref ref-type="bibr" rid="ref18">18</xref>] database contains a repository of standardized drug approval events, reported across a number of indications and markets. Each event has a number of structured metadata associated with it (eg, disease, approval date, and approval region), as shown in the table in the top half of <xref rid="figure1" ref-type="fig">Figure 1</xref>. However, information relating to a more granular description of the patient population (including line of therapy and stage of disease) and any supportive clinical trial is constrained to the unstructured free-text section that is written by an analyst. This can be seen in the lower half of <xref rid="figure1" ref-type="fig">Figure 1</xref>. Texts describing drug approvals of interest were accessed programmatically from the database using a Representational State Transfer application programming interface (API) query.</p>
        <p>We focused on approval events in 6 drug targets, which were included sequentially as the project evolved. The drug targets taken into account were (1) EGFR (Epidermal Growth Factor Receptor), (2) human epidermal growth factor receptor 2/neu or ErbB-2, (3) Cytotoxic T-Lymphocyte Antigen 4, (4) Programmed death-1 receptor/Programmed death ligands (1 and 2), (5) Poly ADP-Ribose Polymerase, and (6) Bruton’s Tyrosine Kinase, which are all of relevance to AstraZeneca’s drug portfolio.</p>
        <p>In terms of preprocessing, hyperlinks were deleted from input texts. Information about line of therapy and cancer stage was found in the first 2 paragraphs of text, so only these were considered for these tasks. The full text was used to identify trials leading to an approval.</p>
        <p>A manual labeling process was applied to ensure consistency; 2 subject matter experts split the task of labeling 433 texts describing approvals, while an independent third labeler reviewed their work to ensure accuracy and consistency. The task is difficult as line of therapy and cancer stage are sometimes described indirectly. We used Label-studio [<xref ref-type="bibr" rid="ref19">19</xref>] to perform the labeling task.</p>
        <p>We found that both line of therapy and cancer stage showed a large number of possible classes in the data (<xref ref-type="table" rid="table1">Table 1</xref>), and, for the purposes of model training, we pooled some of these categories together to make the classification task more manageable.</p>
        <p>The final list of refined classes was selected based on their frequency and an assessment of how useful an individual class would be to the project, as judged by a subject matter expert; this information was also used to assign an ordinal rank to each class, lower ranks corresponding to more common classes.</p>
        <p>To map from the initial list to the final list, we developed a binning algorithm that chooses the training class with the highest rank that overlaps with the labeled class. For example, in <xref ref-type="table" rid="table2">Table 2</xref>, “Second line” was ranked third and “First line” was ranked fourth; therefore, a labeled class with the categories “First line; Second line; Third line” would have the training class value of “Second line.” The highest-ranked class was always assigned the null set. This functioned as the default value for when there is no overlapping training class in the labeled class.</p>
        <p>Algorithmically, this means the following:</p>
        <p>base_features = {(i,j,k), (i,k,l), (m,n), (k), …},</p>
        <p>target_classes_reverse_order = {(i), (j,k), (j), (k), …}</p>
        <p>text_target_class = null</p>
        <p>for text in texts:</p>
        <p>for base_feature in base_features:</p>
        <p>for target_class in target_classes_reverse_order:</p>
        <p>if base_feature in target_class:</p>
        <p>text_target_class = target_class</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Labeled classes present in the data set for line of therapy and stage of cancer after labeling. These classes are input into the binning algorithm to produce the training classes seen in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="700"/>
            <col width="0"/>
            <col width="270"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Class</td>
                <td>Texts describing approvals, n</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="4">
                  <bold>Line of therapy</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>First line</td>
                <td colspan="2">123</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Second line</td>
                <td colspan="2">114</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <italic>blank</italic>
                </td>
                <td colspan="2">62</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Maintenance and Consolidation</td>
                <td colspan="2">45</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>First line; Second line</td>
                <td colspan="2">19</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Second line; Third line</td>
                <td colspan="2">18</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Fourth line or Greater; Third line</td>
                <td colspan="2">17</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Adjuvant</td>
                <td colspan="2">14</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Third line</td>
                <td colspan="2">5</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Fourth line or Greater; Second line; Third line</td>
                <td colspan="2">5</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Fourth line or Greater</td>
                <td colspan="2">3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Maintenance and Consolidation; Third line</td>
                <td colspan="2">3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>First line; Second line; Third line</td>
                <td colspan="2">3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Adjuvant; Second line; Third line</td>
                <td colspan="2">2</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>Stage of cancer</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Stage III; Stage IV</td>
                <td colspan="2">176</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Stage IV</td>
                <td colspan="2">72</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <italic>blank</italic>
                </td>
                <td colspan="2">50</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Relapsed</td>
                <td colspan="2">41</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Relapsed; Stage III; Stage IV</td>
                <td colspan="2">40</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Stage III</td>
                <td colspan="2">19</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Relapsed; Stage IV</td>
                <td colspan="2">16</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Extensive stage</td>
                <td colspan="2">11</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Stage I; Stage II; Stage III</td>
                <td colspan="2">3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Stage I</td>
                <td colspan="2">3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Stage I; Stage II</td>
                <td colspan="2">2</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Training classes used for line of therapy and stage of cancer derived by the binning algorithm and ordered by input rank.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="550"/>
            <col width="420"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Class</td>
                <td>Texts describing approvals, n</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="3">
                  <bold>Line of therapy</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <italic>Blank </italic>
                </td>
                <td>45</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Maintenance/Consolidation</td>
                <td>79</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Second line</td>
                <td>163</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>First line</td>
                <td>123</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Third line</td>
                <td>23</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>Stage of cancer</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <italic>Blank</italic>
                </td>
                <td>41</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Stage III; Stage IV</td>
                <td>201</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Stage IV</td>
                <td>73</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Relapsed</td>
                <td>61</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Relapsed; Stage III; Stage IV</td>
                <td>57</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>NLP Algorithm Development</title>
        <p>Off-the-shelf packages are available that return state-of-the-art results on many different benchmarking data sets. Here, we use the transformers library from Hugging Face [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>].</p>
        <p>We applied transfer learning [<xref ref-type="bibr" rid="ref3">3</xref>] and fine-tuned a DistilBERT [<xref ref-type="bibr" rid="ref22">22</xref>] model, a distilled version of BERT that runs faster while retaining comparable performance. We attempted to use BioBERT [<xref ref-type="bibr" rid="ref4">4</xref>] because models adapted to medical literature have been shown to increase scores [<xref ref-type="bibr" rid="ref23">23</xref>]; however, here, performance did not improve. Along these lines, we also fine-tuned a domain-adapted BERT-based model, using trial titles from the Trialtrove database as text and patient population categories that had been tagged by a Trialtrove analyst as the target. The performance of this model was disappointing, and we concluded that syntaxes were too different between Trialtrove titles and BioMedTracker approval descriptions, possibly because titles are too short to allow the model to learn.</p>
        <p>Line of therapy and stage of cancer extraction are treated as classification problems, while trial identification is treated as an NER task. Preimplemented flows are available in the Hugging Face library [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>] and we adapted them to our needs. Selecting only the first 2 paragraphs of text led to better results in the classification tasks, while using the full text was found to be best for the NER task, probably because information relating to the line of therapy or stage of cancer is located at the beginning of the text, while trials leading to approval can be found either at the beginning or toward the end. We also deleted HTML tags, which generally correspond to hyperlinks leading to trial descriptions.</p>
        <p>As a benchmark, we developed a rule-based text mining approach on the same data. Subject matter experts gathered common examples of words and phrases that were associated with their choice of line of therapy or stage of cancer. These examples were used as a lookup list in the text-mining model.</p>
        <p>The accuracy of the rule-based approach was then calculated and compared with the cross-validated accuracy from the deep learning approach. The model with the highest score was considered the winning model. Predictions using this winning model were used to prepopulate label-studio input to guide the labeling process when new texts describing approvals became available or new drug targets were taken into account.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This study is exempt from human subjects research review as no human subjects were involved.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Classification Tasks: Line and Stage</title>
        <p>We observed the following results for the classification task.</p>
        <p>For the BERT-based models, we display 5-fold cross-validated accuracy, defined in recent literature as the best available metric for classification [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. This means that the data set is divided into 5 segments, which are successively considered as test data sets; we train the model on 4 segments, that is, 80% of the data, and test it on the remaining segment, that is, 20% of the data. Then the next data segment is considered as test data set. Finally, the 5 results are averaged.</p>
        <p>We followed the methodology proposed in appendix A.3 of Devlin et al [<xref ref-type="bibr" rid="ref1">1</xref>] and performed a grid search over batch size (possible values: 16 and 32), learning rate (possible values: 2e-5, 3e-5, and 5e-5), and number of epochs (possible values: 2, 3, and 4). Devlin et al [<xref ref-type="bibr" rid="ref1">1</xref>] report in appendix A.3 that searching over these hyperparameters worked well across all tasks they worked on, which include binary and multiclass classifications, balanced and unbalanced data sets, and question answering tasks.</p>
        <p>5-fold cross-validated accuracies (red line in <xref rid="figure2" ref-type="fig">Figure 2</xref>) generally increase with the number of texts describing approvals, similarly to benchmark data sets (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref27">27</xref>-<xref ref-type="bibr" rid="ref43">43</xref>]). As a second observation, we notice some local decreases, similar to those observed for benchmark data sets, for example, TREC-6 or IMDB, for similar abscissas (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Based on these 2 observations, benchmark data sets provide a good understanding of the fine-tuned BERT models’ behavior.</p>
        <p>Due to the unbalanced data set, we also displayed the percentage of inputs from the largest class (green line). This percentage fluctuates through time as new drug targets are included sequentially. A simple algorithm that would place all entries in the largest class would return a score corresponding to the green curve. As an example, for stage of cancer, the proportion of the class corresponding to “stage III; stage IV” reached almost 65% during the project.</p>
        <p>For the rule-based text mining model (blue line in <xref rid="figure2" ref-type="fig">Figure 2</xref>), accuracies decreased as new texts describing approvals were added to the database. This is in line with expectations: the dictionary of expressions was established early on and not modified, and new texts describing approvals can only bring more diversity in the expressions.</p>
        <p>Marked changes at the end of the curves correspond to two key operational decisions: (1) labeling was refined and homogenized and (2) the number of different classes considered in the classification tasks was increased from 4 to 5 (all 5 lines from <xref ref-type="table" rid="table2">Table 2</xref> for line of therapy and stage of cancer were taken into account instead of the first 4 lines only).</p>
        <p>Current scores of 5-fold cross-validated accuracy were 61% for line of therapy with the fine-tuned BERT model and 60% for the rule-based approach. For cancer stage, they were 56% and 74%, respectively. These scores are displayed in <xref ref-type="table" rid="table3">Table 3</xref>. Although these scores may seem low, it is important to understand that each model performs multiclass classification with 5 classes, so they are doing much better than random.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Application to the BioMedTracker data set, performance of the fine-tuned bidirectional encoder representations from transformers (BERT) models at key stages for text classification corresponding to “Line of therapy” and “Stage of cancer.” Rule-based text mining (TEXT) accuracy decreases when more texts describing approvals are taken into account, as line of therapy or stage of cancer is expressed with slightly different formulations from one approval to the next. Deep learning accuracies generally increase when more texts describing approvals are added. Marked changes appear on the right-hand side of the curve, following expert’s intervention to homogenize the labeling and the addition of one target class.</p>
          </caption>
          <graphic xlink:href="formative_v7i1e44876_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Current 5-fold cross-validated accuracy scores are reported for line of therapy and cancer stage classification classes, and 5-fold cross-validated <italic>F</italic><sub>1</sub>-scores are reported for the Clinical Trials Named Entity Recognition task.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="400"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Line of therapy, %</td>
                <td>Cancer Stage, %</td>
                <td>Clinical Trial, %</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Fine-tuned BERT<sup>a</sup> model</td>
                <td>61</td>
                <td>56</td>
                <td>87</td>
              </tr>
              <tr valign="top">
                <td>Rule-based approach</td>
                <td>60</td>
                <td>74</td>
                <td>—<sup>b</sup></td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>BERT: bidirectional encoder representations from transformers.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>Not available.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Model Interpretability</title>
        <p>Model interpretability is key in deep learning algorithms, and models whose results are well understood can be preferred to less interpretable, higher-accuracy models [<xref ref-type="bibr" rid="ref44">44</xref>]. We use LIME [<xref ref-type="bibr" rid="ref44">44</xref>] to understand what the classification models see in the data.</p>
        <p>LIME results are generally easy to interpret, which builds confidence in the models. For line of therapy, the word “first” appeared repeatedly as the most important word for the class “First line,” and the word “maintenance” appeared often as the most important word for the class “Maintenance/Consolidation.” <xref rid="figure3" ref-type="fig">Figure 3</xref> illustrates this observation on 1 typical example for the class “First line” (top). On the left-hand side of <xref rid="figure3" ref-type="fig">Figure 3</xref>, LIME displays scores corresponding to individual classes, and the highest score is the BERT-based model’s result. When the highest score is close to 1, the choice is unambiguous for the model (<xref rid="figure3" ref-type="fig">Figure 3</xref>, top, 99% for class First line). In other circumstances, the choice is more balanced (<xref rid="figure3" ref-type="fig">Figure 3</xref>, bottom), and in this second example, the model fails to predict the correct class. In the middle part of <xref rid="figure3" ref-type="fig">Figure 3</xref>, LIME displays words leading to decision with the most important word at the top and the least important word at the bottom. This ranking was obtained by LIME through deletion of randomly selected words in the text and the reevaluation of the final score. Words appearing with the color of the class reinforce the decision taken by the algorithm, while words displayed in blue weaken the decision. In the right part of <xref rid="figure3" ref-type="fig">Figure 3</xref>, LIME displays the input text and highlights the most important words.</p>
        <p>Besides these straightforward cases, other words appear as important which are a lot less intuitive. For example, “Korea” was often identified as important in first line and “carcinoma” in second line. These examples show that biases can appear when the model is applied to a new data set. We think that these biases will disappear or be attenuated when the number of inputs increases, something to be checked over time. Since a subject matter expert is involved to correct the results of the algorithm before they are used in the internal software, these possible biases are appropriately handled in the project (see section Deployment to production).</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Model interpretability by LIME algorithm. Top: typical LIME results for first line of therapy; bottom: example where the model fails. Left: scores for each category. The highest score corresponds to the model’s results. A highest score close to one is an unambiguous decision, while a lower highest score is a less certain decision. Middle: most important words, where positive values increase the model’s score, and negative values decrease it. Right: input text, important words are highlighted.</p>
          </caption>
          <graphic xlink:href="formative_v7i1e44876_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>NER Task: Clinical Study</title>
        <p>To identify trials leading to an approval, we adopted an NER algorithm [<xref ref-type="bibr" rid="ref45">45</xref>]. We selected 5-fold cross-validated <italic>F</italic><sub>1</sub>-score as a metric for this task, in agreement with recent literature [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref26">26</xref>] for NER tasks. <italic>F</italic><sub>1</sub>-scores are the harmonic mean of precision and recall and are classically used as a measure of success in NER tasks (unbalanced problems, where accuracy is not sufficient as a metric) [<xref ref-type="bibr" rid="ref45">45</xref>]. We also applied the hyperparameters grid search strategy described for the classification tasks. In this project, we found that concatenating the BioMedTracker data set with another data set from the literature [<xref ref-type="bibr" rid="ref46">46</xref>-<xref ref-type="bibr" rid="ref48">48</xref>] and solving for all end points simultaneously was necessary to get a 5-fold cross-validated <italic>F</italic><sub>1</sub>-score of 87%, as reported in <xref ref-type="table" rid="table3">Table 3</xref>. <xref ref-type="table" rid="table4">Table 4</xref> illustrates this process and summarizes the number of entities per class available in the merged data set when the merge is done with the wnut data set [<xref ref-type="bibr" rid="ref48">48</xref>]. The improved scores are in agreement with previous work [<xref ref-type="bibr" rid="ref49">49</xref>-<xref ref-type="bibr" rid="ref52">52</xref>], which reports that “multitasking” improves NER results. However, multitask learning generally leads to a few percent increase in <italic>F</italic><sub>1</sub>-score. In this study, the <italic>F</italic><sub>1</sub>-score is null when we only use the BioMedTracker data set, and it reaches 87% when we concatenate this data set with one of the conll, ncbi, or wnut data sets [<xref ref-type="bibr" rid="ref46">46</xref>-<xref ref-type="bibr" rid="ref48">48</xref>].</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Number of entities per class when the data set is merged with the wnut [<xref ref-type="bibr" rid="ref48">48</xref>] data set for simultaneous named entity recognition problem resolution. The data set from this study contains only 1 entity type, named clinical trial in the table. All other entity types come from the wnut [<xref ref-type="bibr" rid="ref48">48</xref>] data set.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="400"/>
            <col width="600"/>
            <thead>
              <tr valign="top">
                <td>Metric</td>
                <td>Entries per class in the train data set, n</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>person</td>
                <td>470</td>
              </tr>
              <tr valign="top">
                <td>location</td>
                <td>74</td>
              </tr>
              <tr valign="top">
                <td>corporation</td>
                <td>34</td>
              </tr>
              <tr valign="top">
                <td>product</td>
                <td>114</td>
              </tr>
              <tr valign="top">
                <td>creative work</td>
                <td>104</td>
              </tr>
              <tr valign="top">
                <td>group</td>
                <td>39</td>
              </tr>
              <tr valign="top">
                <td>clinical trial</td>
                <td>345</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Deployment to Production</title>
        <p><xref rid="figure4" ref-type="fig">Figure 4</xref> illustrates the deployment to production of the 3 models described above. New texts describing approvals were collected automatically from the BioMedTracker API [<xref ref-type="bibr" rid="ref18">18</xref>]. Predicted labels were calculated using both the rule-based text mining approach and the deep learning approach described above. The algorithm leading to the highest accuracy was selected, and its results are displayed.</p>
        <p>The data set was then released both in internal software and to subject matter experts performing the labeling. Results that correspond to predictions are explicitly flagged as predictions to the user.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Full workflow as deployed in preproduction phase. New texts describing approvals are collected automatically from the BioMedTracker API [<xref ref-type="bibr" rid="ref18">18</xref>]. Predicted labels are calculated using both the basic text mining approach and the deep learning approach. The algorithm leading to the highest accuracy is selected. The data set is then released both in the internal custom-made software and to subject matter experts performing the labeling. API: application programming interface; ErbB2: erythroblastic oncogene B; GUI: graphical user interface; HER2: human epidermal growth factor receptor 2; ML: machine learning.</p>
          </caption>
          <graphic xlink:href="formative_v7i1e44876_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Results</title>
        <p>We have developed and put into production 3 deep learning models corresponding to fine-tuned versions of the BERT model. Each model is designed to automatically analyze free text describing approvals taken out of the BioMedTracker database and answer one of the following questions: (1) Which line of therapy has the compound been approved for? (2) Which stage of cancer has the compound been approved for? (3) Which clinical trials have supported this approved indication? The first 2 questions have been addressed as classification tasks, while the third question was addressed as an NER task. For this purpose, we have used publicly available packages that allow fine-tuning the BERT model with relative ease [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>], and we have used published grid search strategies for the hyperparameters [<xref ref-type="bibr" rid="ref1">1</xref>].</p>
        <p>Current scores of 5-fold cross-validated accuracy were 61% and 56% for line of therapy and cancer stage, respectively, and 87% 5-fold cross-validated <italic>F</italic><sub>1</sub>-scores for clinical trial. We have compared a rule-based approach for line of therapy and cancer stage, whose current scores are 60% and 74%, respectively.</p>
        <p>The tasks described in this paper are challenging because they rely on a variety of subtly different text formulations. Hence, machine learning results help focus the analysis of the subject matter expert. For example, they help identify quickly unambiguous cases (top of <xref rid="figure3" ref-type="fig">Figure 3</xref>): the model scores high (99% for class “First line”), and the highlighted words indicate the reason for the decision (the words “first-line treatment” are highlighted in the text). The second example at the bottom of <xref rid="figure3" ref-type="fig">Figure 3</xref> is more ambiguous, and the subject matter expert can focus on the analysis. Overall, the 3 machine learning models enable subject matter experts to leverage the results for deeper analysis and to accelerate information retrieval in a crowded clinical environment such as oncology.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>The main limitation of the application of deep learning to the BioMedTracker data set is the size of the labeled training data set, which currently is equal to 433 texts describing approvals. More training instances will become available when additional drug targets are considered or when new approval descriptions are stored in the BioMedTracker database.</p>
        <p>It also seems that our problem can be considered a complex problem if we take as a comparison point data sets from the literature used in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Indeed, when we add more entry texts, accuracies increase slowly, at a rate similar to the Yahoo! Answers data set (40% accuracy with 200 entry texts and 77% accuracy for all 1.4 million texts).</p>
        <p>This small number of training instances leads to relatively low scores for the 2 classification tasks: the current 5-fold cross-validated accuracies for line of therapy and stage of cancer are 61% and 56%, respectively. However, these accuracies are still much better than random choice alone because each model comprises 5 different classes.</p>
        <p>Mitigation of these low accuracies for downstream, dependent systems is handled by the production pipeline, since a subject matter expert verifies and corrects the automatic labels produced by the deep learning model so as to return reliable results to end users.</p>
        <p>Despite the lower accuracies seen for the classification tasks, subject matter experts reported that the labeling experience was improved by the presence of model predictions; even for a human, it is a nontrivial task to assess the approved populations for a large number of event descriptions.</p>
      </sec>
      <sec>
        <title>Comparison With Previous Work</title>
        <p>In this work, we address the problem of extracting information for competitive intelligence. NLP tools have been widely applied to extract information from electronic health records [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref53">53</xref>-<xref ref-type="bibr" rid="ref55">55</xref>]. Even though the targets can be similar, for example, cancer stage or line of therapy, the nature of the documents is different, a lot less detailed in our case, and a new methodology is needed.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>We have described the development and application of 3 deep learning models, fine-tuned from BERT [<xref ref-type="bibr" rid="ref1">1</xref>]. They aim at extracting structured information from unstructured text, aiding information extraction and visualizations in downstream systems. The first model classifies the text describing the approval (<xref rid="figure1" ref-type="fig">Figure 1</xref>) in 1 of 5 categories corresponding to line of therapy. The second model performs the same task for cancer stage. The third model identifies trials in the paragraph only if they lead to the approval. We compared the results of these deep learning models to rule-based approaches for line of therapy and cancer stage.</p>
        <p>In our case, although much better than random, accuracies achieved are insufficient for automation, and human intervention is necessary. We describe how we implement human intervention, which leads to a process that is effective for the users, subject matter experts, and machine learning engineers.</p>
        <p>Accuracies are expected to improve through time as more training data become available. However, in the meantime, subject matter experts already find these results to be an insightful guide to labeling, saving much-needed time for extracting this information to support clinical insights and decision-making.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Comparison with benchmark datasets.</p>
        <media xlink:href="formative_v7i1e44876_app1.doc" xlink:title="DOC File , 316 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BERT</term>
          <def>
            <p>Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">NER</term>
          <def>
            <p>named entity recognition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>We thank AstraZeneca for funding this project.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>Data sets used in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> are publicly available and relevant links are provided. BioMedTracker data are proprietary and are not allowed to be made public. Data for this study may be requested and purchased from BioMedTracker.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>All authors work for AstraZeneca. They hold stock options in AstraZeneca, except GD and RA.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>MW</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on 11 Oct 2018</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1810.04805"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Kaiser</surname>
              <given-names>Ł</given-names>
            </name>
            <name name-style="western">
              <surname>Polosukhin</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Attention is all you need</article-title>
          <year>2017</year>
          <conf-name>Advances in Neural Information Processing Systems 30</conf-name>
          <conf-date>December 4-9, 2017</conf-date>
          <conf-loc>Long Beach, CA, USA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Torrey</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shavlik</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Olivas</surname>
              <given-names>ES</given-names>
            </name>
            <name name-style="western">
              <surname>Guerrero</surname>
              <given-names>JDM</given-names>
            </name>
            <name name-style="western">
              <surname>Martinez-Sober</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Magdalena-Benedito</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>López</surname>
              <given-names>AJS</given-names>
            </name>
          </person-group>
          <article-title>Transfer learning</article-title>
          <source>Handbook of Research on Machine Learning Applications and Trends: Algorithms, Methods, and Techniques</source>
          <year>2010</year>
          <publisher-loc>Hershey, PA, United States</publisher-loc>
          <publisher-name>IGI Global</publisher-name>
          <fpage>242</fpage>
          <lpage>264</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title>
          <source>Bioinformatics</source>
          <year>2020</year>
          <volume>36</volume>
          <issue>4</issue>
          <fpage>1234</fpage>
          <lpage>1240</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/bioinformatics/article/36/4/1234/5566506?login=false"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id>
          <pub-id pub-id-type="medline">31501885</pub-id>
          <pub-id pub-id-type="pii">5566506</pub-id>
          <pub-id pub-id-type="pmcid">PMC7703786</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shivade</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Raghavan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Fosler-Lussier</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Embi</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Elhadad</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>SB</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>AM</given-names>
            </name>
          </person-group>
          <article-title>A review of approaches to identifying patient phenotype cohorts using electronic health records</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2014</year>
          <volume>21</volume>
          <issue>2</issue>
          <fpage>221</fpage>
          <lpage>230</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/jamia/article/21/2/221/2909214?login=false"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2013-001935</pub-id>
          <pub-id pub-id-type="medline">24201027</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2013-001935</pub-id>
          <pub-id pub-id-type="pmcid">PMC3932460</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Ou</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Chandwani</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Black</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Temporal phenotyping by mining healthcare data to derive lines of therapy for cancer</article-title>
          <source>J Biomed Inform</source>
          <year>2019</year>
          <volume>100</volume>
          <fpage>103335</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.sciencedirect.com/science/article/pii/S1532046419302540?via%3Dihub"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2019.103335</pub-id>
          <pub-id pub-id-type="medline">31689549</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(19)30254-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McCowan</surname>
              <given-names>IA</given-names>
            </name>
            <name name-style="western">
              <surname>Moore</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Bowman</surname>
              <given-names>RV</given-names>
            </name>
            <name name-style="western">
              <surname>Clarke</surname>
              <given-names>BE</given-names>
            </name>
            <name name-style="western">
              <surname>Duhig</surname>
              <given-names>EE</given-names>
            </name>
            <name name-style="western">
              <surname>Fry</surname>
              <given-names>M-J</given-names>
            </name>
          </person-group>
          <article-title>Collection of cancer stage data by classifying free-text medical reports</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2007</year>
          <volume>14</volume>
          <issue>6</issue>
          <fpage>736</fpage>
          <lpage>745</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/jamia/article/14/6/736/752957?login=false"/>
          </comment>
          <pub-id pub-id-type="doi">10.1197/jamia.M2130</pub-id>
          <pub-id pub-id-type="medline">17712093</pub-id>
          <pub-id pub-id-type="pii">M2130</pub-id>
          <pub-id pub-id-type="pmcid">PMC2213490</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yim</surname>
              <given-names>W-W</given-names>
            </name>
            <name name-style="western">
              <surname>Kwan</surname>
              <given-names>SW</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Yetisgen</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Classification of hepatocellular carcinoma stages from free-text clinical and radiology reports</article-title>
          <year>2017</year>
          <conf-name>AMIA Annual Symposium Proceedings</conf-name>
          <conf-date>November 6-8, 2017</conf-date>
          <conf-loc>Washington Hilton Hotel, Washington, DC</conf-loc>
          <publisher-name>American Medical Informatics Association</publisher-name>
          <fpage>1858</fpage>
          <lpage>1867</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Lawley</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hansen</surname>
              <given-names>DP</given-names>
            </name>
            <name name-style="western">
              <surname>Bowman</surname>
              <given-names>RV</given-names>
            </name>
            <name name-style="western">
              <surname>Clarke</surname>
              <given-names>BE</given-names>
            </name>
            <name name-style="western">
              <surname>Duhig</surname>
              <given-names>EE</given-names>
            </name>
            <name name-style="western">
              <surname>Colquist</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Symbolic rule-based classification of lung cancer stages from free-text pathology reports</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2010</year>
          <volume>17</volume>
          <issue>4</issue>
          <fpage>440</fpage>
          <lpage>445</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/jamia/article/17/4/440/866997?login=false"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.2010.003707</pub-id>
          <pub-id pub-id-type="medline">20595312</pub-id>
          <pub-id pub-id-type="pii">17/4/440</pub-id>
          <pub-id pub-id-type="pmcid">PMC2995652</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Automatic extraction of lung cancer staging information from computed tomography reports: deep learning approach</article-title>
          <source>JMIR Med Inform</source>
          <year>2021</year>
          <volume>9</volume>
          <issue>7</issue>
          <fpage>e27955</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2021/7/e27955"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/27955</pub-id>
          <pub-id pub-id-type="medline">34287213</pub-id>
          <pub-id pub-id-type="pii">v9i7e27955</pub-id>
          <pub-id pub-id-type="pmcid">PMC8339987</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>CancerBERT: a cancer domain-specific language model for extracting breast cancer phenotypes from electronic health records</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2022</year>
          <volume>29</volume>
          <issue>7</issue>
          <fpage>1208</fpage>
          <lpage>1216</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/jamia/article/29/7/1208/6554005"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocac040</pub-id>
          <pub-id pub-id-type="medline">35333345</pub-id>
          <pub-id pub-id-type="pii">6554005</pub-id>
          <pub-id pub-id-type="pmcid">PMC9196678</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Davidoff</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Seal</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Edelman</surname>
              <given-names>MJ</given-names>
            </name>
          </person-group>
          <article-title>Chemotherapy and survival benefit in elderly patients with advanced non-small-cell lung cancer</article-title>
          <source>J Clin Oncol</source>
          <year>2010</year>
          <volume>28</volume>
          <issue>13</issue>
          <fpage>2191</fpage>
          <lpage>2197</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ascopubs.org/doi/10.1200/JCO.2009.25.4052"/>
          </comment>
          <pub-id pub-id-type="doi">10.1200/JCO.2009.25.4052</pub-id>
          <pub-id pub-id-type="medline">20351329</pub-id>
          <pub-id pub-id-type="pii">JCO.2009.25.4052</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Datta</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bernstam</surname>
              <given-names>EV</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>A frame semantic overview of NLP-based information extraction for cancer-related EHR notes</article-title>
          <source>J Biomed Inform</source>
          <year>2019</year>
          <volume>100</volume>
          <fpage>103301</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.sciencedirect.com/science/article/pii/S1532046419302217?via%3Dihub"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2019.103301</pub-id>
          <pub-id pub-id-type="medline">31589927</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(19)30221-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cary</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Church</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Eckert</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ouyang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Haggstrom</surname>
              <given-names>DA</given-names>
            </name>
          </person-group>
          <article-title>Development of a novel algorithm to identify staging and lines of therapy for bladder cancer</article-title>
          <source>J Clin Oncol</source>
          <year>2017</year>
          <volume>35</volume>
          <issue>15 suppl</issue>
          <fpage>e18235</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ascopubs.org/doi/abs/10.1200/JCO.2017.35.15_suppl.e18235"/>
          </comment>
          <pub-id pub-id-type="doi">10.1200/jco.2017.35.15_suppl.e18235</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Mosesso</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Lane</surname>
              <given-names>KA</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Griffith</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ou</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Dexter</surname>
              <given-names>PR</given-names>
            </name>
          </person-group>
          <article-title>An automated line-of-therapy algorithm for adults with metastatic non-small cell lung cancer: validation study using blinded manual chart review</article-title>
          <source>JMIR Med Inform</source>
          <year>2021</year>
          <volume>9</volume>
          <issue>10</issue>
          <fpage>e29017</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2021/10/e29017"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/29017</pub-id>
          <pub-id pub-id-type="medline">34636730</pub-id>
          <pub-id pub-id-type="pii">v9i10e29017</pub-id>
          <pub-id pub-id-type="pmcid">PMC8548977</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kiritchenko</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>de Bruijn</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Carini</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sim</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>ExaCT: automatic extraction of clinical trial characteristics from journal publications</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2010</year>
          <volume>10</volume>
          <fpage>56</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/1472-6947-10-56"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1472-6947-10-56</pub-id>
          <pub-id pub-id-type="medline">20920176</pub-id>
          <pub-id pub-id-type="pii">1472-6947-10-56</pub-id>
          <pub-id pub-id-type="pmcid">PMC2954855</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Marshall</surname>
              <given-names>IJ</given-names>
            </name>
            <name name-style="western">
              <surname>Nye</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kuiper</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Noel-Storr</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Marshall</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Maclean</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Soboczenski</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Nenkova</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wallace</surname>
              <given-names>BC</given-names>
            </name>
          </person-group>
          <article-title>Trialstreamer: a living, automatically updated database of clinical trial reports</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2020</year>
          <volume>27</volume>
          <issue>12</issue>
          <fpage>1903</fpage>
          <lpage>1912</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/jamia/article/27/12/1903/5907063?login=false"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocaa163</pub-id>
          <pub-id pub-id-type="medline">32940710</pub-id>
          <pub-id pub-id-type="pii">5907063</pub-id>
          <pub-id pub-id-type="pmcid">PMC7727361</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="web">
          <article-title>Informa, December 2021</article-title>
          <source>Biomedtracker</source>
          <access-date>2023-05-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.biomedtracker.com/">https://www.biomedtracker.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tkachenko</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Malyuk</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shevchenko</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Holmanyuk</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Liubimov</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <source>Label Studio: Data Labeling Software</source>
          <year>2020</year>
          <access-date>2023-05-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/heartexlabs/label-studio">https://github.com/heartexlabs/label-studio</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wolf</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Debut</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sanh</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chaumond</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Delangue</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Moi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cistac</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Rault</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Louf</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Funtowicz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Davison</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shleifer</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>von Platen</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jernite</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Plu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Le Scao</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gugger</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Drame</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lhoest</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Rush</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Transformers: state-of-the-art natural language processing</article-title>
          <year>2020</year>
          <conf-name>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</conf-name>
          <conf-date>October 2020</conf-date>
          <conf-loc>Online</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>38</fpage>
          <lpage>45</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2020.emnlp-demos.6/"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2020.emnlp-demos.6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lhoest</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>del Moral</surname>
              <given-names>AV</given-names>
            </name>
            <name name-style="western">
              <surname>Jernite</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Thakur</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>von Platen</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Patil</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chaumond</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Drame</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Plu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tunstall</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Davison</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Šaško</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chhablani</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Malik</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Brandeis</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Le Scao</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sanh</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Patry</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>McMillan-Major</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schmid</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Gugger</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Delangue</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Matussière</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Debut</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bekman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Cistac</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Goehringer</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mustar</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Lagunas</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Rush</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Wolf</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Datasets: a community library for natural language processing</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on 07 Sep 2021</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2109.02846"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2109.02846</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sanh</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Debut</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Chaumond</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wolf</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on 02 Oct 2019</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1910.01108"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1910.01108</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gururangan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Marasović</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Swayamdipta</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lo</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Beltagy</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Downey</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>NA</given-names>
            </name>
          </person-group>
          <article-title>Don't stop pretraining: adapt language models to domains and tasks</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on 23 Apr 2020</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2004.10964"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2020.acl-main.740</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="web">
          <article-title>Text Classification</article-title>
          <source>GitHub</source>
          <access-date>2023-05-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb">https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="web">
          <article-title>Token Classification</article-title>
          <source>GitHub</source>
          <access-date>2023-05-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb">https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
          <article-title>Text Classification</article-title>
          <source>Papers with Code</source>
          <access-date>2023-05-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://paperswithcode.com/task/text-classification">https://paperswithcode.com/task/text-classification</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="web">
          <article-title>datasets</article-title>
          <source>Hugging Face</source>
          <access-date>2023-05-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/datasets">https://huggingface.co/datasets</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>LeCun</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Character-level convolutional networks for text</article-title>
          <source>Advances in neural information processing systems</source>
          <year>2015</year>
          <volume>28</volume>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="web">
          <article-title>datasets ag_news</article-title>
          <source>Hugging Face</source>
          <access-date>2023-06-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/datasets/ag_news">https://huggingface.co/datasets/ag_news</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="book">
          <source>Dbpedia: A nucleus for a web of open data</source>
          <year>2007</year>
          <month>11</month>
          <day>11</day>
          <publisher-loc>Berlin, Heidelberg</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>722</fpage>
          <lpage>735</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/chapter/10.1007/978-3-540-76298-0_52">https://link.springer.com/chapter/10.1007/978-3-540-76298-0_52</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="web">
          <article-title>datasets dbpedia_14</article-title>
          <source>Hugging Face</source>
          <access-date>2023-06-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/datasets/dbpedia_14">https://huggingface.co/datasets/dbpedia_14</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Roth</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Learning question classifiers</article-title>
          <year>2002</year>
          <conf-name>COLING 2002: The 19th International Conference on Computational Linguistics 2002</conf-name>
          <conf-date>2002</conf-date>
          <conf-loc>Taipei, Taiwan</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="web">
          <article-title>Datasets Trec</article-title>
          <source>Hugging Face</source>
          <access-date>2023-06-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/datasets/trec">https://huggingface.co/datasets/trec</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cachopo</surname>
              <given-names>AM</given-names>
            </name>
          </person-group>
          <article-title>Improving methods for single-label text categorization</article-title>
          <source>Instituto Superior Técnico</source>
          <year>2007</year>
          <month>07</month>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://citeseerx.ist.psu.edu/document?repid=rep1&#38;type=pdf&#38;doi=d8d6afd46d75b8115afd0b22c19fbdf020cbd754">https://citeseerx.ist.psu.edu/document?repid=rep1&#38;type=pdf&#38;doi=d8d6afd46d75b8115afd0b22c19fbdf020cbd754</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <article-title>datasets newsgroup</article-title>
          <source>Hugging Face</source>
          <access-date>2023-06-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/datasets/newsgroup">https://huggingface.co/datasets/newsgroup</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="confproc">
          <article-title>Learning word vectors for sentiment analysis</article-title>
          <year>2011</year>
          <month>06</month>
          <conf-name>Proceedings of the 49th annual meeting of the association for computational linguistics: Human language technologies</conf-name>
          <conf-date>2011</conf-date>
          <conf-loc>Portland, Oregon, USA</conf-loc>
          <fpage>142</fpage>
          <lpage>150</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="web">
          <article-title>datasets imdb</article-title>
          <source>Hugging Face</source>
          <access-date>2023-06-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/datasets/imdb">https://huggingface.co/datasets/imdb</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Adamic</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bakshy</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Ackerman</surname>
              <given-names>MS</given-names>
            </name>
          </person-group>
          <article-title>Knowledge sharing and yahoo answers: everyone knows something</article-title>
          <year>2008</year>
          <month>04</month>
          <day>21</day>
          <conf-name>Proceedings of the 17th international conference on World Wide Web</conf-name>
          <conf-date>April, 2008</conf-date>
          <conf-loc>Beijing</conf-loc>
          <fpage>665</fpage>
          <lpage>674</lpage>
          <pub-id pub-id-type="doi">10.1145/1367497.1367587</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="web">
          <article-title>datasets yahoo_answers_topics</article-title>
          <source>Hugging Face</source>
          <access-date>2023-06-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/datasets/yahoo_answers_topics">https://huggingface.co/datasets/yahoo_answers_topics</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="web">
          <article-title>datasets conll2003</article-title>
          <source>Hugging Face</source>
          <access-date>2023-06-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/datasets/conll2003">https://huggingface.co/datasets/conll2003</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="web">
          <article-title>datasets ncbi_disease</article-title>
          <source>Hugging Face</source>
          <access-date>2023-06-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/datasets/ncbi_disease">https://huggingface.co/datasets/ncbi_disease</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="web">
          <article-title>datasets wnut_17</article-title>
          <source>Hugging Face</source>
          <access-date>2023-06-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/datasets/wnut_17">https://huggingface.co/datasets/wnut_17</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Jiao</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Yueping</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>Robin J</given-names>
            </name>
            <name name-style="western">
              <surname>Sciaky</surname>
              <given-names>Daniela</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>Chih-Hsuan</given-names>
            </name>
            <name name-style="western">
              <surname>Leaman</surname>
              <given-names>Robert</given-names>
            </name>
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>Allan Peter</given-names>
            </name>
            <name name-style="western">
              <surname>Mattingly</surname>
              <given-names>Carolyn J</given-names>
            </name>
            <name name-style="western">
              <surname>Wiegers</surname>
              <given-names>Thomas C</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Zhiyong</given-names>
            </name>
          </person-group>
          <article-title>BioCreative V CDR task corpus: a resource for chemical disease relation extraction</article-title>
          <source>Database (Oxford)</source>
          <year>2016</year>
          <volume>2016</volume>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/27161011"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/baw068</pub-id>
          <pub-id pub-id-type="medline">27161011</pub-id>
          <pub-id pub-id-type="pii">baw068</pub-id>
          <pub-id pub-id-type="pmcid">PMC4860626</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ribeiro</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Guestrin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>"Why should I trust you?" Explaining the predictions of any classifier</article-title>
          <year>2016</year>
          <conf-name>KDD '16: Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>
          <conf-date>August 13-17, 2016</conf-date>
          <conf-loc>San Francisco, California, USA</conf-loc>
          <fpage>1135</fpage>
          <lpage>1144</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/abs/10.1145/2939672.2939778"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/2939672.2939778</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="web">
          <source>Named Entity Recognition (NER)</source>
          <access-date>2023-05-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://paperswithcode.com/task/named-entity-recognition-ner">https://paperswithcode.com/task/named-entity-recognition-ner</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tjong Kim Sang</surname>
              <given-names>EF</given-names>
            </name>
            <name name-style="western">
              <surname>De Meulder</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Introduction to the CoNLL-2003 shared task: language-independent named entity recognition</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on 12 Jun 2003</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/cs/0306050"/>
          </comment>
          <pub-id pub-id-type="doi">10.3115/1119176.1119195</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Doğan</surname>
              <given-names>RI</given-names>
            </name>
            <name name-style="western">
              <surname>Leaman</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>NCBI disease corpus: a resource for disease name recognition and concept normalization</article-title>
          <source>J Biomed Inform</source>
          <year>2014</year>
          <volume>47</volume>
          <fpage>1</fpage>
          <lpage>10</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.sciencedirect.com/science/article/pii/S1532046413001974?via%3Dihub"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2013.12.006</pub-id>
          <pub-id pub-id-type="medline">24393765</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(13)00197-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC3951655</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Derczynski</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Nichols</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>van Erp</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Limsopatham</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Results of the WNUT2017 shared task on novel and emerging entity recognition</article-title>
          <year>2017</year>
          <conf-name>Proceedings of the 3rd Workshop on Noisy User-Generated Text</conf-name>
          <conf-date>September 2017</conf-date>
          <conf-loc>Copenhagen, Denmark</conf-loc>
          <fpage>140</fpage>
          <lpage>147</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/w17-4418</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Caruana</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Multitask learning</article-title>
          <source>Machine learning</source>
          <year>1997</year>
          <volume>28</volume>
          <issue>1</issue>
          <fpage>41</fpage>
          <lpage>75</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/article/10.1023/A:1007379606734"/>
          </comment>
          <pub-id pub-id-type="doi">10.1023/A:1007379606734</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Collobert</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Weston</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A unified architecture for natural language processing: deep neural networks with multitask learning</article-title>
          <year>2008</year>
          <conf-name>Proceedings of the 25th International Conference on Machine Learning</conf-name>
          <conf-date>July 5-9, 2008</conf-date>
          <conf-loc>Helsinki, Finland</conf-loc>
          <publisher-loc>New York, NY, United States</publisher-loc>
          <publisher-name>Association for Computing Machinery</publisher-name>
          <fpage>160</fpage>
          <lpage>167</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/abs/10.1145/1390156.1390177"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/1390156.1390177</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Crichton</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Pyysalo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chiu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Korhonen</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>A neural network multi-task learning approach to biomedical named entity recognition</article-title>
          <source>BMC Bioinformatics</source>
          <year>2017</year>
          <volume>18</volume>
          <issue>1</issue>
          <fpage>368</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-017-1776-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12859-017-1776-8</pub-id>
          <pub-id pub-id-type="medline">28810903</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12859-017-1776-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC5558737</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zitnik</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Langlotz</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Cross-type biomedical named entity recognition with deep multi-task learning</article-title>
          <source>Bioinformatics</source>
          <year>2019</year>
          <volume>35</volume>
          <issue>10</issue>
          <fpage>1745</fpage>
          <lpage>1752</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/bioinformatics/article/35/10/1745/5126922?login=false"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/bty869</pub-id>
          <pub-id pub-id-type="medline">30307536</pub-id>
          <pub-id pub-id-type="pii">5126922</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>GK</given-names>
            </name>
            <name name-style="western">
              <surname>Masanz</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ogren</surname>
              <given-names>PV</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kipper-Schuler</surname>
              <given-names>KC</given-names>
            </name>
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>CG</given-names>
            </name>
          </person-group>
          <article-title>Mayo clinical text analysis and knowledge extraction system (cTAKES): architecture, component evaluation and applications</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2010</year>
          <volume>17</volume>
          <issue>5</issue>
          <fpage>507</fpage>
          <lpage>513</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/jamia/article/17/5/507/830823?login=false"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.2009.001560</pub-id>
          <pub-id pub-id-type="medline">20819853</pub-id>
          <pub-id pub-id-type="pii">17/5/507</pub-id>
          <pub-id pub-id-type="pmcid">PMC2995668</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Soysal</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Pakhomov</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>CLAMP - a toolkit for efficiently building customized clinical natural language processing pipelines</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2018</year>
          <volume>25</volume>
          <issue>3</issue>
          <fpage>331</fpage>
          <lpage>336</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://academic.oup.com/jamia/article/25/3/331/4657212?login=false"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocx132</pub-id>
          <pub-id pub-id-type="medline">29186491</pub-id>
          <pub-id pub-id-type="pii">4657212</pub-id>
          <pub-id pub-id-type="pmcid">PMC7378877</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="web">
          <source>I2E is Developed and Marketed by IQVIA Ltd</source>
          <access-date>2023-05-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.linguamatics.com/">http://www.linguamatics.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
