<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v7i1e39077</article-id>
      <article-id pub-id-type="pmid">36853741</article-id>
      <article-id pub-id-type="doi">10.2196/39077</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>German Medical Named Entity Recognition Model and Data Set Creation Using Machine Translation and Word Alignment: Algorithm Development and Validation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Roller</surname>
            <given-names>Roland</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhao</surname>
            <given-names>Chang</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Frei</surname>
            <given-names>Johann</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>IT Infrastructure for Translational Medical Research</institution>
            <institution>University of Augsburg</institution>
            <addr-line>Alter Postweg 101</addr-line>
            <addr-line>Augsburg, 86159</addr-line>
            <country>Germany</country>
            <phone>49 17691464136</phone>
            <email>johann.frei@informatik.uni-augsburg.de</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0323-0904</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Kramer</surname>
            <given-names>Frank</given-names>
          </name>
          <degrees>Prof Dr</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2857-7122</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>IT Infrastructure for Translational Medical Research</institution>
        <institution>University of Augsburg</institution>
        <addr-line>Augsburg</addr-line>
        <country>Germany</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Johann Frei <email>johann.frei@informatik.uni-augsburg.de</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>28</day>
        <month>2</month>
        <year>2023</year>
      </pub-date>
      <volume>7</volume>
      <elocation-id>e39077</elocation-id>
      <history>
        <date date-type="received">
          <day>27</day>
          <month>4</month>
          <year>2022</year>
        </date>
        <date date-type="rev-request">
          <day>22</day>
          <month>8</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>11</day>
          <month>9</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>3</day>
          <month>11</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Johann Frei, Frank Kramer. Originally published in JMIR Formative Research (https://formative.jmir.org), 28.02.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2023/1/e39077" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Data mining in the field of medical data analysis often needs to rely solely on the processing of unstructured data to retrieve relevant data. For German natural language processing, few open medical neural named entity recognition (NER) models have been published before this work. A major issue can be attributed to the lack of German training data.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We developed a synthetic data set and a novel German medical NER model for public access to demonstrate the feasibility of our approach. In order to bypass legal restrictions due to potential data leaks through model analysis, we did not make use of internal, proprietary data sets, which is a frequent veto factor for data set publication.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>The underlying German data set was retrieved by translation and word alignment of a public English data set. The data set served as a foundation for model training and evaluation. For demonstration purposes, our NER model follows a simple network architecture that is designed for low computational requirements.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The obtained data set consisted of 8599 sentences including 30,233 annotations. The model achieved a class frequency–averaged <italic>F</italic><sub>1</sub> score of 0.82 on the test set after training across 7 different NER types. Artifacts in the synthesized data set with regard to translation and alignment induced by the proposed method were exposed. The annotation performance was evaluated on an external data set and measured in comparison with an existing baseline model that has been trained on a dedicated German data set in a traditional fashion. We discussed the drop in annotation performance on an external data set for our simple NER model. Our model is publicly available.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>We demonstrated the feasibility of obtaining a data set and training a German medical NER model by the exclusive use of public training data through our suggested method. The discussion on the limitations of our approach includes ways to further mitigate remaining problems in future work.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>named entity recognition</kwd>
        <kwd>information extraction</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Overview</title>
        <p>Despite continuous efforts to transform the storage and processing of medical data in health care systems into a framework of machine-readable highly structured data, implementation designs that aim to fulfill such requirements are only slowly gaining traction in the clinical health care environment. In addition to common technical challenges, physicians tend to bypass or completely avoid inconvenient data input interfaces, which enforce structured data formats, by encoding relevant information as free-form unstructured text [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>].</p>
        <p>Electronic data capturing systems are developed to improve the situation of structured data capturing. Yet their primary focus lies on clinical studies. The involvement of these systems needs to be designed in early stages and requires active software management and maintenance. Such electronic data capturing solutions are commonly considered in the context of clinical research but are largely omitted in non–research-centric health care services, and paper-based solutions are preferred [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref4">4</xref>].</p>
        <p>Because of the rise of data mining and big data analysis, finding and understanding novel relationships of disease, disease-indicating biomarkers, drug effects, and other input variables require large-scale data acquisition and collection. This induces additional pressure on finding and exploring new possible data sources.</p>
        <p>Although new data sets can be designed and created for specific use cases, the amount of obtained data might be very limited and not sufficient for modern data-driven methods. Furthermore, such data collection efforts can turn out to be rather inefficient in terms of time and work involved in creating new data sets with respect to the number of acquired data samples.</p>
        <p>In contrast, unstructured data of sources from legacy systems and non–research-centric health care, referred to as second use, offer a potential alternative. However, techniques for information extraction and retrieval, mainly from the natural language processing (NLP) domain, need to be applied to transform raw data into structured information.</p>
        <p>While the availability of existing NLP models in English, and other non–NLP-based techniques, for medical use cases is the focus of active research, the situation of medical NLP models for non-English languages is less satisfying. As the performance of an NLP model often depends on its dedicated target language, most models cannot be shared and reused easily in different languages but require retraining on new data from the desired target language.</p>
        <p>In particular, for the case of detection of entities like prescribed drugs and level or frequency of dosage from German medical documents like doctors' letters, few open and publicly available models have been published. We attribute this to two main contributing factors:</p>
        <list list-type="order">
          <list-item>
            <p>Lack of public German data sets: Most open public data sets are designed for English data only. Until 2020, no such dedicated German data set had been published. Specifically in the context of clinical data, legal restrictions and privacy policies prevent the collection and publication of German data sets. Data-driven NLP research for medical applications uses largely internal data for training and evaluation. In addition to the data set itself, to model relevant text features with supervised learning, high-quality annotations of the data set are essential for robust model performance.</p>
          </list-item>
          <list-item>
            <p>Protection of sensitive data and privacy concerns: Although few works have been published that present data-driven models for German texts, the weights of these models have not been openly published. Because respective training data have been used in a nonanonymized or pseudonymized fashion, the publication of the model weights inherently comes at the risk of possible data leakage issues through training data extraction [<xref ref-type="bibr" rid="ref5">5</xref>] from the model, potentially exposing sensitive information like patient names or ID numbers.</p>
          </list-item>
        </list>
        <p>In this paper, we aim to tackle the scarcity issue of anonymous training data and publicly available medical German NLP models. Our main contributions are as follows:</p>
        <list list-type="bullet">
          <list-item>
            <p>Automated retrieval of German data set: We propose a method to create a custom data set for our target language, based on a public English data set. In addition, we apply a strategy to preserve relevant annotation information across languages.</p>
          </list-item>
          <list-item>
            <p>Training of medical German NLP model component: We trained and built a named entity recognition (NER) component on the custom data set. The model pipeline supports multiple types of medical entities.</p>
          </list-item>
          <list-item>
            <p>Evaluation and publication of the NLP component: The retrieved data set and the NER model were evaluated as part of an NLP pipeline. The trained model is publicly available for further use by third parties.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Related Work</title>
        <p>In recent years, substantial progress has been made in the area of NLP, which can mostly be attributed to the joint use of large amounts of data and their processing through large language models like BERT (Bidirectional Encoder Representations from Transformers) [<xref ref-type="bibr" rid="ref6">6</xref>] and its (bio)medical-specific models [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. Such elements display a straightforward way to encode representations of semantic information for further processing in downstream tasks like text classification or text segmentation. These works mostly focus on the English language because of available language corpora like scientific texts from PubMed or specifically designed corpora such as <italic>n2c2</italic> [<xref ref-type="bibr" rid="ref13">13</xref>] (with annotations) and <italic>MIMIC-III</italic> [<xref ref-type="bibr" rid="ref14">14</xref>]. For German, only a few works such as <italic>GGPONC</italic> [<xref ref-type="bibr" rid="ref15">15</xref>] and <italic>BRONCO</italic> [<xref ref-type="bibr" rid="ref16">16</xref>] have been published in recent years as data sets that carry annotation information. Other German data sets [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>] lack annotation information. Moreover, the <italic>Technical-Laymen</italic> [<xref ref-type="bibr" rid="ref19">19</xref>] corpus provides an annotated corpus, yet it is based on crawled texts from nonprofessional online forums. Various other German medical text corpora exist [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref31">31</xref>] as a basis for certain NLP and information extraction use cases but are inaccessible for public distribution.</p>
        <p>In the field of NLP systems for German medical texts, <italic>medSynDiKATe</italic> [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>] approaches information extraction on pathological finding reports by parsing and mapping text elements to (semi)automatically build knowledge representation structures. Processing of pathological findings in German has also been applied to the tasks of sentence classification [<xref ref-type="bibr" rid="ref22">22</xref>].</p>
        <p>In the context of patient records, a hybrid relation extraction (RE) and NER parsing approach using the <italic>SProUT</italic> [<xref ref-type="bibr" rid="ref34">34</xref>] parser has been proposed [<xref ref-type="bibr" rid="ref35">35</xref>]; however, the entity tags lack medical relevance. A similar general NER for nonmedical entity tags has been applied to enable the deidentification of clinical records [<xref ref-type="bibr" rid="ref36">36</xref>] using statistical and regex-based models through the <italic>StanfordNLP</italic> parser [<xref ref-type="bibr" rid="ref37">37</xref>].</p>
        <p>Neural methods have been shown to perform well on certain NLP tasks. In particular, convolutional neural network (CNN) approaches for RE [<xref ref-type="bibr" rid="ref38">38</xref>-<xref ref-type="bibr" rid="ref40">40</xref>] have become popular in recent years. For German texts, the performance of various methods has been investigated for medical NER tasks [<xref ref-type="bibr" rid="ref41">41</xref>], such as CNN, long short-term memory, or support vector machine–based models. In this context, the text processing platform <italic>mEx</italic> [<xref ref-type="bibr" rid="ref42">42</xref>] uses CNN-based methods for solving medical NER in German texts. Similar to our work, <italic>mEx</italic> is built on <italic>SpaCy</italic> [<xref ref-type="bibr" rid="ref43">43</xref>] but provides custom models for other NLP tasks such as RE. However, the platform has been partially trained on internal clinical data, and thus, its statistical models have not been openly published and may only be used under certain legal restrictions on request. An updated version has been published [<xref ref-type="bibr" rid="ref44">44</xref>], yet the models can only be retrieved on request under a usage agreement. As additional work, <italic>GGPONC</italic> (release 2.0) [<xref ref-type="bibr" rid="ref45">45</xref>] provides a baseline model on request. For a more exhaustive survey on non-English clinical NLP in general, we point to [<xref ref-type="bibr" rid="ref46">46</xref>].</p>
        <p>With respect to obtaining cross-lingual annotation information, the basic concept of projecting label data in language pairs via word alignment has been discussed in various NLP contexts [<xref ref-type="bibr" rid="ref47">47</xref>-<xref ref-type="bibr" rid="ref57">57</xref>]. For medical use cases, little research exists [<xref ref-type="bibr" rid="ref49">49</xref>], with focus on English and Chinese data and models. Yet German medical contexts remain largely unexplored.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>In this section, we first describe our method to synthesize the data set and then describe the used NER model for German text tagging. The entire pipeline is illustrated in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Illustration of the data set creation and natural language processing (NLP) model training process. The initial English <italic>n2c2</italic> data set is transformed into a synthetic German data set. The data set is used for training an NLP NER model. NER: named entity recognition.</p>
          </caption>
          <graphic xlink:href="formative_v7i1e39077_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Custom Data Set Creation</title>
        <p>We relied on the publicly available training data from the <italic>n2c2 NLP 2018 Track 2</italic> [<xref ref-type="bibr" rid="ref13">13</xref>] data set (adverse drug event [ADE] and medication extraction challenge) as our initial source data set. The data were composed of 303 annotated text documents that have been postprocessed by the editor for anonymization purposes to explicitly mask sensitive privacy-concerning information. They featured the annotation labels <italic>Drug</italic>, <italic>Route</italic>, <italic>Reason</italic>, <italic>Strength</italic>, <italic>Frequency</italic>, <italic>Duration</italic>, <italic>Form</italic>, <italic>Dosage</italic>, and <italic>ADE</italic>.</p>
        <p>To transform the data into a semantically plausible text, we identified the type and text span of text masks such that we were able to replace the text masks by sampling type-compatible data randomly from a set of sample entries. During the sampling stage, depending on the type of mask, text samples for entities like dates, names, years, or phone numbers were generated and inserted into the text. Because every replacement step might affect the location of the text annotation labels as provided by the character-wise start and stop indices, these label annotation indices must be updated accordingly. For further preprocessing, we split up the text into single sentences such that we could omit all sentences with no associated annotation labels.</p>
        <p>For automated translation, we made use of the open source <italic>fairseq</italic> (version 0.10.2) [<xref ref-type="bibr" rid="ref58">58</xref>] model architecture. <italic>fairseq</italic> is an implementation of a neural machine translation model that supports the automatic translation of sequential text data using pretrained models. For our purposes, we ran the <italic>transformer.wmt19.en-de</italic> pretrained model to translate our set of English sentences into German because the model shows a strong BLEU (BiLingual Evaluation Understudy) translation score for English-German translation tasks [<xref ref-type="bibr" rid="ref59">59</xref>] while maintaining its simplicity for deployment.</p>
        <p>The reconstructive mapping of the annotation labels from the English source text to the German target text was tackled by <italic>fast_align</italic> [<xref ref-type="bibr" rid="ref60">60</xref>]. <italic>fast_align</italic> is an unsupervised method for aligning words from 2 sentences of source and target language. We chose <italic>fast_align</italic> for its low resource footprint and because its simple statistical model aligns sentences quickly. We projected the annotation labels onto the translated German sentences using the word-level mapping between the corresponding English and German sentence to obtain new annotation label indices in the German sentence. In nonmedical contexts, similar work on non-German target languages exists (eg, [<xref ref-type="bibr" rid="ref57">57</xref>]).</p>
        <p>The word alignment mapping tends to induce errors in situations of sentences with irregular structures such as tabular or itemized text sections. We mitigated the issue and potential subsequent error propagation by inspecting the structure of the word mapping matrix <italic>A</italic>:</p>
        <disp-formula>
          <graphic xlink:href="formative_v7i1e39077_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>In situations where <italic>fast_align</italic> fails to establish a meaningful mapping between the source and target sentence, it can be observed that the resulting mapping table collapses to a highly nondiagonal matrix structure, as illustrated by the following example:</p>
        <disp-formula>
          <graphic xlink:href="formative_v7i1e39077_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>Severely ill-aligned word mapping matrices can be detected and removed from the final set of sentences by applying the simple filter decision rule:</p>
        <disp-formula>
          <graphic xlink:href="formative_v7i1e39077_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>where the average distance between a nonzero entry and the diagonal line from <italic>A</italic><sub>1,1</sub> to <inline-graphic xlink:href="formative_v7i1e39077_fig7.png" xlink:type="simple" mimetype="image"/> is evaluated, given <italic>w<sub>en</sub></italic> as the number of words in the English sentence and <italic>w<sub>de</sub></italic> as the number of words in the German sentence. If the value exceeds the threshold <italic>t</italic>, the sentence pair is disregarded for the final set of sentences.</p>
        <p>The word mapping matrices describe a nonsymmetric cross-correspondence between 2 language-dependent token sets, which enables the projection of tokens within the English annotation span onto the semantically corresponding tokens in the German translation text. Therefore, the annotation label indices for the English text can be resolved to the actual indices for the translated German text at a character level.</p>
      </sec>
      <sec>
        <title>NER Model Architecture</title>
        <p>For the buildup of our NER model as part of an NLP pipeline, we use <italic>SpaCy</italic> as an NLP framework for training and inference. In comparison with state-of-the-art models, we select a lightweight non–transformer-based model because it serves primarily as a demonstration model and can be trained without significant compute costs.</p>
        <list list-type="bullet">
          <list-item>
            <p>Embedding: The word tokens are embedded by Bloom embeddings [<xref ref-type="bibr" rid="ref61">61</xref>] where different linguistic features are concatenated into a single vector and passed through <italic>n</italic><sub>embed</sub> separate dense layers, followed by a final max pooling and layer norm step. This step enables the model to learn meaningful linear combinations of single input feature embeddings while reducing the number of dimensions.</p>
          </list-item>
          <list-item>
            <p>Context-aware token encoding: To extract context-aware features that are able to capture larger token window sizes, the final token embedding is passed through a multilayered convolutional network. Each convolution step consists of the convolution itself and the following max-pooling operation to keep the dimensions constrained. For each convolution step, a residual (skip) connection is added to allow the model to pass intermediate data representations from previous layers to subsequent layers.</p>
          </list-item>
          <list-item>
            <p>NER parsing: For each encoded token, a corresponding feature-token vector is precomputed in advance by a dense layer. For parsing, the document is processed token-wise in a stateful manner. For NER, the state at a given position consists of the current token, the first token of the last entity, and the previous token by index. Given the state, the feature-position vectors are retrieved by indexing the values from the precomputed data and summed up. A dense layer is applied to predict the next action. Depending on the action, the current token is annotated and the next state is generated until the entire document has been parsed.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>Because of the nature of our proposed method, our work does not involve data or human subject research that could potentially violate basic human ethics in a narrow sense.</p>
        <p>The public data approach shifts the responsibility of privacy-preserving measures to the data set publisher. We assume that the <italic>n2c2</italic> data set has been deidentified correctly and no privacy-related information can be retrieved anymore.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Data Set Synthesis</title>
        <p>The source data set consists of 303 documents from the <italic>n2c2</italic> training data set. As an initial preprocessing step, we needed to replace the anonymization masks with meaningful regular text data to reconstruct the natural appearance of the text and alleviate a potential data set bias that leads to gaps between the data set and real-world data. For numerical data, we could retrieve mask replacements by random sampling. Similar to numerical data, dates and years are sampled and formatted to common date formats. For semantically relevant data types, we used the Python package <italic>Faker</italic>. The package maintains lists of plausible data of various types such as first names, last names, addresses, or phone numbers. We made use of these data entries for certain types of anonymization masks.</p>
        <p>To obtain our custom data set, we split the texts from the original data set into single sentences using the sentence splitting algorithm from <italic>SpaCy</italic>. The English sentences were translated into German by the <italic>fairseq</italic> library with beam search (<italic>b</italic>=5). The sentence-wise word alignments were obtained by <italic>fast_align</italic> and cleaned up by our filter decision rule (<italic>t</italic>=1.8). To determine this particular hyperparameter, we sampled 10 ill-aligned samples without applying the filter and gradually lowered the threshold <italic>t</italic> until all 10 samples were detected by the decision rule.</p>
        <p>The labels <italic>Reason</italic> and <italic>ADE</italic> were removed from the data set because their definitions are rather ambiguous in general contexts beyond the scope of the initial source data set.</p>
        <p>Our final custom data set consisted of 8599 sentence pairs, annotated with 30,233 annotations of 7 different class labels. The different class labels and their corresponding frequency in absolute numbers are shown in <xref ref-type="table" rid="table1">Table 1</xref>. The German sentences consisted of 172,695 tokens in total.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>The model performance scores per named entity recognition (NER) tag and the annotation distribution in the custom data set in absolute numbers.<sup>a</sup></p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="180"/>
            <col width="230"/>
            <col width="170"/>
            <col width="210"/>
            <col width="210"/>
            <thead>
              <tr valign="top">
                <td>NER tag</td>
                <td>Precision (%)</td>
                <td>Recall (%)</td>
                <td><italic>F</italic><sub>1</sub> score (%)</td>
                <td>Label tags, n</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Drug</td>
                <td>67.33</td>
                <td>66.17</td>
                <td>66.74</td>
                <td>8305</td>
              </tr>
              <tr valign="top">
                <td>Strength</td>
                <td>92.34</td>
                <td>90.99</td>
                <td>91.66</td>
                <td>4071</td>
              </tr>
              <tr valign="top">
                <td>Route</td>
                <td>89.93</td>
                <td>90.14</td>
                <td>90.04</td>
                <td>4549</td>
              </tr>
              <tr valign="top">
                <td>Form</td>
                <td>91.94</td>
                <td>89.24</td>
                <td>90.57</td>
                <td>4238</td>
              </tr>
              <tr valign="top">
                <td>Dosage</td>
                <td>87.83</td>
                <td>87.57</td>
                <td>87.70</td>
                <td>409</td>
              </tr>
              <tr valign="top">
                <td>Frequency</td>
                <td>79.14</td>
                <td>76.92</td>
                <td>78.01</td>
                <td>5242</td>
              </tr>
              <tr valign="top">
                <td>Duration</td>
                <td>67.86</td>
                <td>52.78</td>
                <td>59.37</td>
                <td>3419</td>
              </tr>
              <tr valign="top">
                <td>Total</td>
                <td>82.31</td>
                <td>80.79</td>
                <td>81.54</td>
                <td>30,233</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>The evaluation is based on the separated test set. Total scores are aggregated by label-frequency-weighted averaging. The total data set consists of 8599 sentence samples (172,695 tokens). A single-tag sample may span multiple tokens.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Translation and Alignment Artifacts</title>
        <p>We sampled and selected a set of sentence pairs to investigate and illustrate the artifacts that we could observe in the synthesized data set with regard to translation as well as word alignment. The selection of samples is presented in <xref rid="figure2" ref-type="fig">Figure 2</xref>. Overall, we found the alignment and translation quality acceptable in sentences of simple structure and semantics (sample 1). However, the translation tended to fail in abbreviations such as PO (samples 2 and 5) as well as in text with uncommon syntax such as uppercase text (sample 4) or domain-dependent context (samples 2 and 6). To our surprise, the translation model was able to translate the sequence “One (1)” correctly in sample 5 but failed for the same term in sample 2. We attribute this to the context-sensitive, neural black-box model of the translation engine. In terms of alignment, most tokens were well aligned in sentence pairs of simple syntax and structure. Alignment errors could be found in sentence pairs with different sentence structures in English and German (sample 3) where our filter rule does not apply.</p>
        <p>Because of parsing and alignment issues, we found that annotations were discarded (samples 5 and 6) in cases of single-token annotations that start as the first word of the sentence. This artifact affected primarily the label class <italic>Drug</italic>.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Selection of sampled sentence pairs from the synthetic data set. Most samples show correct outputs. The translation and word alignment artifacts occur in unusual syntactical contexts (translation) or complex sentence structures (alignment). Both the English sentence (top) and its translated sentence (bottom) are depicted. Annotations with failed German correspondence resolution are not shown in the English sentence.</p>
          </caption>
          <graphic xlink:href="formative_v7i1e39077_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>NER Model Training and Evaluation</title>
        <p>For training, we used our custom German data set as our training data and split the data set into a training set (80%, 6879 sentence samples), validation set, and test set (both 10%, 860 samples). The training setup followed the default NER setup of <italic>SpaCy</italic>; the Adam optimizer with a learning rate of 0.001 with decay (β<sub>1</sub>=.9, β<sub>2</sub>=.999) was used. The training took 10 minutes on an Intel i7-8665U CPU.</p>
        <p>The model performance during training is shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>. The corresponding performance scores were evaluated on the validation set (as part of the training set).</p>
        <p>We selected the final model based on the highest <italic>F</italic><sub>1</sub> score on the validation set. The performance of the selected model was evaluated on the test set per NER tag as well as in total. The evaluation concerns the token-wise IOB (Inside, Outside, Begin)-action prediction. The results are shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Training scores on validation set (as part of the training set): evaluation scores are computed at every 200th iteration.</p>
          </caption>
          <graphic xlink:href="formative_v7i1e39077_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Comparison to the English Baseline Data Set</title>
        <p>To empirically quantify the error propagation through translation and word alignment, we retrained an equivalent model with all English sentences from our sentence pairs. The evaluation strategy remained similar to the strategy for the scores from <xref ref-type="table" rid="table1">Table 1</xref>. The scores are reported in <xref ref-type="table" rid="table2">Table 2</xref>. The English model shows results comparable to those of the German model for all labels except for <italic>Drug</italic>.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Test set performance scores per named entity recognition (NER) tag of the model trained on the English sentences from the obtained data set in absolute numbers.<sup>a</sup></p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="180"/>
            <col width="230"/>
            <col width="170"/>
            <col width="210"/>
            <col width="210"/>
            <thead>
              <tr valign="top">
                <td>NER tag</td>
                <td>Precision (%)</td>
                <td>Recall (%)</td>
                <td><italic>F</italic><sub>1</sub> score (%)</td>
                <td>German <italic>F</italic><sub>1</sub> score (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Drug</td>
                <td>80.94</td>
                <td>82.02</td>
                <td>81.47</td>
                <td>−14.73 (66.74)</td>
              </tr>
              <tr valign="top">
                <td>Strength</td>
                <td>89.02</td>
                <td>90.12</td>
                <td>89.57</td>
                <td>2.09 (91.66)</td>
              </tr>
              <tr valign="top">
                <td>Route</td>
                <td>85.55</td>
                <td>95.08</td>
                <td>90.06</td>
                <td>−0.02 (90.04)</td>
              </tr>
              <tr valign="top">
                <td>Form</td>
                <td>94.36</td>
                <td>87.12</td>
                <td>90.60</td>
                <td>−0.03 (90.57)</td>
              </tr>
              <tr valign="top">
                <td>Dosage</td>
                <td>89.41</td>
                <td>89.97</td>
                <td>89.69</td>
                <td>−1.99 (87.70)</td>
              </tr>
              <tr valign="top">
                <td>Frequency</td>
                <td>80.55</td>
                <td>80.15</td>
                <td>80.35</td>
                <td>−2.34 (78.01)</td>
              </tr>
              <tr valign="top">
                <td>Duration</td>
                <td>62.50</td>
                <td>51.02</td>
                <td>56.18</td>
                <td>3.19 (59.37)</td>
              </tr>
              <tr valign="top">
                <td>Total</td>
                <td>85.14</td>
                <td>85.82</td>
                <td>85.48</td>
                <td>−3.94 (81.54)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>The evaluation is similar to the results from <xref ref-type="table" rid="table1">Table 1</xref>. Total scores are aggregated by label-frequency-weighted averaging. For comparison, the <italic>F</italic><sub>1</sub> score differences of the German model to the English model are provided.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Advanced Evaluation and Model Comparison on a Separated Data Set</title>
        <p>To further estimate the performance scores on a separated data set, we evaluated the model on a custom out-of-distribution (OoD) data set. The data set was created internally by clinical physicians by manually writing down and annotating 30 fake sentences (<italic>Internal Gold</italic>). For model comparison, we used the baseline model from <italic>GGPONC</italic> (release 2.0) [<xref ref-type="bibr" rid="ref45">45</xref>] and evaluated its annotation performance on the label class we considered equivalent to our <italic>Drug</italic> label class. In comparison with our model (approximately 5 MB), the <italic>GGPONC</italic> model is orders of magnitude larger due to its use of pretrained transformers (approximately 500 MB). The results are given in <xref ref-type="table" rid="table3">Table 3</xref>. The <italic>F</italic><sub>1</sub> score corresponds to the character-wise label classification performance.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Evaluation on our out-of-distribution data set with the related GGPONC baseline model for reference: the model performance drops significantly for certain infrequent label classes.<sup>a</sup></p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="270"/>
            <col width="200"/>
            <col width="150"/>
            <col width="200"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td colspan="2">
                Data set and GERNERMED </td>
                <td>Sample, n</td>
                <td><italic>F</italic><sub>1</sub> score (%)</td>
                <td><italic>GGPONC</italic> baseline</td>
                <td><italic>F</italic><sub>1</sub> score (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="6">
                  <bold>Internal Gold (30 sentences)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Drug</td>
                <td>36</td>
                <td>54.48</td>
                <td>Chemicals_Drug</td>
                <td>56.07</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Strength</td>
                <td>37</td>
                <td>67.70</td>
                <td>No equivalent</td>
                <td>N/A<sup>b</sup></td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Form</td>
                <td>19</td>
                <td>23.83</td>
                <td>No equivalent</td>
                <td>N/A</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Dosage</td>
                <td>4</td>
                <td>2.47</td>
                <td>No equivalent</td>
                <td>N/A</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Frequency</td>
                <td>20</td>
                <td>48.14</td>
                <td>No equivalent</td>
                <td>N/A</td>
              </tr>
              <tr valign="top">
                <td/>
                <td>Duration</td>
                <td>3</td>
                <td>0</td>
                <td>No equivalent</td>
                <td>N/A</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>The <italic>F</italic><sub>1</sub> scores are evaluated as performance scores of character-wise label classifications. The label classes Dosage and Duration occur less frequently and therefore their scores are less reliable.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>N/A: not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>We were able to obtain a synthetic German data set for medical purposes from an English data set by the method proposed in previous sections. As expected, the translation and alignment method introduced artifacts into the output data, but our model was still able to yield proper performance on the test set after training on the training set from our synthetic data set.</p>
        <p>Separate training on the English sentence pairs yielded similar results for all label classes except for the <italic>Drug</italic> label class. Because it can be assumed that the inherent structure and vocabulary bias from the data set are preserved through translation, the drop for <italic>Drug</italic> can be explained by 2 joint reasons. First, the lexical properties of a translated <italic>Drug</italic> word to its source word can differ frequently and severely. More basic tokens, such as those from the label <italic>Strength</italic>, lack language-dependent elements and can be aligned in a robust manner. Second, in the case of other label classes, phrases are often less diverse or are used repeatedly because of the data set bias. This enables robust alignments from <italic>fast_align</italic> due to their statistically frequent correlations.</p>
        <p>When evaluating and comparing our model with another model on an OoD data set, we observed a drop in performance scores across labels. Aside from the label classes of low sample size, we attributed the gaps to the data set shift, which was not captured well by the underlying model architecture. The model cannot rely on high-level semantic embeddings but relies on basic structural patterns, and thus, it works well on the test set but yields less accurate results on independent data sets. The model is intentionally kept primitive as it is meant to serve as a demonstration of feasibility and does not make use of a pretrained transformer.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Because non–drug-related label classes are not available as annotation data in most external data sets, we cannot independently quantify the drop in performance on these label classes. In the context of this work, it further remains unclear how impactful the use of pretrained transformer networks will be in terms of annotation performance on external data sets if it is trained on the synthesized data set. In this work, the choice of the statistical model and the slim neural model architecture, in particular, is attributed to its small computational footprint while being able to achieve satisfying results. In addition, the NER pipeline of <italic>SpaCy</italic> explicitly induces inductive bias through hand-crafted feature extraction during the token embedding stage. However, the focus of our work lies on the presentation of the translation and alignment method for data set synthesis and its demonstration data for training purposes in the German medical context. We consider an exhaustive hyperparameter optimization as well as the use of a transformer-based model as future work.</p>
        <p>In general, the availability of German NER models and methods for medical and clinical domains still leaves much to be desired as described in previous sections. German data sets in this domain have been largely kept unpublished in the past. However, the implications of this lack of availability are significantly broader. In the case of unpublished NLP models, it renders independent reproduction of results and fair comparisons impossible. In the case of lacking data sets or inconsistent annotations, novel competitive data-driven techniques cannot be developed or validated easily.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this paper, we presented our method for obtaining a synthesized data set and neural NER model for German medical text as an open, publicly available model. We trained the model on our custom German data set from a publicly available English data set. We described the method to extract and postprocess texts from the masked English texts and generate German texts by translation and cross-lingual token alignment. In addition, the NER model architecture was described and the final model performance was evaluated for single NER tags as well as its performance in total. We discussed the observed issues with the synthesized data set and the performance drop through data set shifts. The advanced evaluation was done on an independent OoD data set. We believe that our method is a well-suited foundation for future work in the context of German medical entity recognition and natural language processing. In particular, addressing the use of a primitive NER model architecture remains an important point for future work. The need for independent data sets to further improve the situation for the research community on this matter has been highlighted.</p>
        <p>The model and the test set corpus are available on GitHub [<xref ref-type="bibr" rid="ref62">62</xref>].</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ADE</term>
          <def>
            <p>adverse drug event</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CNN</term>
          <def>
            <p>convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">NER</term>
          <def>
            <p>named entity recognition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">OoD</term>
          <def>
            <p>out-of-distribution</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">RE</term>
          <def>
            <p>relation extraction</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work is a part of the Data Integration for Future Medicine (DIFUTURE) project funded by the German Ministry of Education and Research (Bundesministerium für Bildung und Forschung, BMBF) grant FKZ01ZZ1804E.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Spasic</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Nenadic</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Clinical text data in machine learning: systematic review</article-title>
          <source>JMIR Med Inform</source>
          <year>2020</year>
          <month>03</month>
          <day>31</day>
          <volume>8</volume>
          <issue>3</issue>
          <fpage>e17984</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2020/3/e17984/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/17984</pub-id>
          <pub-id pub-id-type="medline">32229465</pub-id>
          <pub-id pub-id-type="pii">v8i3e17984</pub-id>
          <pub-id pub-id-type="pmcid">PMC7157505</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Percha</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Modern clinical text mining: a guide and review</article-title>
          <source>Annu Rev Biomed Data Sci</source>
          <year>2021</year>
          <month>07</month>
          <day>20</day>
          <volume>4</volume>
          <fpage>165</fpage>
          <lpage>187</lpage>
          <pub-id pub-id-type="doi">10.1146/annurev-biodatasci-030421-030931</pub-id>
          <pub-id pub-id-type="medline">34465177</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krüger-Brand</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Osterloh</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Elektronische Patientenakte: Viele Modelle - noch keine Strategie</article-title>
          <source>Dtsch Arztebl Int</source>
          <year>2017</year>
          <volume>114</volume>
          <issue>43</issue>
          <fpage>A1960</fpage>
          <lpage>A1966</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pohlmann</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kunz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ose</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Winkler</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Brandner</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Poss-Doering</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Szecsenyi</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wensing</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Digitalizing health services by implementing a personal electronic health record in Germany: qualitative analysis of fundamental prerequisites from the perspective of selected experts</article-title>
          <source>J Med Internet Res</source>
          <year>2020</year>
          <volume>22</volume>
          <issue>1</issue>
          <fpage>e15102</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2020/1/e15102/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/15102</pub-id>
          <pub-id pub-id-type="medline">32012060</pub-id>
          <pub-id pub-id-type="pii">v22i1e15102</pub-id>
          <pub-id pub-id-type="pmcid">PMC7016629</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Carlini</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tramèr</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Wallace</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Jagielski</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Herbert-Voss</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Erlingsson</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Oprea</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Raffel</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Extracting training data from large language models</article-title>
          <source>Usenix Association</source>
          <year>2021</year>
          <access-date>2023-01-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.usenix.org/conference/usenixsecurity21/presentation/carlini-extracting">https://www.usenix.org/conference/usenixsecurity21/presentation/carlini-extracting</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>MW</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)</source>
          <year>2019</year>
          <month>06</month>
          <conf-name>Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name>
          <conf-date>June 2019</conf-date>
          <conf-loc>Minneapolis, MN</conf-loc>
          <fpage>4171</fpage>
          <lpage>4186</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Transfer learning in biomedical natural language processing: an evaluation of BERT and ELMo on ten benchmarking datasets</article-title>
          <source>Proceedings of the 18th BioNLP Workshop and Shared Task</source>
          <year>2019</year>
          <conf-name>18th BioNLP Workshop and Shared Task</conf-name>
          <conf-date>August 2019</conf-date>
          <conf-loc>Florence, Italy</conf-loc>
          <fpage>58</fpage>
          <lpage>65</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/w19-5006</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rasmy</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Xiang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhi</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Med-BERT: pretrained contextualized embeddings on large-scale structured electronic health records for disease prediction</article-title>
          <source>NPJ Digit Med</source>
          <year>2021</year>
          <month>05</month>
          <day>20</day>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>86</fpage>
          <pub-id pub-id-type="doi">10.1038/s41746-021-00455-y</pub-id>
          <pub-id pub-id-type="medline">34017034</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-021-00455-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC8137882</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title>
          <source>Bioinformatics</source>
          <year>2020</year>
          <month>02</month>
          <day>15</day>
          <volume>36</volume>
          <issue>4</issue>
          <fpage>1234</fpage>
          <lpage>1240</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31501885"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id>
          <pub-id pub-id-type="medline">31501885</pub-id>
          <pub-id pub-id-type="pii">5566506</pub-id>
          <pub-id pub-id-type="pmcid">PMC7703786</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alsentzer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boag</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>WH</given-names>
            </name>
            <name name-style="western">
              <surname>Jindi</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Naumann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>McDermott</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Publicly available clinical BERT embeddings</article-title>
          <source>Proceedings of the 2nd Clinical Natural Language Processing Workshop</source>
          <year>2019</year>
          <conf-name>2nd Clinical Natural Language Processing Workshop</conf-name>
          <conf-date>June 2019</conf-date>
          <conf-loc>Minneapolis, MN</conf-loc>
          <fpage>72</fpage>
          <lpage>78</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aclweb.org/anthology/W19-1909"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/w19-1909</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Beltagy</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Lo</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Cohan</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>SciBERT: pretrained language model for scientific text</article-title>
          <source>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing</source>
          <year>2019</year>
          <conf-name>EMNLP-IJCNLP</conf-name>
          <conf-date>November 2019</conf-date>
          <conf-loc>Hong Kong, China</conf-loc>
          <fpage>3615</fpage>
          <lpage>3620</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/d19-1371</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Rawat</surname>
              <given-names>BPS</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Fine-tuning Bidirectional Encoder Representations From Transformers (BERT)-based models on large-scale electronic health record notes: an empirical study</article-title>
          <source>JMIR Med Inform</source>
          <year>2019</year>
          <month>09</month>
          <day>12</day>
          <volume>7</volume>
          <issue>3</issue>
          <fpage>e14830</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2019/3/e14830/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/14830</pub-id>
          <pub-id pub-id-type="medline">31516126</pub-id>
          <pub-id pub-id-type="pii">v7i3e14830</pub-id>
          <pub-id pub-id-type="pmcid">PMC6746103</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Henry</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Buchan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Filannino</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Stubbs</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>2018 n2c2 shared task on adverse drug events and medication extraction in electronic health records</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2020</year>
          <month>01</month>
          <day>01</day>
          <volume>27</volume>
          <issue>1</issue>
          <fpage>3</fpage>
          <lpage>12</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31584655"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocz166</pub-id>
          <pub-id pub-id-type="medline">31584655</pub-id>
          <pub-id pub-id-type="pii">5581277</pub-id>
          <pub-id pub-id-type="pmcid">PMC7489085</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>The MIMIC-III clinical database</article-title>
          <source>PhysioNet</source>
          <year>2016</year>
          <access-date>2023-01-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://physionet.org/content/mimiciii/1.4/">https://physionet.org/content/mimiciii/1.4/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Borchert</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Lohr</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Modersohn</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Langer</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Follmann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sachs</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Hahn</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Schapranow</surname>
              <given-names>MP</given-names>
            </name>
          </person-group>
          <article-title>GGPONC: a corpus of German medical text with rich metadata based on clinical practice guidelines</article-title>
          <source>Proceedings of the 11th International Workshop on Health Text Mining and Information Analysis</source>
          <year>2020</year>
          <conf-name>11th International Workshop on Health Text Mining and Information Analysis</conf-name>
          <conf-date>November 2020</conf-date>
          <conf-loc>Online</conf-loc>
          <fpage>38</fpage>
          <lpage>48</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/2020.louhi-1.5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kittner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lamping</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rieke</surname>
              <given-names>DT</given-names>
            </name>
            <name name-style="western">
              <surname>Götze</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bajwa</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Jelas</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Rüter</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Hautow</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Sänger</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Habibi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zettwitz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>de Bortoli</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ostermann</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ševa</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Starlinger</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kohlbacher</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Malek</surname>
              <given-names>NP</given-names>
            </name>
            <name name-style="western">
              <surname>Keilholz</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Leser</surname>
              <given-names>U</given-names>
            </name>
          </person-group>
          <article-title>Annotation and initial evaluation of a large annotated German oncological corpus</article-title>
          <source>JAMIA Open</source>
          <year>2021</year>
          <month>04</month>
          <volume>4</volume>
          <issue>2</issue>
          <fpage>ooab025</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33898938"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamiaopen/ooab025</pub-id>
          <pub-id pub-id-type="medline">33898938</pub-id>
          <pub-id pub-id-type="pii">ooab025</pub-id>
          <pub-id pub-id-type="pmcid">PMC8054032</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Suominen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kelly</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Goeuriot</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Krallinger</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Jose</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Yilmaz</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Magalhães</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Castells</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ferro</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Silva</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Martins</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>CLEF eHealth Evaluation Lab 2020</article-title>
          <source>Advances in Information Retrieval: 42nd European Conference on IR Research, ECIR 2020, Lisbon, Portugal, April 14–17, 2020, Proceedings, Part II</source>
          <year>2020</year>
          <publisher-loc>Cham</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>587</fpage>
          <lpage>594</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lohr</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Buechel</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hahn</surname>
              <given-names>U</given-names>
            </name>
          </person-group>
          <article-title>Sharing copies of synthetic clinical corpora without physical distribution: a case study to get around IPRs and privacy constraints featuring the German JSYNCC corpus</article-title>
          <source>Proceedings of the Eleventh International Conference on Language Resources and Evaluation</source>
          <year>2018</year>
          <conf-name>LREC 2018</conf-name>
          <conf-date>May 2018</conf-date>
          <conf-loc>Miyazaki, Japan</conf-loc>
          <fpage>7</fpage>
          <lpage>12</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Seiffe</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Marten</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Mikhailov</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schmeier</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Möller</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Roller</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>From witch's shot to music making bones: resources for medical laymen to technical language and vice versa</article-title>
          <source>Proceedings of the Twelfth Language Resources and Evaluation Conference</source>
          <year>2020</year>
          <conf-name>Twelfth Language Resources and Evaluation Conference</conf-name>
          <conf-date>May 2020</conf-date>
          <conf-loc>Marseille, France</conf-loc>
          <fpage>6185</fpage>
          <lpage>6192</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wermter</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hahn</surname>
              <given-names>U</given-names>
            </name>
          </person-group>
          <article-title>An annotated German-language medical text corpus as language resource</article-title>
          <source>Proceedings of the Fourth International Conference on Language Resources and Evaluation</source>
          <year>2004</year>
          <conf-name>LREC</conf-name>
          <conf-date>May 2004</conf-date>
          <conf-loc>Lisbon, Portugal</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fette</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ertl</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wörner</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kluegl</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Störk</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Puppe</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Goltz</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Magnor</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Appelrath</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Matthies</surname>
              <given-names>HK</given-names>
            </name>
            <name name-style="western">
              <surname>Balke</surname>
              <given-names>WT</given-names>
            </name>
            <name name-style="western">
              <surname>Wolf</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Information extraction from unstructured electronic health records and integration into a data warehouse</article-title>
          <source>INFORMATIK</source>
          <year>2012</year>
          <publisher-loc>Bonn</publisher-loc>
          <publisher-name>Gesellschaft für Informatik e.V</publisher-name>
          <fpage>1237</fpage>
          <lpage>1251</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bretschneider</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zillner</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hammon</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Identifying pathological findings in German radiology reports using a syntacto-semantic parsing approach</article-title>
          <source>Proceedings of the 2013 Workshop on Biomedical Natural Language Processing</source>
          <year>2013</year>
          <conf-name>Workshop on Biomedical Natural Language Processing</conf-name>
          <conf-date>August 2013</conf-date>
          <conf-loc>Sofia, Bulgaria</conf-loc>
          <fpage>27</fpage>
          <lpage>35</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Roller</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Seiffe</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mikhailov</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Staeck</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Budde</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Halleck</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>A fine-grained corpus annotation schema of German nephrology records</article-title>
          <source>Proceedings of the Clinical Natural Language Processing Workshop</source>
          <year>2016</year>
          <conf-name>ClinicalNLP</conf-name>
          <conf-date>December 2016</conf-date>
          <conf-loc>Osaka, Japan</conf-loc>
          <fpage>69</fpage>
          <lpage>77</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lohr</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>McDevitt</surname>
              <given-names>DT</given-names>
            </name>
            <name name-style="western">
              <surname>Lutter</surname>
              <given-names>KS</given-names>
            </name>
            <name name-style="western">
              <surname>Roedersheimer</surname>
              <given-names>LR</given-names>
            </name>
            <name name-style="western">
              <surname>Sampson</surname>
              <given-names>MG</given-names>
            </name>
          </person-group>
          <article-title>Operative management of greater saphenous thrombophlebitis involving the saphenofemoral junction</article-title>
          <source>Am J Surg</source>
          <year>1992</year>
          <month>09</month>
          <volume>164</volume>
          <issue>3</issue>
          <fpage>269</fpage>
          <lpage>275</lpage>
          <pub-id pub-id-type="doi">10.1016/s0002-9610(05)81084-0</pub-id>
          <pub-id pub-id-type="medline">1415928</pub-id>
          <pub-id pub-id-type="pii">S0002-9610(05)81084-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kreuzthaler</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schulz</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Detection of sentence boundaries and abbreviations in clinical narratives</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2015</year>
          <volume>15</volume>
          <issue>Suppl 2</issue>
          <fpage>S4</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/1472-6947-15-S2-S4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1472-6947-15-S2-S4</pub-id>
          <pub-id pub-id-type="medline">26099994</pub-id>
          <pub-id pub-id-type="pii">1472-6947-15-S2-S4</pub-id>
          <pub-id pub-id-type="pmcid">PMC4474545</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cotik</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Roller</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Budde</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Negation detection in clinical reports written in German</article-title>
          <source>Proceedings of the Fifth Workshop on Building and Evaluating Resources for Biomedical Text Mining</source>
          <year>2016</year>
          <conf-name>BioTxtM2016</conf-name>
          <conf-date>December 2016</conf-date>
          <conf-loc>Osaka, Japan</conf-loc>
          <fpage>115</fpage>
          <lpage>124</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krebs</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Corovic</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Dietrich</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ertl</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fette</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Kaspar</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Krug</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Stoerk</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Puppe</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Semi-automatic terminology generation for information extraction from German chest X-ray reports</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2017</year>
          <volume>243</volume>
          <fpage>80</fpage>
          <lpage>84</lpage>
          <pub-id pub-id-type="medline">28883175</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hahn</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Matthies</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Lohr</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Löffler</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>3000PA—towards a national reference corpus of German clinical language</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2018</year>
          <volume>247</volume>
          <fpage>26</fpage>
          <lpage>30</lpage>
          <pub-id pub-id-type="medline">29677916</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Miñarro-Giménez</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Cornet</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Jaulent</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>Dewenter</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Thun</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gøeg</surname>
              <given-names>KR</given-names>
            </name>
            <name name-style="western">
              <surname>Karlsson</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Schulz</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Quantitative analysis of manual annotation of clinical text samples</article-title>
          <source>Int J Med Inform</source>
          <year>2019</year>
          <month>03</month>
          <volume>123</volume>
          <fpage>37</fpage>
          <lpage>48</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1386-5056(18)30544-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2018.12.011</pub-id>
          <pub-id pub-id-type="medline">30654902</pub-id>
          <pub-id pub-id-type="pii">S1386-5056(18)30544-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>König</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sander</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Demuth</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Diekmann</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Steinhagen-Thiessen</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Knowledge-based best of breed approach for automated detection of clinical events based on German free text digital hospital discharge letters</article-title>
          <source>PLoS One</source>
          <year>2019</year>
          <volume>14</volume>
          <issue>11</issue>
          <fpage>e0224916</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0224916"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0224916</pub-id>
          <pub-id pub-id-type="medline">31774830</pub-id>
          <pub-id pub-id-type="pii">PONE-D-19-04658</pub-id>
          <pub-id pub-id-type="pmcid">PMC6881027</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Toepfer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Corovic</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Fette</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Klügl</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Störk</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Puppe</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Fine-grained information extraction from German transthoracic echocardiography reports</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2015</year>
          <month>11</month>
          <day>12</day>
          <volume>15</volume>
          <fpage>91</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-015-0215-x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-015-0215-x</pub-id>
          <pub-id pub-id-type="medline">26563260</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-015-0215-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC4643516</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hahn</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Romacker</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schulz</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>MEDSYNDIKATE—a natural language system for the extraction of medical information from findings reports</article-title>
          <source>Int J Med Inform</source>
          <year>2002</year>
          <month>12</month>
          <day>04</day>
          <volume>67</volume>
          <issue>1-3</issue>
          <fpage>63</fpage>
          <lpage>74</lpage>
          <pub-id pub-id-type="doi">10.1016/s1386-5056(02)00053-9</pub-id>
          <pub-id pub-id-type="medline">12460632</pub-id>
          <pub-id pub-id-type="pii">S1386505602000539</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hahn</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Romacker</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schulz</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>How knowledge drives understanding—matching medical ontologies with the needs of medical language processing</article-title>
          <source>Artif Intell Med</source>
          <year>1999</year>
          <month>01</month>
          <volume>15</volume>
          <issue>1</issue>
          <fpage>25</fpage>
          <lpage>51</lpage>
          <pub-id pub-id-type="doi">10.1016/s0933-3657(98)00044-x</pub-id>
          <pub-id pub-id-type="medline">9930615</pub-id>
          <pub-id pub-id-type="pii">S0933-3657(98)00044-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Piskorski</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Homola</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Marciniak</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mykowiecka</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Przepiórkowski</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wolinski</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Kłopotek</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Wierzchoń</surname>
              <given-names>ST</given-names>
            </name>
            <name name-style="western">
              <surname>Trojanowski</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Information extraction for Polish using the SProUT platform</article-title>
          <source>Intelligent Information Processing and Web Mining Proceedings of the International IIS: IIPWM’04 Conference held in Zakopane, Poland, May 17–20, 2004</source>
          <year>2004</year>
          <publisher-loc>Berlin, Heidelberg</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krieger</surname>
              <given-names>HU</given-names>
            </name>
            <name name-style="western">
              <surname>Spurk</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Müller</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Tolxdorff</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Information extraction from German patient records via hybrid parsing and relation extraction strategies</article-title>
          <source>Proceedings of the Ninth International Conference on Language Resources and Evaluation</source>
          <year>2014</year>
          <conf-name>LREC14</conf-name>
          <conf-date>May 2014</conf-date>
          <conf-loc>Reykjavik, Iceland</conf-loc>
          <fpage>2043</fpage>
          <lpage>2048</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Richter-Pechanski</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Riezler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dieterich</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>De-identification of German medical admission notes</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2018</year>
          <volume>253</volume>
          <fpage>165</fpage>
          <lpage>169</lpage>
          <pub-id pub-id-type="medline">30147065</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Surdeanu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bauer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Finkel</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bethard</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>McClosky</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>The Stanford CoreNLP natural language processing toolkit</article-title>
          <source>Proceedings of 52nd Annual Meeting of the Association for Computational Linguistics: System Demonstrations</source>
          <year>2014</year>
          <conf-name>52nd Annual Meeting of the Association for Computational Linguistics: System Demonstrations</conf-name>
          <conf-date>June 2014</conf-date>
          <conf-loc>Baltimore, MD</conf-loc>
          <fpage>55</fpage>
          <lpage>60</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.aclweb.org/anthology/P/P14/P14-5010"/>
          </comment>
          <pub-id pub-id-type="doi">10.3115/v1/p14-5010</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Grishman</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Relation extraction: perspective from convolutional neural networks</article-title>
          <source>Proceedings of the 1st Workshop on Vector Space Modeling for Natural Language Processing</source>
          <year>2015</year>
          <conf-name>1st Workshop on Vector Space Modeling for Natural Language Processing</conf-name>
          <conf-date>June 2015</conf-date>
          <conf-loc>Denver, CO</conf-loc>
          <fpage>39</fpage>
          <lpage>48</lpage>
          <pub-id pub-id-type="doi">10.3115/v1/w15-1506</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sahu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Anand</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Oruganty</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gattu</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Relation extraction from clinical texts using domain invariant convolutional neural network</article-title>
          <source>Proceedings of the 15th Workshop on Biomedical Natural Language Processing</source>
          <year>2016</year>
          <conf-name>15th Workshop on Biomedical Natural Language Processing</conf-name>
          <conf-date>August 2016</conf-date>
          <conf-loc>Berlin, Germany</conf-loc>
          <fpage>206</fpage>
          <lpage>215</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/W16-2928</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Relation classification via convolutional deep neural network</article-title>
          <source>Proceedings of COLING 2014, the 25th International Conference on Computational Linguistics: Technical Papers</source>
          <year>2014</year>
          <conf-name>COLING 2014</conf-name>
          <conf-date>August 2014</conf-date>
          <conf-loc>Dublin, Ireland</conf-loc>
          <fpage>2335</fpage>
          <lpage>2344</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Roller</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rethmeier</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Hübner</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Staeck</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Budde</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Halleck</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Rehm</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Declerck</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Detecting named entities and relations in German clinical reports</article-title>
          <source>Language Technologies for the Challenges of the Digital Age: 27th International Conference, GSCL 2017, Berlin, Germany, September 13-14, 2017, Proceedings</source>
          <year>2017</year>
          <publisher-loc>Cham</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>146</fpage>
          <lpage>154</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Roller</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Alt</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Seiffe</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>mEx - an information extraction platform for German medical text</article-title>
          <source>Proceedings of the 11th International Conference on Semantic Web Applications and Tools for Healthcare and Life Sciences</source>
          <year>2018</year>
          <conf-name>SWAT4HCLS-2018</conf-name>
          <conf-date>December 3-5, 2018</conf-date>
          <conf-loc>Antwerp, Belgium</conf-loc>
          <fpage>3</fpage>
          <lpage>5</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Honnibal</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Montani</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Van Landeghem</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Industrial-strength natural language processing (NLP) with Python</article-title>
          <source>Zenodo</source>
          <access-date>2023-01-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://zenodo.org/record/3701227#.Y9VgXnZBw2w">https://zenodo.org/record/3701227#.Y9VgXnZBw2w</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Roller</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Seiffe</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ayach</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Möller</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Marten</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Mikhailov</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Alt</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Halleck</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Naik</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Duettmann</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Budde</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Information extraction models for German clinical text</article-title>
          <year>2020</year>
          <conf-name>IEEE International Conference on Healthcare Informatics</conf-name>
          <conf-date>November 30-December 3, 2020</conf-date>
          <conf-loc>Oldenburg, Germany</conf-loc>
          <fpage>1</fpage>
          <lpage>2</lpage>
          <pub-id pub-id-type="doi">10.1109/ichi48887.2020.9374385</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Borchert</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Lohr</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Modersohn</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Witt</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Langer</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Follmann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gietzelt</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Arnrich</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Hahn</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Schapranow</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>GGPONC 2.0—the German clinical guideline corpus for oncology: curation workflow, annotation policy, baseline NER taggers</article-title>
          <source>Proceedings of the 13th Language Resources and Evaluation Conference</source>
          <year>2022</year>
          <conf-name>13th Language Resources and Evaluation Conference</conf-name>
          <conf-date>June 2022</conf-date>
          <conf-loc>Marseille, France</conf-loc>
          <fpage>3650</fpage>
          <lpage>3660</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2022.lrec-1.389"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2020.louhi-1.5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Névéol</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dalianis</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Velupillai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Zweigenbaum</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Clinical natural language processing in languages other than English: opportunities and challenges</article-title>
          <source>J Biomed Semantics</source>
          <year>2018</year>
          <month>03</month>
          <day>30</day>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>12</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jbiomedsem.biomedcentral.com/articles/10.1186/s13326-018-0179-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13326-018-0179-8</pub-id>
          <pub-id pub-id-type="medline">29602312</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13326-018-0179-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC5877394</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ehrmann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Turchi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Steinberger</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Building a multilingual named entity-annotated corpus using annotation projection</article-title>
          <source>Proceedings of the International Conference Recent Advances in Natural Language Processing 2011</source>
          <year>2011</year>
          <conf-name>International Conference Recent Advances in Natural Language Processing</conf-name>
          <conf-date>September 2011</conf-date>
          <conf-loc>Hissar, Bulgaria</conf-loc>
          <fpage>118</fpage>
          <lpage>124</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/R11-1017"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mayhew</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tsai</surname>
              <given-names>CT</given-names>
            </name>
            <name name-style="western">
              <surname>Roth</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Cheap translation for cross-lingual named entity recognition</article-title>
          <source>Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2017</year>
          <conf-name>Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>September 2017</conf-date>
          <conf-loc>Copenhagen, Denmark</conf-loc>
          <fpage>2536</fpage>
          <lpage>2545</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/D17-1269"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/d17-1269</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Nah</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>SW</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>JX</given-names>
            </name>
            <name name-style="western">
              <surname>Moon</surname>
              <given-names>YS</given-names>
            </name>
            <name name-style="western">
              <surname>Whang</surname>
              <given-names>SE</given-names>
            </name>
          </person-group>
          <article-title>Cross-lingual transfer learning for medical named entity recognition</article-title>
          <source>Database Systems for Advanced Applications: 25th International Conference, DASFAA 2020, Jeju, South Korea, September 24–27, 2020, Proceedings, Part I</source>
          <year>2020</year>
          <publisher-loc>Cham</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>403</fpage>
          <lpage>418</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Haider</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Mansour</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>End-to-end slot alignment and recognition for cross-lingual NLU</article-title>
          <source>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2020</year>
          <conf-name>Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>November 2020</conf-date>
          <conf-loc>Online</conf-loc>
          <fpage>5052</fpage>
          <lpage>5063</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2020.emnlp-main.410"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2020.emnlp-main.410</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yarowsky</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ngai</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Wicentowski</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Inducing multilingual text analysis tools via robust projection across aligned corpora</article-title>
          <source>Proceedings of the First International Conference on Human Language Technology Research</source>
          <year>2001</year>
          <conf-name>First International Conference on Human Language Technology Research</conf-name>
          <conf-date>March 18-21, 2001</conf-date>
          <conf-loc>San Diego, CA</conf-loc>
          <fpage>1</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.3115/1072133.1072187</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zitouni</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Florian</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Mention detection crossing the language barrier</article-title>
          <source>Proceedings of the Conference on Empirical Methods in Natural Language Processing</source>
          <year>2008</year>
          <conf-name>Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>October 25-27, 2008</conf-date>
          <conf-loc>Honolulu, HI</conf-loc>
          <fpage>600</fpage>
          <lpage>609</lpage>
          <pub-id pub-id-type="doi">10.3115/1613715.1613789</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ni</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dinu</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Florian</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Weakly supervised cross-lingual named entity recognition via effective annotation and representation projection</article-title>
          <source>Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</source>
          <year>2017</year>
          <month>07</month>
          <conf-name>55th Annual Meeting of the Association for Computational Linguistics</conf-name>
          <conf-date>July 2017</conf-date>
          <conf-loc>Vancouver, Canada</conf-loc>
          <fpage>1470</fpage>
          <lpage>1480</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/P17-1135"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/p17-1135</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Neubig</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>NA</given-names>
            </name>
            <name name-style="western">
              <surname>Carbonell</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Neural cross-lingual named entity recognition with minimal resources</article-title>
          <source>Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2018</year>
          <conf-name>Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>October-November 2018</conf-date>
          <conf-loc>Brussels, Belgium</conf-loc>
          <fpage>369</fpage>
          <lpage>379</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/D18-1034"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/d18-1034</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Paranjape</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lipton</surname>
              <given-names>ZC</given-names>
            </name>
          </person-group>
          <article-title>Entity projection via machine translation for cross-lingual NER</article-title>
          <source>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing</source>
          <year>2019</year>
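          <conf-name>Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing</conf-name>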
          <conf-date>November 2019</conf-date>
          <conf-loc>Hong Kong, China</conf-loc>
          <fpage>1083</fpage>
          <lpage>1092</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/D19-1100"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/d19-1100</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schuster</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Cross-lingual transfer learning for multilingual task oriented dialog</article-title>
          <source>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)</source>
          <year>2019</year>
          <conf-name>Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name>
          <conf-date>June 2019</conf-date>
          <conf-loc>Minneapolis, MN</conf-loc>
          <fpage>3795</fpage>
          <lpage>3805</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/N19-1380"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/n19-1380</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hatami</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mitkov</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Corpas Pastor</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Cross-lingual named entity recognition via fastAlign: a case study</article-title>
          <source>Proceedings of the Translation and Interpreting Technology Online Conference</source>
          <year>2021</year>
          <conf-name>Translation and Interpreting Technology Online Conference</conf-name>
          <conf-date>July 2021</conf-date>
          <conf-loc>Online</conf-loc>
          <fpage>85</fpage>
          <lpage>92</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2021.triton-1.10"/>
          </comment>
          <pub-id pub-id-type="doi">10.26615/978-954-452-071-7_010</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ott</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Edunov</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Baevski</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gross</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Grangier</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Auli</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>fairseq: a fast, extensible toolkit for sequence modeling</article-title>
          <source>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics (Demonstrations)</source>
          <year>2019</year>
          <conf-name>Conference of the North American Chapter of the Association for Computational Linguistics</conf-name>
          <conf-date>June 2019</conf-date>
          <conf-loc>Minneapolis, MN</conf-loc>
          <fpage>48</fpage>
          <lpage>53</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/n19-4009</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Yee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Baevski</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ott</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Auli</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Edunov</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Facebook FAIR's WMT19 news translation task submission</article-title>
          <source>Proceedings of the Fourth Conference on Machine Translation (Volume 2: Shared Task Papers, Day 1)</source>
          <year>2019</year>
          <month>08</month>
          <conf-name>Fourth Conference on Machine Translation</conf-name>
          <conf-date>August 2019</conf-date>
          <conf-loc>Florence, Italy</conf-loc>
          <fpage>314</fpage>
          <lpage>319</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/w19-5333</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dyer</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chahuneau</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>NA</given-names>
            </name>
          </person-group>
          <article-title>A simple, fast, and effective reparameterization of IBM model 2</article-title>
          <source>Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2013</year>
          <conf-name>Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name>
          <conf-date>June 2013</conf-date>
          <conf-loc>Atlanta, GA</conf-loc>
          <fpage>644</fpage>
          <lpage>648</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Svenstrup</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hansen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Winther</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Hash embeddings for efficient word representations</article-title>
          <source>Advances in Neural Information Processing Systems 30</source>
          <year>2017</year>
          <conf-name>31st Conference on Neural Information Processing Systems</conf-name>
          <conf-date>December 4-9, 2017</conf-date>
          <conf-loc>Long Beach, CA</conf-loc>
          <fpage>4935</fpage>
          <lpage>4943</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="web">
          <article-title>GERNERMED—an open German medical NER model</article-title>
          <source>GitHub</source>
          <year>2022</year>
          <access-date>2023-01-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/frankkramer-lab/GERNERMED">https://github.com/frankkramer-lab/GERNERMED</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
