<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v7i1e47434</article-id>
      <article-id pub-id-type="pmid">37594844</article-id>
      <article-id pub-id-type="doi">10.2196/47434</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>A Deep Learning Model for the Normalization of Institution Names by Multisource Literature Feature Fusion: Algorithm Development Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Hu</surname>
            <given-names>Danqing</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhao</surname>
            <given-names>Peng</given-names>
          </name>
          <degrees>PhD</degrees>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Yifei</given-names>
          </name>
          <degrees>MA</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0002-9393-6299</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Xiaoying</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4407-6616</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Aihua</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6742-3268</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Yongjie</given-names>
          </name>
          <degrees>MA</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0004-6306-8288</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>Xuemei</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2927-4166</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Lin</surname>
            <given-names>Ziluo</given-names>
          </name>
          <degrees>MA</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8384-1275</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>Shirui</given-names>
          </name>
          <degrees>MA</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5533-0010</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Tang</surname>
            <given-names>Xiaoli</given-names>
          </name>
          <degrees>MA</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Institute of Medical Information</institution>
            <institution>Chinese Academy of Medical Sciences</institution>
            <addr-line>69, Dongdan North Street</addr-line>
            <addr-line>Beijing, 100005</addr-line>
            <country>China</country>
            <phone>86 10 52328902</phone>
            <email>tang.xiaoli@imicams.ac.cn</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6946-3482</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Institute of Medical Information</institution>
        <institution>Chinese Academy of Medical Sciences</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Xiaoli Tang <email>tang.xiaoli@imicams.ac.cn</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>18</day>
        <month>8</month>
        <year>2023</year>
      </pub-date>
      <volume>7</volume>
      <elocation-id>e47434</elocation-id>
      <history>
        <date date-type="received">
          <day>21</day>
          <month>3</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>4</day>
          <month>5</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>24</day>
          <month>5</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>16</day>
          <month>6</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Yifei Chen, Xiaoying Li, Aihua Li, Yongjie Li, Xuemei Yang, Ziluo Lin, Shirui Yu, Xiaoli Tang. Originally published in JMIR Formative Research (https://formative.jmir.org), 18.08.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2023/1/e47434" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The normalization of institution names is of great importance for literature retrieval, statistics of academic achievements, and evaluation of the competitiveness of research institutions. Differences in authors’ writing habits and spelling mistakes lead to various names of institutions, which affects the analysis of publication data. With the development of deep learning models and the increasing maturity of natural language processing methods, training a deep learning–based institution name normalization model can increase the accuracy of institution name normalization at the semantic level.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to train a deep learning–based model for institution name normalization based on the feature fusion of affiliation data from multisource literature, which would realize the normalization of institution name variants with the help of authority files and achieve a high specification accuracy after several rounds of training and optimization.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>In this study, an institution name normalization–oriented model was trained based on bidirectional encoder representations from transformers (BERT) and other deep learning models, including the institution classification model, institutional hierarchical relation extraction model, and institution matching and merging model. The model was then trained to automatically learn institutional features by pretraining and fine-tuning, and institution names were extracted from the affiliation data of 3 databases (Dimensions, Web of Science, and Scopus) to complete the normalization process.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>It was found that the trained model could achieve at least 3 functions. First, the model could identify the institution name that is consistent with the authority files and associate the name with the files through the unique institution ID. Second, it could identify the nonstandard institution name variants, such as singular forms, plural changes, and abbreviations, and update the authority files. Third, it could identify the unregistered institutions and add them to the authority files, so that when the institution appeared again, the model could identify and regard it as a registered institution. Moreover, the test results showed that the accuracy of the normalization model reached 93.79%, indicating the promising performance of the model for the normalization of institution names.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The deep learning–based institution name normalization model trained in this study exhibited high accuracy. Therefore, it could be widely applied in the evaluation of the competitiveness of research institutions, analysis of research fields of institutions, and construction of interinstitutional cooperation networks, among others, showing high application value.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>multisource literature</kwd>
        <kwd>institution name normalization</kwd>
        <kwd>deep learning</kwd>
        <kwd>bidirectional encoder representations from transformers</kwd>
        <kwd>BERT</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>The institution name is essential for describing the scientific research entity in scientific and technical literature. It is not only a key entrance for literature retrieval but also a vital statistical unit for the statistics of academic achievements and evaluation of the influence and competitiveness of scientific research institutions. However, differences in writing habits, spelling mistakes, space variations, use of abbreviations, etc, result in various name variants of the same institution in different publications, which may influence the analysis results of the publication data. It has been shown that Iranian universities fall behind in global rankings just because the authors used the nonstandard names of universities [<xref ref-type="bibr" rid="ref1">1</xref>]. Consequently, the normalization of institution names has become a key issue in academic research.</p>
        <p>The key to the normalization of institution names is the calculation of the similarity between different institution names. In most early studies, people often determined whether 2 different names refer to the same institution based on the literal similarity by 2 commonly used methods, namely, Edit Distance [<xref ref-type="bibr" rid="ref2">2</xref>] and Jaccard Index [<xref ref-type="bibr" rid="ref3">3</xref>]. Such methods may group 2 institutions with similar names together because of the high literal similarity. In recent studies, scholars have focused on the measurement of the semantic similarity of institution names and the recognition of institution name variants using machine learning and deep learning, which greatly improve the accuracy of institution name normalization [<xref ref-type="bibr" rid="ref4">4</xref>].</p>
        <p>In this study, a model for institution name normalization was trained by the fusion of multisource literature features based on deep learning to achieve the normalized management of institution names. We developed 4 submodels: an institution classification model, institutional hierarchical relation extraction model, zip code extraction model, and institution matching and merging model. On the basis of the results of the model normalization process, rules were established to enhance the model’s performance. Consequently, the model is capable of processing various types of affiliation data and can be effectively used in multiple domains. The performance of our model was found to be highly promising, with an overall accuracy rate of 93.79%, a recall rate of 93.08%, and an <italic>F</italic><sub>1</sub>-score of 93.43%.</p>
        <p>The focus of this study is threefold:</p>
        <list list-type="order">
          <list-item>
            <p>To correctly identify the institution name that matches the authority files and associate the name with the files by assigning the appropriate unique institution ID</p>
          </list-item>
          <list-item>
            <p>To correctly recognize the name variants of the registered institutions, such as single and plural forms, the text transform of words, and the abbreviation of the institution names, and normalize them to canonical names</p>
          </list-item>
          <list-item>
            <p>To correctly identify an unregistered institution and add relevant information to the authority files. When the institution appears again, the model treats it as a registered institution</p>
          </list-item>
        </list>
        <p>This study innovates in the following aspects:</p>
        <list list-type="order">
          <list-item>
            <p>The model for institution name normalization was developed based on bidirectional encoder representations from transformers (BERT) and other deep learning models, which could be used to analyze and process the institutional data from multiple sources (Dimensions, Scopus, and Web of Science [WoS]) and construct authority files to normalize the institution name at the semantic level.</p>
          </list-item>
          <list-item>
            <p>In this study, the entire process of institution name normalization was sorted out in detail, and the model for institution name normalization was designed, which includes submodels such as the classification model, hierarchical relation extraction model, and matching and merging model. These models work together to address the matching problem caused by the differences in the text transform and punctuation in the institution name and realize the specification of institution names in the complete sense. Thus, this method is worthy of widespread application and promotion.</p>
          </list-item>
          <list-item>
            <p>Most of the existing studies on institution name normalization focus on coarse-grained normalization, that is, only the top-level institution names are regulated. This study achieved a fine-grained level of institution name normalization by extracting institution hierarchical relationships, which would regulate the institution names at each level, and the relationships between superior and subordinate institutions, making literature retrieval and statistics more convenient and faster.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Related Works</title>
        <sec>
          <title>Development of Institution Name Normalization</title>
          <p>Numerous scholars have conducted research on institution name normalization and achieved promising performance. According to the key techniques for institution name normalization, these methods can be divided into 4 groups: the string similarity matching–based method, rule-based method, machine learning–based method, and deep learning–based method.</p>
          <p>The string similarity matching–based method, dominated by the Edit Distance and Jaccard Index, has been applied to determine whether 2 names represent the same institution by comparing their literal similarity, thereby realizing the identification of 1 institution despite its different names. Edit Distance has been used to calculate differences in the strings of institution names and to recognize spelling mistakes and other name variants since 1997 [<xref ref-type="bibr" rid="ref2">2</xref>]. Subsequently, James et al [<xref ref-type="bibr" rid="ref3">3</xref>] further improved the clustering algorithm by combining the Jaccard Index and Edit Distance and clustered similar strings by replacing each string in the cluster with a standard form to achieve name normalization. Jacob et al [<xref ref-type="bibr" rid="ref5">5</xref>] designed the sCooL system, which can be used to normalize institution names in the resumes using Levenshtein, N-gram, Jaccard similarity, and other algorithms. However, some issues cannot be addressed by relying solely on the string similarity matching–based method. For example, similar names of different institutions may easily cause mismatching.</p>
          <p>The principle of the rule-based method is to manually build the rules using the author’s address information, institution attributes, and other features to recognize possible matching institution names based on the rules and eliminate the wrong matching pairs according to the threshold. To eliminate the limitations of string similarity matching, Huang et al [<xref ref-type="bibr" rid="ref6">6</xref>] built a rule-based institution name disambiguation algorithm using data mining technology, which has reached a high precision in mathematics, psychology, and other fields. Huang et al [<xref ref-type="bibr" rid="ref7">7</xref>] also established institution name matching rules by introducing the unique identifier of the institution and determined whether 2 names represent the same institution by comparing the Global Research Identifier Database ID, International Standard Name ID, and other identifiers. Some institutions have also conducted corresponding research. For example, the bibliometric database built by the Swedish Research Council based on WoS according to the rules was subjected to affiliation disambiguation for the matching of institutions, cities, addresses, etc. These rules can be used to process more than 99% of the Swedish address strings [<xref ref-type="bibr" rid="ref8">8</xref>]. Generally, this type of method has poor portability owing to the manually developed rules.</p>
          <p>Various machine learning models have been trained for institution name normalization and name disambiguation, such as naive Bayes, support vector machine (SVM), and K-means clustering. Han et al [<xref ref-type="bibr" rid="ref9">9</xref>] compared the disambiguation effects of naive Bayes and SVM on the ambiguity caused by name abbreviations and spelling mistakes in different scenarios. Balsmeier et al [<xref ref-type="bibr" rid="ref10">10</xref>] disambiguated the inventor’s name in the patent database using the K-means clustering algorithm.</p>
          <p>Compared with the machine learning–based methods involving much human labor, the deep learning–based models may automatically learn the institutional features through less annotated data. He et al [<xref ref-type="bibr" rid="ref11">11</xref>] established a deep neural network–based entity disambiguation model, which was used to measure the entity similarity in combination with the context and optimize the representations of entities and documents, thus achieving promising disambiguation results. With regard to the neglect of the word order in text by the Bag of Word model, Phan et al [<xref ref-type="bibr" rid="ref4">4</xref>] firstly used long short-term memory (LSTM) and attention mechanism for entity disambiguation. Jiang et al [<xref ref-type="bibr" rid="ref12">12</xref>] developed a Dual-Channel Hybrid Network model by fusing a convolutional neural network (CNN) model and a capsule model with a self-attention mechanism, outperforming the mainstream deep learning model in Chinese short text disambiguation. Entity normalization and disambiguation based on deep learning models have higher accuracy and recall rates than traditional methods and therefore deserve more research attention.</p>
        </sec>
        <sec>
          <title>Deep Learning Models</title>
          <p>On the basis of the initial success of machine learning, deep learning models have been widely used in speech recognition, image classification and search, natural language processing (NLP), and other fields. Deep learning models can learn features from the training data and exhibit a stronger data processing ability [<xref ref-type="bibr" rid="ref13">13</xref>]. The institution name normalization mentioned in this study actually refers to the completion of NLP tasks using deep learning models. Common models mainly include CNN, recurrent neural network (RNN), LSTM, and BERT models. Previous developmental studies have demonstrated that both CNN and RNN models have showed their respective advantages in emotion analysis, entity recognition, part-of-speech tagging, and other NLP tasks [<xref ref-type="bibr" rid="ref14">14</xref>]. Initially, the CNN model was primarily used for image processing and then applied to effectively mine the semantic information of the context [<xref ref-type="bibr" rid="ref15">15</xref>], which is a typical NLP task. With a relatively simple internal architecture and low requirements for computing resources, the RNN model cannot effectively deal with the correlation between long sequences and thus easily causes a vanishing gradient or exploding gradient [<xref ref-type="bibr" rid="ref16">16</xref>]. Therefore, the gating mechanism of LSTM was proposed to address the gradient issues.</p>
          <p>In this study, we trained the normalization model of institution names developed on BERT, which is a pretrained language model that has been broadly used for entity extraction, text classification, emotion analysis, and other NLP tasks with the best performance [<xref ref-type="bibr" rid="ref17">17</xref>]. With the architecture of a bidirectional transformer [<xref ref-type="bibr" rid="ref18">18</xref>], BERT can complete parallel computing and thus greatly improve the operation efficiency. In addition, the core mechanism of the transformer, the self-attention mechanism, enables the BERT model to pay more attention to the valuable information among the input data and assign different weights to the words by fully learning contextual features [<xref ref-type="bibr" rid="ref19">19</xref>]. The BERT model has been broadly applied to named entity recognition (NER) tasks and provides promising results [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Commonly used pretrained language models include generative pretrained transformer, Global Vectors for Word Representation (GloVe), and Embeddings from Language Models. As the BERT model is state-of-the-art in many NLP tasks [<xref ref-type="bibr" rid="ref22">22</xref>] and the study [<xref ref-type="bibr" rid="ref23">23</xref>] shows that the BERT-based model is easily fine-tuned and can be applied to any form of entity name normalization in the biomedical field, it was adopted in this study. It could empower this study and enable the institution name normalization model to achieve higher accuracy.</p>
        </sec>
        <sec>
          <title>NER Tasks</title>
          <p>NER is a common NLP task. Initially, NER was mainly used for string matching based on dictionaries and manual rules, but it showed poor feasibility and portability when dealing with complex data [<xref ref-type="bibr" rid="ref24">24</xref>]. With the deepening of research, machine learning–based methods have been gradually adopted in NER, such as Hidden Markov Model [<xref ref-type="bibr" rid="ref25">25</xref>], conditional random field (CRF) [<xref ref-type="bibr" rid="ref26">26</xref>], and SVM [<xref ref-type="bibr" rid="ref27">27</xref>] models, but they require much manual work. Deep learning methods are widely used in NER tasks. Hammerton et al [<xref ref-type="bibr" rid="ref28">28</xref>] first used LSTM to complete NER tasks with data from the Reuters Corpus and the European Corpus Initiative Multilingual Corpus, and Peng et al [<xref ref-type="bibr" rid="ref29">29</xref>] proposed the LSTM-CRF model on the basis of improved existing models and methods, which has significantly optimized the word segmentation effect compared with traditional methods. Lample et al [<xref ref-type="bibr" rid="ref30">30</xref>] proposed a neural network model combining bidirectional LSTM (BiLSTM) and CRF, and the context sequence information can be obtained with this bidirectional structure that has been widely used in NER tasks. To further recognize the overlapping nested entities in sentences, span-based methods have been proposed in the NER field. For example, Mandar et al [<xref ref-type="bibr" rid="ref31">31</xref>] proposed a pretrained model, SpanBERT, which was experimentally demonstrated to generally outperform the BERT model in terms of relation extraction and coreference resolution. The extraction of institutional hierarchical relationships is based on the accurate identification of institution names. In this study, we used the NER model to construct the institutional hierarchical relation extraction model.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Ethical Considerations</title>
        <p>Ethics approval was not required because the data processed in this study are publicly available literature data, including article titles, institutional addresses, institution names, etc, with no user privacy involved.</p>
      </sec>
      <sec>
        <title>Study Overview</title>
        <p>To improve the effectiveness of institutional name normalization, we proposed a 4-stage institutional name normalization model applicable to multiple sources and types of literature data. In this study, the entire process involves 4 models: the institution classification model, institutional hierarchical relation extraction model, institution zip code extraction model, and institution matching and merging model. The research route is shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>, and the process is as follows: first, the affiliation data were obtained from multisource literature databases; second, information about the institution, including the name, address, and hierarchical relation, was extracted from the affiliation data using the BERT model; and third, the institution data were clustered and merged through the clustering algorithm and then compared with the normalized institution names in authority files. This process not only helped realize the normalization of different names and unify the identification of the same institution but also contributed to the update and improvement of the authority files of the institution.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Research route. BERT: bidirectional encoder representations from transformers; DBSCAN: Density-Based Spatial Clustering of Applications with Noise; GloVe: Global Vectors for Word Representation; ISO: International Organization for Standardization.</p>
          </caption>
          <graphic xlink:href="formative_v7i1e47434_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Analysis of Multisource Literature Features</title>
        <p>The model for institution name normalization developed on deep learning in this study could be used to simultaneously process the affiliation data from the Dimensions, WoS, and Scopus databases, greatly enhancing its practical application value in institution name normalization. Specifically, the literature data were collected from Dimensions, WoS, and Scopus (including 49 fields) at the same time as per the literature collection requirements, and data from different sources had different field types (the fields required by institution name normalization are indicated in italics in <xref ref-type="table" rid="table1">Table 1</xref>). In addition, data filtering was performed to eliminate duplicate and invalid data, and the fields required by institution name normalization were filtered for cleaning again to obtain the initial data set of the institution name normalization.</p>
        <p>Difficulties in multisource data processing were mainly reflected in two aspects: (1) the types and quantities of data fields from different sources were nonuniform, which increased the difficulty of analysis, and (2) we randomly selected a field. Even if the field types of data were the same, the presentation forms of the different databases were quite different. Taking the article “Model of Ischemic Heart Disease and Video-Based Comparison of Cardiomyocyte Contraction Using hiPSC-Derived Cardiomyocytes” as an example, we randomly selected the authors’ affiliations as the comparison field. The content of the field from Dimensions is represented as “Yun, Liu(Okayama University); Yin, Liang(Okayama University); Mengxue, Wang(Okayama University); Chen, Wang(Okayama University); Heng, Wei(Kyoto University); Keij, Naruse(Okayama University); Ken, Takahashi(Okayama University),” whereas the content of the same field from WoS is represented as “[Liu, Yun; Liang, Yin; Wang, Mengxue; Wang, Chen; Naruse, Keiji; Takahashi, Ken] Okayama Univ, Dept Cardiovasc Physiol, Grad Sch Med Dent &#38; Pharmaceut Sci, Okayama, Japan; [Heng, Wei] Kyoto Univ, Inst Lab Anim, Grad Sch Med, Kyoto, Japan,” and the content from Scopus is represented as “Liu, Y., Department of Cardiovascular Physiology, Graduate School of Medicine, Dentistry and Pharmaceutical Sciences, Okayama University, Japan; Liang, Y., Department of Cardiovascular Physiology, Graduate School of Medicine, Dentistry and Pharmaceutical Sciences, Okayama University, Japan; Wang, M., Department of Cardiovascular Physiology, Graduate School of Medicine, Dentistry and Pharmaceutical Sciences, Okayama University, Japan; Wang, C., Department of Cardiovascular Physiology, Graduate School of Medicine, Dentistry and Pharmaceutical Sciences, Okayama University, Japan; Wei, H., Institute of Laboratory Animals, Graduate School of Medicine, Kyoto University, Japan; Naruse, K., Department of Cardiovascular Physiology, Graduate 
School of Medicine, Dentistry and Pharmaceutical Sciences, Okayama University, Japan; Takahashi, K., Department of Cardiovascular Physiology, Graduate School of Medicine, Dentistry and Pharmaceutical Sciences, Okayama University, Japan.” Upon observing these 3 fields, it is evident that there are significant differences in their contents. As the reliability of the data from the 3 databases differed, the priority was set as Scopus, followed by Dimensions and WoS. When an article was included in multiple databases at the same time, the model further processed the article according to the priority ranking.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Multisource literature fields.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="110"/>
            <col width="890"/>
            <thead>
              <tr valign="top">
                <td>Source</td>
                <td>Fields</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Dimensions</td>
                <td><italic>WoS</italic><sup>a</sup><italic>, DOI</italic><sup>b</sup><italic>, PMID</italic><sup>c</sup><italic>, PMCID</italic><sup>d</sup><italic>, Title, Abstract, Authors-D, Authors (Raw Affiliation), Corresponding Authors, Authors Affiliations, Research Organizations—standardized, GRID</italic><sup>e</sup><italic> IDs, City of Research organization, State of Research organization, Country of Research organization,</italic> Acknowledgments, Source Title, Publisher, PubYear, MeSH<sup>f</sup> terms, Publication Date (web), Publication Date (print), Publication Type, Funder, Funder Group, Funder Country, UIDs<sup>g</sup> of Supporting Grants, Supporting Grants, and Times Cited</td>
              </tr>
              <tr valign="top">
                <td>Scopus</td>
                <td><italic>Authors-S (Authors–Abbreviated Source), Authors with affiliations, Correspondence Address, Affiliations, Authors ID,</italic> Funding Text 1, and Document Type</td>
              </tr>
              <tr valign="top">
                <td>WoS</td>
                <td><italic>AF</italic><sup>h</sup><italic>, AU</italic><sup>i</sup><italic>, C1</italic><sup>j</sup><italic>, RP</italic><sup>k</sup><italic>,</italic> RI<sup>l</sup>, OI<sup>m</sup>, DT<sup>n</sup>, LA<sup>o</sup>, FU<sup>p</sup>, PI<sup>q</sup>, PA<sup>r</sup>, WC<sup>s</sup>, SC<sup>t</sup>, and UT<sup>u</sup></td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>WoS: Web of Science; the fields required by institution name normalization are indicated in italics.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>DOI: digital object unique identifier.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>PMID: PubMed unique identifier.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>PMCID: PubMed central identifier.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>GRID: Global Research Identifier Database.</p>
            </fn>
            <fn id="table1fn6">
              <p><sup>f</sup>MeSH: Medical Subject Headings.</p>
            </fn>
            <fn id="table1fn7">
              <p><sup>g</sup>UIDs: unique identifiers.</p>
            </fn>
            <fn id="table1fn8">
              <p><sup>h</sup>AF: author full name.</p>
            </fn>
            <fn id="table1fn9">
              <p><sup>i</sup>AU: author.</p>
            </fn>
            <fn id="table1fn10">
              <p><sup>j</sup>C1: author address.</p>
            </fn>
            <fn id="table1fn11">
              <p><sup>k</sup>RP: reprint address.</p>
            </fn>
            <fn id="table1fn12">
              <p><sup>l</sup>RI: Researcher ID.</p>
            </fn>
            <fn id="table1fn13">
              <p><sup>m</sup>OI: ORCID identifier.</p>
            </fn>
            <fn id="table1fn14">
              <p><sup>n</sup>DT: document type.</p>
            </fn>
            <fn id="table1fn15">
              <p><sup>o</sup>LA: language.</p>
            </fn>
            <fn id="table1fn16">
              <p><sup>p</sup>FU: funding agency and grant number.</p>
            </fn>
            <fn id="table1fn17">
              <p><sup>q</sup>PI: publisher city.</p>
            </fn>
            <fn id="table1fn18">
              <p><sup>r</sup>PA: publisher address.</p>
            </fn>
            <fn id="table1fn19">
              <p><sup>s</sup>WC: Web of Science categories.</p>
            </fn>
            <fn id="table1fn20">
              <p><sup>t</sup>SC: research areas.</p>
            </fn>
            <fn id="table1fn21">
              <p><sup>u</sup>UT: accession number.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Deep Learning Model for Institution Name Normalization</title>
        <p>The complete architecture of the institution name normalization model is shown in <xref rid="figure2" ref-type="fig">Figure 2</xref> and is explained in detail in the following 4 parts of this section.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Models’ architecture. BERT: bidirectional encoder representations from transformers; CRF: conditional random fields; DBSCAN: Density-Based Spatial Clustering of Applications with Noise; GloVe: Global Vectors for Word Representation.</p>
          </caption>
          <graphic xlink:href="formative_v7i1e47434_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <sec>
          <title>Institution Classification Model</title>
          <p>The affiliation data from the literature database contain a large number of institution names and attribute information, and it is difficult for the machine to distinguish institutional information from noninstitutional information. Therefore, the accurate extraction of institutional names is the first step in institution name normalization. Owing to the complex and variable forms of addresses extracted from the affiliation data, this field often contains multitier institutions and detailed addresses of the institutions (including the information on street, zip code, region, and country). For example, in “Division of Biology and Biological Engineering, California Institute of Technology, 1200 E. California Blvd., MC 114-96, Pasadena, CA 91125, USA,” it is difficult to recognize the institution name accurately according to the general rules. Considering that the specific keywords and syntactic structures of institution names differ from the detailed address information, a supervised learning–based classifier model was developed in this study to recognize the address information as institutional phrases and noninstitutional phrases through automatic learning of institutional features, so as to extract institution names.</p>
          <p>The accurate separation of institutional and detailed address information (noninstitutional part) from the original address is essentially a binary classification problem, specifically for short texts. In the process of institution name recognition, a machine’s understanding of word semantics is crucial. The institution classification model consists of a pretrained BERT model, transformer, and fully connected classifier. Considering that the institution name usually appears before the noninstitutional information (detailed address) in the full address field, the model uses a reverse classification method to segment the original address. That is, it identifies each word in reverse order until it reads a word that represents institutional information and then stops classifying. At this point, the first half of the address is considered to be institutional information and the second half is considered to be noninstitutional information. The BERT [<xref ref-type="bibr" rid="ref32">32</xref>] model is a deep learning–based language representation model and can provide richer semantic information of words (especially keywords), such as “Institute,” “Department,” and “University,” by virtue of its multilayer transformer and the ability of converting the input texts into word vector representation. The transformer encodes the word order and background information of a word in a phrase. For example, the institutional hierarchical order and the differences between institutional and noninstitutional features can be encoded by the transformer. At present, the commonly used encoders include transformer [<xref ref-type="bibr" rid="ref18">18</xref>], BiLSTM [<xref ref-type="bibr" rid="ref30">30</xref>], and CNN [<xref ref-type="bibr" rid="ref33">33</xref>]. The transformer incorporates a self-attention mechanism that focuses on the valuable information among the input data, which are beneficial for encoding important words; thus, it was adopted as an encoder herein. 
The output result from the transformer is mapped into a dichotomy by a fully connected classifier, with 0 representing noninstitution and 1 representing institution, thereby achieving the separation of institution names and detailed addresses.</p>
          <p>The training process of the model is as follows:</p>
          <list list-type="order">
            <list-item>
              <p>According to the regular rules, the institution affiliation data collected and cleaned will be divided into positive samples (institutional phrases) and negative samples (noninstitutional phrases such as detailed address, region, and zip code) and used as the manual annotation corpus required for the training. The data size of corpus is 33,379, with 22,927 positive samples and 10,452 negative samples.</p>
            </list-item>
            <list-item>
              <p>The annotated corpus was divided into training, validation, and testing sets in a ratio of 8:1:1. Then, the training set was entered into the model to enable the model to automatically learn institutional features, by which a trained institution classification model could be obtained. After many rounds of training and optimization, the precision of the model can reach &#62;94%, indicating that it can be used to accurately distinguish the institution name and address.</p>
            </list-item>
          </list>
          <p>After training, when a phrase was entered, such as “California Institute of Technology,” the model returned “True” (institution) or “False” (noninstitution), and when an original address was entered, the model split it into detailed address and institution information. The operating effects of the institutional classification model are shown in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Model classification results.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="390"/>
              <col width="230"/>
              <col width="380"/>
              <thead>
                <tr valign="top">
                  <td>Original address field</td>
                  <td>Address information</td>
                  <td>Institution information</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Institute of Hematology, General Medical Center, Blood Diseases Hospital, Chinese Academy of Medical Sciences and Peking Union Medical College, Tianjin, China</td>
                  <td>Tianjin, China</td>
                  <td>Institute of Hematology, General Medical Center, Blood Diseases Hospital, Chinese Academy of Medical Sciences and Peking Union Medical College</td>
                </tr>
                <tr valign="top">
                  <td>Medical Research Council Population Health Research Unit, Nuffield Department of Population Health, University of Oxford, Oxford, United Kingdom</td>
                  <td>Oxford, United Kingdom</td>
                  <td>Medical Research Council Population Health Research Unit, Nuffield Department of Population Health, University of Oxford</td>
                </tr>
                <tr valign="top">
                  <td>Department of Epidemiology, Peking University Health Science Center, Beijing, China</td>
                  <td>Beijing, China</td>
                  <td>Department of Epidemiology, Peking University Health Science Center</td>
                </tr>
                <tr valign="top">
                  <td>Institute of Population Health Sciences, Queen Mary University of London, London, United Kingdom</td>
                  <td>London, United Kingdom</td>
                  <td>Institute of Population Health Sciences, Queen Mary University of London</td>
                </tr>
                <tr valign="top">
                  <td>Department of Prosthodontics, Peking University School and Hospital of Stomatology and National Clinical Research Center for Oral Diseases and National Engineering Laboratory for Digital and Material Technology of Stomatology and Beijing Key Laboratory of Digital Stomatology, 22 Zhongguancun South Avenue, Haidian District, Beijing, 100081, China</td>
                  <td>22 Zhongguancun South Avenue, Haidian District, Beijing, 100081, China</td>
                  <td>Department of Prosthodontics, Peking University School and Hospital of Stomatology and National Clinical Research Center for Oral Diseases and National Engineering Laboratory for Digital and Material Technology of Stomatology and Beijing Key Laboratory of Digital Stomatology</td>
                </tr>
                <tr valign="top">
                  <td>Liuyang Center for Disease Control and Prevention, Liuyang, Hunan Province, China</td>
                  <td>Liuyang, Hunan Province, China</td>
                  <td>Liuyang Center for Disease Control and Prevention</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
        <sec>
          <title>Institutional Hierarchical Relation Extraction Model</title>
          <p>Hierarchical relation is embedded in the institution information. For example, in “Medical Research Council Population Health Research Unit, Nuffield Department of Population Health, University of Oxford,” the “University of Oxford” is the first-tier institution, “Nuffield Department of Population Health” is the second, and “Medical Research Council Population Health Research Unit” is the third. In this study, an institutional hierarchical relation extraction model was constructed with a structure of the NER model plus rules plus hierarchical tree to maximize the effectiveness from multiple perspectives, such as rules and semantics, and normalize institutions at a finer granularity. First, the NER model sequentially labels the original address field with the Beginning Inside and Outside method. Once the labeling is completed, the deep learning model identifies the distinguishing features of institutions across different levels and provides an output for the labeled address field. At this stage, the institution-related information in the original address is labeled based on the primary, secondary, and tertiary levels. To ensure the precise extraction of hierarchical structures, this study also incorporates rules and hierarchical trees, which will be further explained in the latter part of this section.</p>
          <p>It is worth noting that most existing techniques for institution name normalization and entity disambiguation operate at a broad level, with fewer studies focusing on the subdivision of institutional hierarchies. In this study, the model not only normalizes the names of institutions at each level but also finely organizes the hierarchical relationships among primary, secondary, and tertiary institutions, which is convenient for researchers to use the hierarchical structure of institutions for literature retrieval and statistics.</p>
          <p>The X-CRF framework was adopted as the main model for institution extraction, as this task does not involve the issue of span overlapping. The BiLSTM-CRF and BERT-CRF models were used in the specific experiments for the institutional NER. The experimental results showed that the effect of the BiLSTM model was restricted by the training data, resulting in inaccurate labels predicted by the model on the one hand and out of vocabulary when a word was used as a token on the other hand. Therefore, in this study, a large-scale pretrained BERT model with word pieces as its token was adopted to obtain text representation, which alleviated out of vocabulary problems to some extent. The institutional hierarchical relation in the address was extracted using the BERT-CRF model, and the institutional span and label were obtained. However, there might be span deviation, label error, institutional deficiencies and redundancies, or other problems in the extraction results. Therefore, in this study, the model’s prediction results were corrected primarily through the use of rules and hierarchical trees. The formulated rules include the institution name completion rule and the institutional order correction rule. The former determines whether the extracted content is a complete field, typically by checking whether it ends with a comma. For instance, the institution name is “University of Antwerp (Campus Drie Eiken),” but the NER model only extracts “University of Antwerp.” Meanwhile, the latter corrects the institutional hierarchical order in original addresses that are not standardized, such as “Queen Mary University of London, Institute of Population Health Sciences,” by adjusting the order according to the characteristic words of each level of institutions. Once the rules were manually established, they were incorporated into the model. 
Finally, for the special hierarchical relationships that cannot be identified by the machine, the institutional hierarchical relationships are imported into the model by constructing institutional hierarchical trees to complement the extraction results of the model. The data size in this study was small; therefore, the hierarchical tree was constructed using the open Neo4j database to correct the institutional hierarchical order. The constructed hierarchical relationship is shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>.</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Hierarchical data construction using the Neo4j database.</p>
            </caption>
            <graphic xlink:href="formative_v7i1e47434_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Zip Code Extraction Model</title>
          <p>The zip code extraction model was applied to extract the zip code from the address information and integrate it with other affiliation data. In the process of canonical information matching, the zip code information plays an auxiliary role in determination. As mentioned in the section of <italic>Institution Classification Model</italic> before, the institution classification model divides the original address into 2 parts: institutional information and noninstitutional information. The noninstitutional information may contain city, zip code, administrative division, and country. First, the model reads the noninstitutional information in reverse order with the help of the International Organization for Standardization (ISO) standard; eliminates the country, city, administrative division, etc; and then manually researches the coding rules of zip codes of different countries (eg, the zip code of China consists of 6 consecutive digits), builds a zip code rule base, and extracts the zip code from the remaining information, which has been tested to be more accurate than directly extracting the zip code. The specific process for extracting zip code is as follows:</p>
          <list list-type="order">
            <list-item>
              <p>High-quality websites, including those with more complete zip codes, were filtered and selected through manual searches.</p>
            </list-item>
            <list-item>
              <p>The structure of the target website was analyzed, and zip code data were obtained using requests+concurrent futures, asynchronous programming, or other web crawler techniques. Then, the data obtained by the web crawler were cleaned, and invalid data were eliminated by lxml, BeautifulSoup, and regular expressions to obtain effective data for the construction of the zip code library.</p>
            </list-item>
            <list-item>
              <p>As the zip code rules of each country are different, a country+zip code rule base was constructed manually to split the data after cleaning and to obtain the retrieval vector. Next, country information in the retrieval vector was obtained from the national database, zip code rules were obtained based on the country information, and zip codes were finally obtained according to the rules.</p>
            </list-item>
          </list>
          <p>The extraction results are shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The zip code rules for China, the United States, the United Kingdom, and Germany were included in the database; thus, the zip code extraction model achieved an accuracy of almost 100%.</p>
        </sec>
        <sec>
          <title>Institution Matching and Merging Model</title>
          <p>The institution name extracted from the institution address will be compared and matched with the name in authority files, which are computer-identifiable documents consisting of specification records. If the name matches the institution name in the authority files, then the institution will become a registered institution, and its specification information, such as the new alias, zip code, and official website, can be completed in authority files; otherwise, the institution is unregistered and will be included in the authority files as a new institution after manual review. In the process of matching and merging, the recall rate using traditional matching methods is relatively low because of the differences in text transform, punctuation, and word expression order among the institution names. In this study, the institution matching and merging model was developed using country grouping and vector clustering to improve the matching recall rate and reduce the running time of the model. The specific process is as follows: all registered and unregistered institutions are divided by countries, and rules are established for the preliminary processing of unregistered institutions, which include converting the names to lowercase for text vectorization, and eliminating nonalphabetic characters other than “&#38;” in names and converting “&#38;” to “and.” The GloVe model was used to transform the processed data into vectors, and the Density-Based Spatial Clustering of Applications with Noise (DBSCAN) algorithm was then used to cluster institution names under the same country. Subsequently, the new unregistered organizations are added or aliases of registered organizations are included in the authority files based on the clustering results. This operation can achieve multithreaded matching of different countries, greatly improving the matching efficiency.</p>
          <p>Text representation is a critical step in the text clustering process, and the method and effect of text representation have a great influence on the model clustering effect. The most commonly used methods in text representation include bag of word and term frequency–inverse document frequency, which perform well in classification and clustering tasks; however, there are still some problems, such as an extremely high vector dimension, sparse data, failure to focus on the word order in sentences, and failure to learn text semantic information [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. Multiple word embedding representation methods have been developed to overcome this limitation, such as Word2Vec [<xref ref-type="bibr" rid="ref36">36</xref>], GloVe [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>], and Embeddings from Language Models [<xref ref-type="bibr" rid="ref39">39</xref>], which can effectively address the semantic problems of words in the text. In this study, GloVe was applied for text representation owing to its advantages of high accuracy and a short training period.</p>
          <p>The classical clustering algorithms include K-means, Gaussian mixture model, and DBSCAN. For the K-means and Gaussian mixture models, the number of clusters must be set manually, and the same institution cannot be clustered automatically. However, the DBSCAN algorithm enables rapid clustering and requires no manual setting for the number of clusters [<xref ref-type="bibr" rid="ref40">40</xref>]. Moreover, the fault tolerance rate of the model can be reduced by setting a shorter intercluster distance when the DBSCAN algorithm is used. Therefore, the DBSCAN algorithm was applied in this study. The normalization of institution names was achieved using vector clustering. When an institution name vector cluster contains both registered and unregistered institutions, the unregistered institutions are grouped under the registered ones and listed as aliases in authority files. In cases where the cluster only contains unregistered institution names, the institution name with the largest number is taken as the base data for grouping and written in authority files after manual review.</p>
          <p>In summary, the institution matching and merging model consists of 3 components: GloVe, DBSCAN, and a set of rules. First, we manually formulated rules to transform and process the data. Next, the organization names were represented as vectors using GloVe. Finally, the DBSCAN algorithm clusters the vectors, and the institution names are matched and merged with the authority files according to the clustering results. These 3 components work together to guarantee the precision of institution name matching while maintaining high operational efficiency.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Qualitative Analysis</title>
        <p>In this study, we combined the affiliation data from WoS, Dimensions, and Scopus and initially constructed institutional authority files through cleaning, clustering, and other normalization processes, which include not only the English and Chinese normalized names and aliases but also the countries where institutions are located. To verify the accuracy of institution name normalization, the affiliation data of 129 articles were obtained from Dimensions, WoS, and Scopus, and a total of 665 institution names were extracted from them. Of the 129 articles, 100 were the latest literature published in January 2023 and were excluded from the training data. They were cleaned, deduplicated, and imported into the model for analysis, thus normalizing the institution names. We invited 3 librarians to review the data item by item; 2 of them performed a back-to-back review and then asked the third person to review when there was a discrepancy between the 2 review results, and the review results of the 3 people were unified. The model achieved excellent results, with a 93.79% accuracy rate, 93.08% recall rate, and 93.43% F<sub>1</sub>-score. Here is a success example: the affiliation data are “Plastic Surgery Hospital, Chinese Academy of Medical Sciences and Peking Union Medical College, Beijing,100144, China.” The trained model could exactly extract the institution name and normalize it to the “Plastic Surgery Institute and Hospital, Chinese Academy of Medical Science and Peking Union Medical College.”</p>
        <p>The three main functions that could be achieved by the model are outlined in the <italic>Background</italic> section and analyzed in depth as follows:</p>
        <list list-type="order">
          <list-item>
            <p>Identification of registered institutions: Once the name of a given institution from publications matched the name in authority files, the model could not only accurately identify the institution but also associate it with the corresponding institution in authority files. In addition, a unique ID was assigned according to the authority files.</p>
          </list-item>
          <list-item>
            <p>Identification of the aliases of registered institutions: Although the input institution names are actually the same institution but have different expression forms, the model can identify such irregularly written name variants, accurately associate them with the corresponding institutions in authority files, and add aliases and affiliations to them (eg, the correct form of the author’s institution is “Chinese Academy of Medical Sciences &#38; Peking Union Medical College,” but it is often written as “Chinese Academy of Medical Science &#38; Peking Union Medical College”). It is worth noting that many institutions have very similar names, such as “Chinese Academy of Medical Sciences” and “China Academy of Chinese Medical Sciences.” They have similar names but actually point to different institutions. Mismatches of institution names are prone to occur if clustering is based solely on literal similarity. The deep learning model built in this study helped recognize institution names at the semantic level, which could identify the name variants of the same institution and effectively avoid mismatches between different institutions.</p>
          </list-item>
          <list-item>
            <p>Identification of unregistered institutions: If the model identifies the input institution as an unregistered institution, a new institution would be created in authority files, and the information of normalized names and institution attributes can be completed with the help of the institution’s official website and other web resources. When the same entity reappeared, regardless of its normalization name or alias, the model could associate it with the corresponding new entity in authority files rather than treating it as an unregistered entity.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Error Analysis</title>
        <p>Error analysis is critical for understanding the model shortcomings, thereby contributing to the in-depth analysis and improvement of the model [<xref ref-type="bibr" rid="ref41">41</xref>]. The model of institution name normalization achieved satisfactory results; however, 46 errors remained. We analyzed the specification of 665 institution names and then observed the error types and causes of errors to better optimize the model. The error analysis results are shown in <xref ref-type="table" rid="table3">Table 3</xref>.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Error analysis results.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="190"/>
            <col width="450"/>
            <col width="360"/>
            <thead>
              <tr valign="top">
                <td>Error type</td>
                <td>Reason</td>
                <td>Improvement direction</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Ambiguous institution aliases in authority files</td>
                <td>One of the aliases of this secondary institution in authority files is “School of Life Science,” which is the same as the aliases of several secondary institutions, resulting in a mismatch of models.</td>
                <td>Improve the authority files and increase the discrimination of the secondary institution aliases to avoid the appearance of the same or similar aliases.</td>
              </tr>
              <tr valign="top">
                <td>Hierarchical relationships of nested institutions were not identified</td>
                <td>A nested institution is a special type of institutional entity. It is necessary to specifically train a model that recognizes the relationships of nested institutions in order to accurately identify them. However, this study did not train such a model.</td>
                <td>Train a model that can recognize hierarchical relationships of nested institutions.</td>
              </tr>
              <tr valign="top">
                <td>Missing identification of institution names</td>
                <td>Artificial matching rules are not perfect enough, which affects model matching.</td>
                <td>Further refine the matching rules and prepare more corpus for model training.</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>As mentioned in the <italic>Deep Learning Models</italic> section, the deep learning–based model performed well overall, but there were still some issues, mainly in the following three aspects (<xref ref-type="table" rid="table3">Table 3</xref>):</p>
        <list list-type="order">
          <list-item>
            <p>The aliases in the specification document are ambiguous, which may cause the model to make incorrect matches.</p>
          </list-item>
          <list-item>
            <p>Some of the hierarchical relationships among nested institutions were not correctly identified.</p>
          </list-item>
          <list-item>
            <p>The model sometimes failed to identify all the institution names, especially when there were a considerable number of institutions co-occurring in 1 article.</p>
          </list-item>
        </list>
        <p>In view of the abovementioned problems, the follow-up work will focus on 3 aspects: first, further improve the authority files and the model to avoid the occurrence of ambiguous aliases, and if the model cannot accurately identify the parent institution owing to the similarity of institution aliases, a hint should be given to guide the technical staff to further review; second, train an institution classification model for nested institutions to further expand the matching range of the model and improve its matching accuracy; and third, adjust the matching rules to cover all possible address types to the maximum extent so that the institution names at each level can be correctly and fully matched.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Major Applications of the Proposed Model</title>
        <p>The deep learning–based model for institution name normalization trained in this study could be widely applied for the evaluation of institutions’ scientific research competitiveness, analysis of institutions’ research fields, and creation of interinstitutional cooperation networks. Accurate and comprehensive publication data form the basis of econometric analysis, and both the evaluation of the competitiveness of research institutions and the analysis of research fields require scientific econometric data. The former usually requires statistics on various indicators, such as the number of articles published by institutions, the citation frequency of papers, and the number of paper awards, whereas the latter can analyze the development trend of institutions by counting the fields in which their articles are published and cited more frequently. The accuracy of data statistics is based on the normalization of institution names. The normalization can effectively reduce errors in institutional academic achievement statistics and improve the quality and credibility of evaluation and analysis.</p>
        <p>Interinstitutional scientific cooperation is an integral part of promoting scientific communication, which will help researchers discover cooperative relationships and spatial distribution and further analyze the core institutions in a research field using network indicators such as network density and betweenness centrality [<xref ref-type="bibr" rid="ref42">42</xref>]. Commonly used tools include CiteSpace (Chaomei Chen), VOSviewer (Centre for Science and Technology Research, University of Leiden), and Pajek (Vladimir Batagelj and Andrej Mrvar). Generally, literature citation data are directly imported into the aforementioned software for the analysis and mapping of the cooperation network. However, owing to nonnormalized institution names, the resources of the same institution might be displayed as different institutions, which is not conducive to the integration and use of resources. Normalizing the names of institutions effectively avoids this issue and makes the interinstitutional cooperation relationship clearer and more explicit.</p>
      </sec>
      <sec>
        <title>Limitation and Future Work</title>
        <p>A potential limitation of this study is that the extraction of hierarchical relationships for nested entities falls short of expectations. The structure of nested entities exhibits a unique pattern in which multiple entities are intertwined and nested within their names. As entity recognition usually involves assigning starting and ending markers to the entities, the intertwining of entities makes the marking process extremely challenging. The identification of nested entities presents a challenge for current NER tasks. In the near future, we will further optimize the model in terms of the classification of nested institutions’ affiliation data, try to construct a model for nested institution hierarchical relationship extraction, and then count the academic results of research institutions to actually validate the model.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>A unified and accurate description of institutional names is of great significance for the precise attribution of scientific research results, evaluation of the competitiveness of scientific research institutions, and even knowledge discovery. In this study, a deep learning–based model for institution name normalization was trained based on the integration of several submodels, such as the classification model, matching model, and merging model. The proposed model could accurately extract institution names and other information from multisource affiliation data by matching with the authority files and realize the normalization of institution names. After several rounds of testing, we found that the model could achieve 93.79% accuracy and has a promising specification effect, which would be widely used in downstream tasks such as institutional research field analysis and institutional influence assessment.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Zip code extraction results.</p>
        <media xlink:href="formative_v7i1e47434_app1.png" xlink:title="PNG File , 192 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">BERT</term>
          <def>
            <p>bidirectional encoder representations from transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BiLSTM</term>
          <def>
            <p>bidirectional long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CNN</term>
          <def>
            <p>convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CRF</term>
          <def>
            <p>conditional random field</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">DBSCAN</term>
          <def>
            <p>Density-Based Spatial Clustering of Applications with Noise</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">GloVe</term>
          <def>
            <p>Global Vectors for Word Representation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">ISO</term>
          <def>
            <p>International Organization for Standardization</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">LSTM</term>
          <def>
            <p>long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">NER</term>
          <def>
            <p>named entity recognition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">RNN</term>
          <def>
            <p>recurrent neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">WoS</term>
          <def>
            <p>Web of Science</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by the Innovation Fund for Medical Sciences of the Chinese Academy of Medical Sciences (grant 2021-I2M-1-033).</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data can be made available upon reasonable request; please contact the corresponding author for further details. The authors would like to assure the readers that all data have been secured in accordance with the policies of the research institution and that the results have been reported accurately.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Falahati Qadimi Fumani</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Goltaji</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Parto</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Inconsistent transliteration of Iranian university names: a hazard to Iran’s ranking in ISI web of science</article-title>
          <source>Scientometrics</source>
          <year>2012</year>
          <month>07</month>
          <day>21</day>
          <volume>95</volume>
          <fpage>371</fpage>
          <lpage>84</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/article/10.1007/s11192-012-0818-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11192-012-0818-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>French</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Powell</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Schulman</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Pfaltz</surname>
              <given-names>JL</given-names>
            </name>
          </person-group>
          <article-title>Automating the construction of authority files in digital libraries: a case study</article-title>
          <source>Proceedings of the 1st European Conference on Research and Advanced Technology for Digital Libraries</source>
          <year>1997</year>
          <conf-name>ECDL '97</conf-name>
          <conf-date>September 1-3, 1997</conf-date>
          <conf-loc>Pisa, Italy</conf-loc>
          <fpage>55</fpage>
          <lpage>71</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/chapter/10.1007/BFb0026721"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/bfb0026721</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>French</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Powell</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Schulman</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Using clustering strategies for creating authority files</article-title>
          <source>J Am Soc Inf Sci</source>
          <year>2000</year>
          <month>05</month>
          <volume>51</volume>
          <issue>8</issue>
          <fpage>774</fpage>
          <lpage>86</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.1002/%28SICI%291097-4571%282000%2951%3A8%253C774%3A%3AAID-ASI90%253E3.3.CO%3B2-G"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/(sici)1097-4571(2000)51:8&#60;774::aid-asi90&#62;3.0.co;2-p</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Phan</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tay</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>NeuPL: attention-based semantic matching and pair-linking for entity disambiguation</article-title>
          <source>Proceedings of the 2017 ACM on Conference on Information and Knowledge Management</source>
          <year>2017</year>
          <conf-name>CIKM '17</conf-name>
          <conf-date>November 6-10, 2017</conf-date>
          <conf-loc>Singapore, Republic of Singapore</conf-loc>
          <fpage>1667</fpage>
          <lpage>76</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.1145/3132847.3132963"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/3132847.3132963</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jacob</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Javed</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mcnair</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>sCooL: a system for academic institution name normalization</article-title>
          <source>Proceedings of the 2014 International Conference on Collaboration Technologies and Systems</source>
          <year>2014</year>
          <conf-name>CTS '14</conf-name>
          <conf-date>May 19-23, 2014</conf-date>
          <conf-loc>Minneapolis, MN</conf-loc>
          <fpage>86</fpage>
          <lpage>93</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/6867547"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/cts.2014.6867547</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rousseau</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Institution name disambiguation for research assessment</article-title>
          <source>Scientometrics</source>
          <year>2014</year>
          <volume>99</volume>
          <issue>3</issue>
          <fpage>823</fpage>
          <lpage>38</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/article/10.1007/s11192-013-1214-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11192-013-1214-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Xian</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Institution information specification and correlation based on institutional PIDs and IND tool</article-title>
          <source>Scientometrics</source>
          <year>2020</year>
          <volume>122</volume>
          <issue>1</issue>
          <fpage>381</fpage>
          <lpage>96</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/article/10.1007/s11192-019-03268-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11192-019-03268-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kronman</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Gunnarsson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Karlsson</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>The bibliometric database at the Swedish Research Council – contents, methods and indicators</article-title>
          <source>Swedish Research Council</source>
          <year>2017</year>
          <month>01</month>
          <day>16</day>
          <access-date>2022-12-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.vr.se/download/18.781fb755163605b8cd21b8a3/1529480567144/The+bibliometric+database+at+the+Swedish+Research+Council+-+contents,+methods+and+indicators.pdf">https://www.vr.se/download/18.781fb755163605b8cd21b8a3/1529480567144/The+bibliometric+database+at+the+Swedish+Research+Council+-+contents,+methods+and+indicators.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Han</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Giles</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Zha</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tsioutsiouliklis</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Two supervised learning approaches for name disambiguation in author citations</article-title>
          <source>Proceedings of the 4th ACM/IEEE-CS Joint Conference on Digital Libraries</source>
          <year>2004</year>
          <conf-name>JCDL '04</conf-name>
          <conf-date>June 7-11, 2004</conf-date>
          <conf-loc>Tuscon, AZ</conf-loc>
          <fpage>296</fpage>
          <lpage>305</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.1145/996350.996419"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/996350.996419</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Balsmeier</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Assaf</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chesebro</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Fierro</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>G-C</given-names>
            </name>
            <name name-style="western">
              <surname>Lück</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>O'Reagan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yeh</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Fleming</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Machine learning and natural language processing on the patent corpus: data, tools, and new measures</article-title>
          <source>J Econ Manag Strategy</source>
          <year>2018</year>
          <month>07</month>
          <day>18</day>
          <volume>27</volume>
          <issue>3</issue>
          <fpage>535</fpage>
          <lpage>53</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://onlinelibrary.wiley.com/doi/abs/10.1111/jems.12259"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/jems.12259</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Learning entity representation for entity disambiguation</article-title>
          <source>Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics</source>
          <year>2013</year>
          <conf-name>ACL '13</conf-name>
          <conf-date>August 4-9, 2013</conf-date>
          <conf-loc>Sofia, Bulgaria</conf-loc>
          <fpage>30</fpage>
          <lpage>4</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/P13-2006.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Altenbek</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Aierzhati</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Chinese short text entity disambiguation based on the dual-channel hybrid network</article-title>
          <source>IEEE Access</source>
          <year>2020</year>
          <volume>8</volume>
          <fpage>206164</fpage>
          <lpage>73</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/9256270/authors#authors"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/access.2020.3037333</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shinde</surname>
              <given-names>PP</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A review of machine learning and deep learning applications</article-title>
          <source>Proceedings of the 4th International Conference on Computing Communication Control and Automation</source>
          <year>2018</year>
          <conf-name>ICCUBEA '18</conf-name>
          <conf-date>August 16-18, 2018</conf-date>
          <conf-loc>Pune, India</conf-loc>
          <fpage>1</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/8697857"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/iccubea.2018.8697857</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yin</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kann</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schütze</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Comparative study of CNN and RNN for natural language processing</article-title>
          <source>arXiv. Preprint posted online February 7, 2017</source>
          <year>2023</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1702.01923"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1702.01923</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Convolutional neural networks for sentence classification</article-title>
          <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2014</year>
          <conf-name>EMNLP '14</conf-name>
          <conf-date>October 25-29, 2014</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <fpage>1746</fpage>
          <lpage>51</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/D14-1181.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.3115/v1/d14-1181</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sangeetha</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Deep learning architectures for named entity recognition: a survey</article-title>
          <source>Proceedings of the 2018 International Conference on Advanced Computing and Intelligent Engineering</source>
          <year>2018</year>
          <conf-name>ICACIE '18</conf-name>
          <conf-date>December 22-24, 2018</conf-date>
          <conf-loc>Bhubaneswar, India</conf-loc>
          <fpage>215</fpage>
          <lpage>25</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/chapter/10.1007/978-981-15-1081-6_18"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/978-981-15-1081-6_18</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Acheampong</surname>
              <given-names>FA</given-names>
            </name>
            <name name-style="western">
              <surname>Nunoo-Mensah</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Transformer models for text-based emotion detection: a review of BERT-based approaches</article-title>
          <source>Artif Intell Rev</source>
          <year>2021</year>
          <month>02</month>
          <day>08</day>
          <volume>54</volume>
          <issue>8</issue>
          <fpage>5789</fpage>
          <lpage>829</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/article/10.1007/s10462-021-09958-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s10462-021-09958-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Kaiser</surname>
              <given-names>Ł</given-names>
            </name>
            <name name-style="western">
              <surname>Polosukhin</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Attention is all you need</article-title>
          <source>Proceedings of the 31st International Conference on Neural Information Processing Systems</source>
          <year>2017</year>
          <month>12</month>
          <conf-name>NIPS '17</conf-name>
          <conf-date>December 4-9, 2017</conf-date>
          <conf-loc>Long Beach, CA</conf-loc>
          <fpage>6000</fpage>
          <lpage>10</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.5555/3295222.3295349"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tanaka</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bai</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Shinnou</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Construction of document feature vectors using BERT</article-title>
          <source>Proceedings of the 2020 International Conference on Technologies and Applications of Artificial Intelligence</source>
          <year>2020</year>
          <conf-name>TAAI '20</conf-name>
          <conf-date>December 3-5, 2020</conf-date>
          <conf-loc>Taipei, Taiwan</conf-loc>
          <fpage>232</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/9382454"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/taai51410.2020.00050</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Stojanov</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Popovski</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Cenikj</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Koroušić Seljak</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Eftimov</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>A fine-tuned bidirectional encoder representations from transformers model for food named-entity recognition: algorithm development and validation</article-title>
          <source>J Med Internet Res</source>
          <year>2021</year>
          <month>08</month>
          <day>09</day>
          <volume>23</volume>
          <issue>8</issue>
          <fpage>e28229</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2021/8/e28229/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/28229</pub-id>
          <pub-id pub-id-type="medline">34383671</pub-id>
          <pub-id pub-id-type="pii">v23i8e28229</pub-id>
          <pub-id pub-id-type="pmcid">PMC8415558</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ge</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nakamura</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Chinese-named entity recognition from adverse drug event records: radical embedding-combined dynamic embedding-based BERT in a Bidirectional Long Short-term Conditional Random Field (Bi-LSTM-CRF) model</article-title>
          <source>JMIR Med Inform</source>
          <year>2021</year>
          <month>12</month>
          <day>01</day>
          <volume>9</volume>
          <issue>12</issue>
          <fpage>e26407</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2021/12/e26407/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/26407</pub-id>
          <pub-id pub-id-type="medline">34855616</pub-id>
          <pub-id pub-id-type="pii">v9i12e26407</pub-id>
          <pub-id pub-id-type="pmcid">PMC8686410</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mahmood</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>The NLP cookbook: modern recipes for transformer based deep learning architectures</article-title>
          <source>IEEE Access</source>
          <year>2021</year>
          <volume>9</volume>
          <fpage>68675</fpage>
          <lpage>702</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/9422763"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/access.2021.3077350</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Rawat</surname>
              <given-names>BP</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Fine-tuning bidirectional encoder representations from transformers (BERT)-based models on large-scale electronic health record notes: an empirical study</article-title>
          <source>JMIR Med Inform</source>
          <year>2019</year>
          <month>09</month>
          <day>12</day>
          <volume>7</volume>
          <issue>3</issue>
          <fpage>e14830</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2019/3/e14830/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/14830</pub-id>
          <pub-id pub-id-type="medline">31516126</pub-id>
          <pub-id pub-id-type="pii">v7i3e14830</pub-id>
          <pub-id pub-id-type="pmcid">PMC6746103</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>A survey on deep learning for named entity recognition</article-title>
          <source>IEEE Trans Knowl Data Eng</source>
          <year>2022</year>
          <month>01</month>
          <volume>34</volume>
          <issue>1</issue>
          <fpage>50</fpage>
          <lpage>70</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.1109/TKDE.2020.2981314"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/tkde.2020.2981314</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bikel</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Schwartz</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Weischedel</surname>
              <given-names>RM</given-names>
            </name>
          </person-group>
          <article-title>An algorithm that learns what's in a name</article-title>
          <source>Mach Learn</source>
          <year>1999</year>
          <month>02</month>
          <volume>34</volume>
          <fpage>211</fpage>
          <lpage>31</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/article/10.1023/A:1007558221122"/>
          </comment>
          <pub-id pub-id-type="doi">10.1023/A:1007558221122</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McCallum</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Early results for named entity recognition with conditional random fields, feature induction and web-enhanced lexicons</article-title>
          <source>Proceedings of the 7th Conference on Natural Language Learning at HLT-NAACL</source>
          <year>2003</year>
          <conf-name>CoNLL '03</conf-name>
          <conf-date>May 31, 2003</conf-date>
          <conf-loc>Edmonton, AB</conf-loc>
          <fpage>188</fpage>
          <lpage>91</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.3115/1119176.1119206"/>
          </comment>
          <pub-id pub-id-type="doi">10.3115/1119176.1119206</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mayfield</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>McNamee</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Piatko</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Named entity recognition using hundreds of thousands of features</article-title>
          <source>Proceedings of the 7th Conference on Natural Language Learning at HLT-NAACL</source>
          <year>2003</year>
          <conf-name>CoNLL '03</conf-name>
          <conf-date>May 31, 2003</conf-date>
          <conf-loc>Edmonton, AB</conf-loc>
          <fpage>184</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/W03-0429.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.3115/1119176.1119205</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hammerton</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Named entity recognition with long short-term memory</article-title>
          <source>Proceedings of the 7th Conference on Natural Language Learning at HLT-NAACL</source>
          <year>2003</year>
          <conf-name>CoNLL '03</conf-name>
          <conf-date>May 31, 2003</conf-date>
          <conf-loc>Edmonton, AB</conf-loc>
          <fpage>172</fpage>
          <lpage>5</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.3115/1119176.1119202"/>
          </comment>
          <pub-id pub-id-type="doi">10.3115/1119176.1119202</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Improving named entity recognition for Chinese social media with word segmentation representation learning</article-title>
          <source>Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics</source>
          <year>2016</year>
          <conf-name>ACL '16</conf-name>
          <conf-date>August 7-12, 2016</conf-date>
          <conf-loc>Berlin, Germany</conf-loc>
          <fpage>149</fpage>
          <lpage>55</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/P16-2025.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/p16-2025</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lample</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ballesteros</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Subramanian</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kawakami</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Dyer</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Neural architectures for named entity recognition</article-title>
          <source>Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2016</year>
          <conf-name>NAACL-HLT '16</conf-name>
          <conf-date>June 12-17, 2016</conf-date>
          <conf-loc>San Diego, CA</conf-loc>
          <fpage>260</fpage>
          <lpage>70</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/N16-1030.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/n16-1030</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Joshi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Weld</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Levy</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>SpanBERT: improving pre-training by representing and predicting spans</article-title>
          <source>Trans Assoc Comput Linguist</source>
          <year>2020</year>
          <month>12</month>
          <volume>8</volume>
          <fpage>64</fpage>
          <lpage>77</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00300/43539/SpanBERT-Improving-Pre-training-by-Representing"/>
          </comment>
          <pub-id pub-id-type="doi">10.1162/tacl_a_00300</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>MW</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>Proceedings of the 17th Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2019</year>
          <conf-name>NAACL-HLT '19</conf-name>
          <conf-date>June 2-7, 2019</conf-date>
          <conf-loc>Minneapolis, MN</conf-loc>
          <fpage>4171</fpage>
          <lpage>86</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/N19-1423.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lei</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>A novel CNN-based method for question classification in intelligent question answering</article-title>
          <source>Proceedings of the 2018 International Conference on Algorithms, Computing and Artificial Intelligence</source>
          <year>2018</year>
          <conf-name>ACAI '18</conf-name>
          <conf-date>December 21-23, 2018</conf-date>
          <conf-loc>Sanya, China</conf-loc>
          <fpage>1</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/abs/10.1145/3302425.3302483"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/3302425.3302483</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rui</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Xing</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Jai</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>BOWL: bag of word clusters text representation using word embeddings</article-title>
          <source>Proceedings of the 9th International Conference on Knowledge Science, Engineering and Management</source>
          <year>2016</year>
          <conf-name>KSEM '16</conf-name>
          <conf-date>October 5-7, 2016</conf-date>
          <conf-loc>Passau, Germany</conf-loc>
          <fpage>3</fpage>
          <lpage>14</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/chapter/10.1007/978-3-319-47650-6_1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/978-3-319-47650-6_1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kusner</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kolkin</surname>
              <given-names>NI</given-names>
            </name>
            <name name-style="western">
              <surname>Weinberger</surname>
              <given-names>KQ</given-names>
            </name>
          </person-group>
          <article-title>From word embeddings to document distances</article-title>
          <source>Proceedings of the 32nd International Conference on International Conference on Machine Learning</source>
          <year>2015</year>
          <conf-name>ICML '15</conf-name>
          <conf-date>July 6-11, 2015</conf-date>
          <conf-loc>Lille, France</conf-loc>
          <fpage>957</fpage>
          <lpage>66</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.5555/3045118.3045221"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Çelik</surname>
              <given-names>Ö</given-names>
            </name>
            <name name-style="western">
              <surname>Koç</surname>
              <given-names>BC</given-names>
            </name>
          </person-group>
          <article-title>Classification of Turkish news text by TF-IDF, Word2vec and Fasttext vector model methods</article-title>
          <source>DEÜ FMD</source>
          <year>2021</year>
          <volume>23</volume>
          <issue>67</issue>
          <fpage>121</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dergipark.org.tr/tr/download/article-file/940060"/>
          </comment>
          <pub-id pub-id-type="doi">10.21205/deufmd.2021236710</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cichosz</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Unsupervised modeling anomaly detection in discussion forums posts using global vectors for text representation</article-title>
          <source>Nat Lang Eng</source>
          <year>2020</year>
          <month>03</month>
          <day>04</day>
          <volume>26</volume>
          <issue>5</issue>
          <fpage>551</fpage>
          <lpage>78</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cambridge.org/core/journals/natural-language-engineering/article/abs/unsupervised-modeling-anomaly-detection-in-discussion-forums-posts-using-global-vectors-for-text-representation/D48695A566706691800569E2D724F918"/>
          </comment>
          <pub-id pub-id-type="doi">10.1017/s1351324920000066</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lauren</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Qu</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Watta</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>GB</given-names>
            </name>
            <name name-style="western">
              <surname>Lendasse</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Generating word embeddings from an extreme learning machine for sentiment analysis and sequence labeling tasks</article-title>
          <source>Cogn Comput</source>
          <year>2018</year>
          <month>03</month>
          <day>02</day>
          <volume>10</volume>
          <issue>4</issue>
          <fpage>625</fpage>
          <lpage>38</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/article/10.1007/s12559-018-9548-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s12559-018-9548-y</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peters</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Neumann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Iyyer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gardner</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Deep contextualized word representations</article-title>
          <source>Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2018</year>
          <conf-name>NAACL-HLT '18</conf-name>
          <conf-date>June 1-6, 2018</conf-date>
          <conf-loc>New Orleans, LA</conf-loc>
          <fpage>2227</fpage>
          <lpage>37</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/N18-1202.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Radu</surname>
              <given-names>RG</given-names>
            </name>
            <name name-style="western">
              <surname>Rădulescu</surname>
              <given-names>IM</given-names>
            </name>
            <name name-style="western">
              <surname>Truică</surname>
              <given-names>CO</given-names>
            </name>
            <name name-style="western">
              <surname>Apostol</surname>
              <given-names>ES</given-names>
            </name>
            <name name-style="western">
              <surname>Mocanu</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Clustering documents using the document to vector model for dimensionality reduction</article-title>
          <source>Proceedings of the 2020 International Conference on Automation, Quality and Testing, Robotics</source>
          <year>2020</year>
          <conf-name>AQTR '20</conf-name>
          <conf-date>May 21-23, 2020</conf-date>
          <conf-loc>Cluj-Napoca, Romania</conf-loc>
          <fpage>1</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/9129967"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/aqtr49680.2020.9129967</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ribeiro</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>Heer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Weld</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Errudite: scalable, reproducible, and testable error analysis</article-title>
          <source>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</source>
          <year>2019</year>
          <conf-name>ACL '19</conf-name>
          <conf-date>July 28-August 2, 2019</conf-date>
          <conf-loc>Florence, Italy</conf-loc>
          <fpage>747</fpage>
          <lpage>63</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/P19-1073.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/p19-1073</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>CiteSpace II: detecting and visualizing emerging trends and transient patterns in scientific literature</article-title>
          <source>J Am Soc Inf Sci Technol</source>
          <year>2006</year>
          <month>02</month>
          <day>01</day>
          <volume>57</volume>
          <issue>3</issue>
          <fpage>359</fpage>
          <lpage>77</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://onlinelibrary.wiley.com/doi/abs/10.1002/asi.20317"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/asi.20317</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
