<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i1e49562</article-id>
      <article-id pub-id-type="pmid">38833288</article-id>
      <article-id pub-id-type="doi">10.2196/49562</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Identifying X (Formerly Twitter) Posts Relevant to Dementia and COVID-19: Machine Learning Approach</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Yao</surname>
            <given-names>Li-Hung</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chatzimina</surname>
            <given-names>Maria</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Azizi</surname>
            <given-names>Mehrnoosh</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4337-4630</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Jamali</surname>
            <given-names>Ali Akbar</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6942-5926</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Spiteri</surname>
            <given-names>Raymond J</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Computer Science</institution>
            <institution>University of Saskatchewan</institution>
            <addr-line>S425 Thorvaldson Building, 110 Science Place</addr-line>
            <addr-line>Saskatoon, SK, S7N5C9</addr-line>
            <country>Canada</country>
            <phone>1 306 966 2909</phone>
            <email>spiteri@cs.usask.ca</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3513-6237</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Computer Science</institution>
        <institution>University of Saskatchewan</institution>
        <addr-line>Saskatoon, SK</addr-line>
        <country>Canada</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Raymond J Spiteri <email>spiteri@cs.usask.ca</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>4</day>
        <month>6</month>
        <year>2024</year>
      </pub-date>
      <volume>8</volume>
      <elocation-id>e49562</elocation-id>
      <history>
        <date date-type="received">
          <day>1</day>
          <month>6</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>30</day>
          <month>11</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>11</day>
          <month>12</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>3</day>
          <month>4</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Mehrnoosh Azizi, Ali Akbar Jamali, Raymond J Spiteri. Originally published in JMIR Formative Research (https://formative.jmir.org), 04.06.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2024/1/e49562" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>During the pandemic, patients with dementia were identified as a vulnerable population. X (formerly Twitter) became an important source of information for people seeking updates on COVID-19, and, therefore, identifying posts (formerly tweets) relevant to dementia can be an important support for patients with dementia and their caregivers. However, mining and coding relevant posts can be daunting due to the sheer volume and high percentage of irrelevant posts.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The objective of this study was to automate the identification of posts relevant to dementia and COVID-19 using natural language processing and machine learning (ML) algorithms.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We used a combination of natural language processing and ML algorithms with manually annotated posts to identify posts relevant to dementia and COVID-19. We used 3 data sets containing more than 100,000 posts and assessed the capability of various algorithms in correctly identifying relevant posts.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Our results showed that (pretrained) transfer learning algorithms outperformed traditional ML algorithms in identifying posts relevant to dementia and COVID-19. Among the algorithms tested, the transfer learning algorithm A Lite Bidirectional Encoder Representations from Transformers (ALBERT) achieved an accuracy of 82.92% and an area under the curve of 83.53%. ALBERT substantially outperformed the other algorithms tested, further emphasizing the superior performance of transfer learning algorithms in the classification of posts.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Transfer learning algorithms such as ALBERT are highly effective in identifying topic-specific posts, even when trained with limited or adjacent data, highlighting their superiority over other ML algorithms and applicability to other studies involving analysis of social media posts. Such an automated approach reduces the workload of manual coding of posts and facilitates their analysis for researchers and policy makers to support patients with dementia and their caregivers and other vulnerable populations.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>machine learning</kwd>
        <kwd>dementia</kwd>
        <kwd>Alzheimer disease</kwd>
        <kwd>COVID-19</kwd>
        <kwd>X (Twitter)</kwd>
        <kwd>natural language processing</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Dementia is a group of progressive syndromes that cause impairments in high-level cognitive functions, such as memory, language, and thinking, as well as everyday functioning and social interactions [<xref ref-type="bibr" rid="ref1">1</xref>]. Currently, approximately 50 million people worldwide are affected by dementia, and this number is projected to continue to rise rapidly [<xref ref-type="bibr" rid="ref2">2</xref>]. As populations age worldwide, dementia is expected to continue to be a significant health care challenge, requiring considerable medical, social, and institutional care [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
      <p>The emergence of the COVID-19 pandemic presented an additional challenge for patients with dementia and their caregivers. Older patients were at a higher risk of contracting COVID-19 and were more likely to experience severe symptoms and consequences due to age, vulnerability, frailty, and other health conditions commonly associated with dementia [<xref ref-type="bibr" rid="ref4">4</xref>]. In the United Kingdom, half (50%) of the COVID-19–related deaths in care homes were among patients with dementia [<xref ref-type="bibr" rid="ref5">5</xref>]. Patients with dementia face challenges in adhering to self-protection protocols, such as wearing masks, using hand hygiene, and practicing physical distancing. In addition, they may have difficulty understanding and remembering the risks associated with COVID-19 [<xref ref-type="bibr" rid="ref6">6</xref>]. Moreover, the caregivers of patients with dementia face limitations imposed by COVID-19, such as social isolation, loss of support, and care-partner burnout, all of which can further exacerbate the unpleasant situation of patients with dementia by limiting their access to public services and the support provided by their caregivers [<xref ref-type="bibr" rid="ref6">6</xref>].</p>
      <p>Social media platforms have emerged as valuable sources of information for individuals seeking updates on various issues, performing scientific studies, providing support, and raising public awareness. X (formerly Twitter), established in 2006 with about 400 million users, is a popular social media platform for microblogging where users can publicly share their thoughts using short messages called posts (formerly tweets). Posts can provide insight into COVID-19–related experiences of patients with dementia and their caregivers, offering an opportunity to study the impact of the pandemic on them. By identifying and analyzing posts relevant to dementia and COVID-19, substantial opportunities to support effective decision-making and policy development and further research can be uncovered.</p>
      <p>Manual analysis, mining, and coding of posts can be a challenging, time-consuming, and laborious task, hindering the ability of health care researchers and practitioners to gain insights into the impact of COVID-19 on patients with dementia and other vulnerable populations. As such, there is a need to improve existing methodologies to streamline and automate the process. Machine learning (ML) algorithms have become a popular approach for performing tasks such as thematic or semantic analysis, and the classification of posts with reliable accuracy, offering a promising solution to this challenge [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref11">11</xref>].</p>
      <p>ML algorithms have shown promise in classifying posts based on different sentiments. For example, Roy and Ojha [<xref ref-type="bibr" rid="ref12">12</xref>] trained 3 classifiers using automatically labeled data in their study. However, the inclusion of a significant amount of noise through the automated labeling of data resulted in poor performance of their classifiers. Chiroma et al [<xref ref-type="bibr" rid="ref13">13</xref>] manually labeled posts and used ML algorithms to classify suicide-related posts. The authors applied the Bag of Words technique for feature extraction and evaluated the performance of 5 traditional ML algorithms. Although there was no significant difference in performance among different algorithms for most classifications, binary classification showed the most promising results.</p>
      <p>Despite these results, recent studies have raised concerns about the effectiveness of traditional ML algorithms for identifying and classifying posts [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. Therefore, this study aims to demonstrate an alternative method for identifying posts relevant to dementia and COVID-19 using a combination of natural language processing (NLP) and ML algorithms and explore the reliability and performance of these algorithms. By leveraging the power of more advanced algorithms and techniques, specifically transfer learning algorithms, we aim to develop a model that can effectively analyze large volumes of posts to identify relevant posts and potentially offer valuable insights to policy makers, health care professionals, and researchers to inform evidence-based decision-making and support patients with dementia and their caregivers. Furthermore, the methodology described applies more generally to identify relevant social media posts to help facilitate further analysis. The overview and workflow of our study are shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>The workflow of this study. (1) Data collection involved gathering posts containing search keywords from 3 distinct data sets: the First Wave data set consisting of 5063 posts from February 15, 2020, to September 7, 2020; the Longitudinal data set consisting of 110,528 posts from September 8, 2020, to December 8, 2021; and the Alzheimer’s Awareness Month data set consisting of 1289 posts from January 1, 2022, to January 31, 2022; (2) data preprocessing, and (3) classification/evaluation.</p>
        </caption>
        <graphic xlink:href="formative_v8i1e49562_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Collection</title>
        <p>In this study, we aimed to analyze posts and identify those relevant to dementia and COVID-19. We used 3 data sets for this purpose. The first data set, referred to as the First Wave data set, consisted of 5063 posts collected between February 15, 2020, and September 7, 2020, using the search terms “dementia” or “Alzheimer” in combination with “COVID-19,” “COVID,” or “Corona” [<xref ref-type="bibr" rid="ref16">16</xref>]. This data set captured the experiences of people impacted by Alzheimer disease or dementia and COVID-19 in the early stages of the pandemic. To capture the later stages of the pandemic, we collected a second data set, referred to as the Longitudinal data set, consisting of 110,528 posts between September 8, 2020, and December 8, 2021, using the same search terms as the first data set. Finally, to examine the X discourse on dementia during Alzheimer’s Awareness Month in Canada, we collected a third data set, referred to as the Alzheimer’s Awareness Month data set, comprising 1289 posts between January 1, 2022, and January 31, 2022. <xref ref-type="table" rid="table1">Table 1</xref> summarizes the particulars of the data sets, including the number of instances and search terms for each data set. Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> depicts the word cloud of the most frequently used words in each data set.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>The characteristics of different data sets<sup>a</sup>.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="120"/>
            <col width="70"/>
            <col width="200"/>
            <col width="330"/>
            <col width="280"/>
            <thead>
              <tr valign="top">
                <td>Data set</td>
                <td>Posts, n</td>
                <td>Time frame</td>
                <td>First search terms</td>
                <td>Second search terms</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>First Wave</td>
                <td>5063</td>
                <td>February 15, 2020, to September 7, 2020</td>
                <td>Dementia, Alzheimer</td>
                <td>COVID-19, COVID, Corona</td>
              </tr>
              <tr valign="top">
                <td>Longitudinal</td>
                <td>110,528</td>
                <td>September 8, 2020, to December 8, 2021</td>
                <td>Dementia, Alzheimer</td>
                <td>COVID-19, COVID, Corona</td>
              </tr>
              <tr valign="top">
                <td>Alzheimer’s Awareness Month</td>
                <td>1289</td>
                <td>January 1, 2022, to January 31, 2022</td>
                <td>#AlzheimersAwarenessMonth, #AlzAwareness, #dementiawareness, dementia month, dementia awareness month, Alzheimer’s awareness month, Alzheimer’s month, January is Alzheimer’s Awareness month</td>
                <td>@alzCanada, @AlzheimerOnt, @AlzheimerSK, @DementiaAB_NT, @AlzheimerNS, @AlzheimerNB, @AlzheimerPEI, @alzheimerMB, @AlzheimerBC,<break/>@asnl2,<break/>@FqsaAlzh</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>The data applied in the study consist of 3 distinct data sets including different search keywords.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Data Preprocessing</title>
        <p>To construct data sets consistently, preprocessing was performed on all 3 data sets to remove noise. Preparation steps such as simple filtering (removing emoticons, misspelled words, duplicates, obviously irrelevant characters, posts, and replies), tokenization, lemmatization, lowercasing, and normalization (removing punctuation, possessive pronouns, and stop words) were applied. After this filtering, the remaining posts were manually labeled as either relevant or irrelevant by a team of 11 coders using thematic analysis and the codebook from Bacsu et al [<xref ref-type="bibr" rid="ref16">16</xref>]. Intercoder reliability was assessed through random reviewing of 25% (1560/6243) of all labeled posts and an average intercoder reliability of 83.4% was achieved.</p>
      </sec>
      <sec>
        <title>Feature Extraction</title>
        <p>After the preprocessing steps, the tokenized words were then converted into vectors using the term frequency–inverse document frequency (TF-IDF) method [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. TF-IDF calculates the relative frequency of words in a specific document compared with the inverse proportion of that word over the entire document collection, thereby determining the importance of a given word in a document. Accordingly, words frequently used in a document have higher TF-IDF weights and are regarded as more representative. Words common to many documents, such as articles and prepositions, have lower TF-IDF weights and are considered less representative [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. The TF-IDF hyperparameter, max-features, determines the number of the most representative words for the rest of the analysis [<xref ref-type="bibr" rid="ref18">18</xref>]. We set the max-features threshold to 1250 based on experimentation and the scores of different ML classifiers. This approach helps select the most representative words for the highest performance.</p>
      </sec>
      <sec>
        <title>ML Algorithms</title>
        <sec>
          <title>Overview</title>
          <p>Various traditional and transfer learning ML algorithms were applied to identify posts relevant to dementia and COVID-19.</p>
        </sec>
        <sec>
          <title>Traditional ML Algorithms</title>
          <sec>
            <title>Logistic Regression</title>
            <p>Logistic regression (LR) is a statistical ML method used for analyzing the relationship between a binary dependent variable and 1 or more independent variables. It is commonly used for classification tasks, where the goal is to predict the probability of an event occurring based on a set of input variables. The output of a logistic regression model is a probability value between 0 and 1, which can be interpreted as the likelihood of the event occurring [<xref ref-type="bibr" rid="ref20">20</xref>].</p>
          </sec>
          <sec>
            <title>Naïve Bayes</title>
            <p>Naïve Bayes (NB) is a probabilistic ML algorithm that is based on the Bayes rule. It uses the assumptions of strong independence between variables to construct a simple and fast algorithm [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
          </sec>
          <sec>
            <title>Multinomial Naïve Bayes</title>
            <p>Multinomial Naïve Bayes (MNB) is a probabilistic ML algorithm based on Bayes’ theorem that calculates the probability of a particular event occurring based on prior knowledge of the conditions that may affect that event [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. In text classification, the algorithm learns the probability distribution of words in each class and then calculates the conditional probability of a new document belonging to each class given its word frequency distribution.</p>
          </sec>
          <sec>
            <title>K-Nearest Neighbors</title>
            <p>K-nearest neighbors (kNN) is a nonparametric ML algorithm known for its simplicity and effectiveness. kNN classifies data based on the closeness of training samples in a given region [<xref ref-type="bibr" rid="ref24">24</xref>].</p>
          </sec>
          <sec>
            <title>Support Vector Machine</title>
            <p>Support vector machine (SVM) is a widely used supervised ML algorithm for classification and regression analysis. SVMs are based on finding the best hyperplane that separates data into different classes. The hyperplane is chosen such that it maximizes the distance between the 2 classes [<xref ref-type="bibr" rid="ref25">25</xref>].</p>
          </sec>
          <sec>
            <title>Decision Tree</title>
            <p>Decision tree (DT) is a supervised ML algorithm that is designed to solve classification problems by learning a hierarchy of “if/else” questions and answers and by creating a tree representation that results in a decision [<xref ref-type="bibr" rid="ref26">26</xref>]. The goal of the DT classifier is to get the right classification result by asking the least number of “if/else” questions.</p>
          </sec>
        </sec>
      </sec>
      <sec>
        <title>Ensemble Algorithms</title>
        <sec>
          <title>Overview</title>
          <p>Ensemble algorithms are designed to improve the accuracy of classification by combining multiple base classifiers. Although any type of base classifier including DT, neural networks, or SVMs can be used to create ensemble algorithms, DT is a commonly used algorithm [<xref ref-type="bibr" rid="ref27">27</xref>].</p>
        </sec>
        <sec>
          <title>Random Forest</title>
          <p>Random forest (RF) is a powerful ensemble algorithm that uses DTs as the base classifiers. In RF, a set of features is randomly selected to determine the best split at each node of the DT. To make a prediction for a new data point, RF first applies each DT in the forest and predicts the target. Then, it uses the majority vote of all the DT predictions to assign the target with the highest probability to the new data [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>].</p>
        </sec>
        <sec>
          <title>AdaBoost</title>
          <p>AdaBoost (Adaptive Boosting) is a boosting algorithm that combines weak classifiers to create a strong classifier. It works by iteratively adjusting the weights of misclassified samples and adding new weak classifiers based on the samples that were misclassified in the previous iteration. This makes the algorithm adaptive, enabling it to focus on the more difficult samples in subsequent iterations. The final classification is determined by a weighted combination of the weak classifiers [<xref ref-type="bibr" rid="ref29">29</xref>].</p>
        </sec>
        <sec>
          <title>XGBoost</title>
          <p>Extreme Gradient Boosting (XGBoost) is a distributed boosting algorithm that uses regression trees as a base classifier. It is designed to improve the accuracy of gradient boosting and is known for its high predictive power and speed, capable of being many times faster than other boosting techniques due to its parallel and distributed computing capabilities [<xref ref-type="bibr" rid="ref30">30</xref>]. In addition, XGBoost performs well in sparse feature spaces [<xref ref-type="bibr" rid="ref31">31</xref>] and uses more accurate approximations to find the best tree model.</p>
        </sec>
      </sec>
      <sec>
        <title>Transfer Learning Algorithms</title>
        <sec>
          <title>Overview</title>
          <p>Transfer learning refers to transferring knowledge from different but related source domains to the target model in target domains in order to improve the performance of the target model. Consequently, a target model can be constructed without having to rely on a large amount of domain data [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref32">32</xref>].</p>
        </sec>
        <sec>
          <title>Bidirectional Encoder Representations From Transformers</title>
          <p>Bidirectional Encoder Representations from Transformers (BERT) is a transfer learning algorithm that uses bidirectional transformers to create contextualized embeddings for each word in a sentence or text. It has achieved state-of-the-art performance in a wide range of NLP tasks, including text classification, question answering, and language understanding [<xref ref-type="bibr" rid="ref33">33</xref>]. BERT has 2 phases: pretraining and fine-tuning. In the pretraining phase, the model is trained on a large corpus of text to learn general language patterns. In the fine-tuning phase, the pretrained model is adapted to a specific task by training it on a smaller, task-specific data set [<xref ref-type="bibr" rid="ref34">34</xref>].</p>
        </sec>
        <sec>
          <title>A Lite BERT</title>
          <p>A Lite BERT (ALBERT) is a variant of BERT that reduces the number of parameters in the model while maintaining the same performance. This is achieved by applying 2 parameter reduction techniques: factorized embedding parameterization and cross-layer parameter sharing. ALBERT has been shown to have faster training times and better scalability than BERT, making it a useful option for large-scale NLP tasks [<xref ref-type="bibr" rid="ref35">35</xref>].</p>
        </sec>
      </sec>
      <sec>
        <title>Evaluation</title>
        <p>To evaluate the performance of compared algorithms, we randomly partitioned the First Wave data set into 2 subsets: a training set used for cross-validation (CV; 4050/5063, 80% labeled posts) and a test set (1013/5063, 20% labeled posts). The CV set was further divided into 10 subsets, allowing us to construct and train 10 distinct models. Using unseen test data is essential to evaluate the models’ generalization capability. This assessment with unseen data minimizes the risk of overfitting and offers a more dependable estimate of the algorithm’s real-world performance. Then, we selected the best-performing classifier and evaluated its reliability on unseen data using the Longitudinal data set. A reliable algorithm should accurately predict the class of a large portion of the unseen data. Finally, we trained and tested the best-performing algorithm on the Alzheimer’s Awareness Month data set, using a training set (129/1289, 10% labeled posts) and a test set (1160/1289, 90% labeled posts). To assess the performance of the used ML algorithms, evaluation metrics including accuracy, precision, sensitivity, specificity, <italic>F</italic><sub>1</sub>-score, and area under the curve (AUC) were used. These metrics provide a comprehensive understanding of the performance of algorithms in terms of correctly identifying relevant posts and minimizing false positives and false negatives.</p>
      </sec>
      <sec>
        <title>Parameter Sensitivity Analysis</title>
        <p>To optimize the performance of competing algorithms, it is necessary to tune their parameters. However, there is no one-size-fits-all approach for parameter selection. Therefore, in this study, we explored different sets of parameter values for each algorithm to identify the optimal configuration. A summary of the ML algorithms and their respective parameters used in this study can be found in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Parameters of different ML<sup>a</sup> algorithms<sup>b</sup>.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Algorithm</td>
                <td>Parameters</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>LR<sup>c</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>C: [0.1, 0.5, 1, 2]</p>
                    </list-item>
                    <list-item>
                      <p>Multiclass: [ovr]</p>
                    </list-item>
                    <list-item>
                      <p>Solver: [Newton-CG, LBFGS, Liblinear, SAG, SAGA]</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>kNN<sup>d</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Number of neighbors: [3, 5, 7, …, 51]</p>
                    </list-item>
                    <list-item>
                      <p>Metrics: [Euclidean, Manhattan]</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>SVM<sup>e</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>C: [0.1, 0.5, 1, 1.5, 2]</p>
                    </list-item>
                    <list-item>
                      <p>Kernel: [linear, poly, RBF, sigmoid]</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>DT<sup>f</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Criterion: [Entropy, Gini]</p>
                    </list-item>
                    <list-item>
                      <p>Min sample leaf: [1, 2, 4, 6, 8]</p>
                    </list-item>
                    <list-item>
                      <p>Min sample split: [1, 2, 3, …, 10, …, 20]</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>RF<sup>g</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>N estimators: [100, 200, 300, 400, 500]</p>
                    </list-item>
                    <list-item>
                      <p>Criterion: [Entropy, Gini]</p>
                    </list-item>
                    <list-item>
                      <p>Max depth: [2, 4, 6, …, 32, …, 64]</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Adaboost<sup>h</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Base estimators: [LR, DT, SVM]</p>
                    </list-item>
                    <list-item>
                      <p>N estimators: [10, 20, 30, …, 100, …, 500]</p>
                    </list-item>
                    <list-item>
                      <p>Max depth: [1, 2, 3, 4, …, 20]</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>XGBoost<sup>i</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Max depth: [1, 2, 3, 4, …, 19, 20]</p>
                    </list-item>
                    <list-item>
                      <p>Learning rate: [0.01, 0.015, 0.02, 0.025, …, 0.1]</p>
                    </list-item>
                  </list>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>ML: machine learning.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>Each algorithm has different parameters. The values in brackets represent values for each parameter.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>LR: logistic regression.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>kNN: k-nearest neighbor.</p>
            </fn>
            <fn id="table2fn5">
              <p><sup>e</sup>SVM: support vector machine.</p>
            </fn>
            <fn id="table2fn6">
              <p><sup>f</sup>DT: decision tree.</p>
            </fn>
            <fn id="table2fn7">
              <p><sup>g</sup>RF: random forest.</p>
            </fn>
            <fn id="table2fn8">
              <p><sup>h</sup>Adaboost: Adaptive Boosting.</p>
            </fn>
            <fn id="table2fn9">
              <p><sup>i</sup>XGBoost: Extreme Gradient Boosting.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Implementation</title>
        <p>Posts were obtained using TWINT, a powerful scraping tool that allows for the scraping of posts without requiring an X account or the use of X’s application programming interface, enhancing the number of posts scraped, frequency, and time period of scrapes. We used the Python programming language (version 3.8.5; Python Software Foundation) with the NLTK [<xref ref-type="bibr" rid="ref36">36</xref>], Scikit-Learn [<xref ref-type="bibr" rid="ref37">37</xref>], and TensorFlow [<xref ref-type="bibr" rid="ref38">38</xref>] libraries. The computations were performed on an Nvidia Tesla T4 GPU (Nvidia Corporation) within Google Colab.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>Ethical considerations about social media research suggest that publicly available data (eg, posts on X) can be used for research studies without requiring additional consent or ethics approval [<xref ref-type="bibr" rid="ref39">39</xref>]. In this study, we did not apply for ethics approval. In addition, because we did not engage or interfere with the users whose publicly posted content was collected and analyzed, we did not require informed consent. Nonetheless, to ensure users’ anonymity and protect their privacy, any related identifying personal information (eg, user IDs and usernames) has been removed. There was no compensation provided to the users whose public X posts were used.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Parameter Sensitivity Analysis</title>
        <p>In this study, several algorithms were used with multiple parameters that were tuned to obtain optimal performance. AdaBoost performed best with a base estimator of DT, N-estimator set to 100, and a max depth set to 5. DT achieved the best performance with the criterion parameter set to “entropy,” a minimum sample leaf set to 100, and a minimum sample split set to 5. kNN performed best with 5 neighbors and with the “Euclidean” distance metric. LR achieved the best performance with a parameter <italic>C</italic> of 0.5, “ovr” selected for multiclass, and “liblinear” selected for solver parameters. RF performed best with “Gini” as the criterion parameter, N-estimator set to 300, and max depth set to 32. SVM performed best with a <italic>radial basis function</italic> kernel and <italic>C</italic> set to 5. Finally, XGBoost performed best when max depth and learning rate were set to 6 and 0.015, respectively.</p>
      </sec>
      <sec>
        <title>Performance Analysis</title>
        <sec>
          <title>Study 1</title>
          <p>The performance of various algorithms in identifying posts relevant to dementia and COVID-19 was evaluated with the First Wave data set. SVM achieved the highest accuracy (96.17%), precision (93.08%), sensitivity (98.04%), specificity (94.86%), <italic>F</italic><sub>1</sub>-score (95.49%), and AUC (99.41%) for the training set, followed by the RF algorithm obtaining the second-best performance across all metrics. <xref ref-type="table" rid="table3">Table 3</xref> summarizes the performance metrics of different algorithms for the training set.</p>
          <p>For the test set, the ALBERT algorithm achieved the best accuracy (82.92%), precision (74.55%), sensitivity (84.83%), specificity (82.24%), <italic>F</italic><sub>1</sub>-score (78.81%), and AUC (83.53%) among all the algorithms tested. The BERT algorithm achieved the second-best performance for these metrics. Performance metrics of different algorithms for the test set are summarized in <xref ref-type="table" rid="table4">Table 4</xref>, and the receiver operating characteristic curves of competing algorithms for the training and test sets are shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>A and 2B, respectively.</p>
          <p>Based on the performance of various algorithms on the test set (<xref ref-type="table" rid="table4">Table 4</xref>), the ALBERT algorithm demonstrated promising performance in class-wise measurements and accurately identifying posts relevant to dementia and COVID-19. Therefore, we concluded that the ALBERT algorithm shows reliable performance for this task. The class-wise results of competing algorithms using confusion matrices are shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>.</p>
          <table-wrap position="float" id="table3">
            <label>Table 3</label>
            <caption>
              <p>The performance of different algorithms for the training set using the First Wave data set.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="190"/>
              <col width="120"/>
              <col width="130"/>
              <col width="140"/>
              <col width="140"/>
              <col width="140"/>
              <col width="140"/>
              <thead>
                <tr valign="top">
                  <td>Algorithm</td>
                  <td>Mean accuracy % (SD)</td>
                  <td>Mean precision % (SD)</td>
                  <td>Mean sensitivity % (SD)</td>
                  <td>Mean specificity % (SD)</td>
                  <td>Mean <italic>F</italic><sub>1</sub>-score % (SD)</td>
                  <td>Mean AUC<sup>a</sup> % (SD)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Logistic regression</td>
                  <td>80.51 (0.18)</td>
                  <td>68.39 (0.54)</td>
                  <td>83.88 (0.32)</td>
                  <td>78.65 (0.19)</td>
                  <td>75.35 (0.31)</td>
                  <td>88.96 (0.1)</td>
                </tr>
                <tr valign="top">
                  <td>Naïve Bayes</td>
                  <td>78.91 (0.29)</td>
                  <td>62.57 (0.62)</td>
                  <td>85.07 (0.48)</td>
                  <td>76.02 (0.27)</td>
                  <td>72.1 (0.41)</td>
                  <td>87.86 (0.13)</td>
                </tr>
                <tr valign="top">
                  <td>Multinomial Naïve Bayes</td>
                  <td>79.3 (0.56)</td>
                  <td>71.82 (4.46)</td>
                  <td>79 (2.67)</td>
                  <td>79.74 (1.82)</td>
                  <td>75.06 (1.74)</td>
                  <td>82.23 (1.51)</td>
                </tr>
                <tr valign="top">
                  <td>K-nearest neighbor</td>
                  <td>78.41 (0.32)</td>
                  <td>69.78 (2.61)</td>
                  <td>78.41 (1.79)</td>
                  <td>78.53 (1.08)</td>
                  <td>73.78 (0.67)</td>
                  <td>87.42 (0.23)</td>
                </tr>
                <tr valign="top">
                  <td>Support vector machine</td>
                  <td>
                    <italic>96.17 (0.13)<sup>b</sup></italic>
                  </td>
                  <td>
                    <italic>93.08 (0.23)</italic>
                  </td>
                  <td>
                    <italic>98.04 (0.22)</italic>
                  </td>
                  <td>
                    <italic>94.86 (0.15)</italic>
                  </td>
                  <td>
                    <italic>95.49 (0.15)</italic>
                  </td>
                  <td>
                    <italic>99.41 (0.05)</italic>
                  </td>
                </tr>
                <tr valign="top">
                  <td>Decision tree</td>
                  <td>77.39 (0.62)</td>
                  <td>68.87 (1.48)</td>
                  <td>76.84 (1.18)</td>
                  <td>77.76 (0.72)</td>
                  <td>72.62 (0.87)</td>
                  <td>86.19 (0.79)</td>
                </tr>
                <tr valign="top">
                  <td>Random forest</td>
                  <td>88.66 (0.34)</td>
                  <td>80.83 (1.03)</td>
                  <td>92.17 (0.42)</td>
                  <td>86.49 (0.54)</td>
                  <td>86.12 (0.52)</td>
                  <td>96.87 (0.09)</td>
                </tr>
                <tr valign="top">
                  <td>AdaBoost</td>
                  <td>75.08 (0.85)</td>
                  <td>76.2 (2.34)</td>
                  <td>69.6 (1.68)</td>
                  <td>80.21 (0.98)</td>
                  <td>72.7 (0.74)</td>
                  <td>83.86 (0.55)</td>
                </tr>
                <tr valign="top">
                  <td>XGBoost</td>
                  <td>73.99 (0.57)</td>
                  <td>59.9 (5.67)</td>
                  <td>75.65 (2.64)</td>
                  <td>73.41 (1.94)</td>
                  <td>66.59 (2.57)</td>
                  <td>82.8 (0.24)</td>
                </tr>
                <tr valign="top">
                  <td>BERT<sup>c</sup></td>
                  <td>81.52 (0.1)</td>
                  <td>75.42 (1.04)</td>
                  <td>85.13 (1.01)</td>
                  <td>81.78 (0.2)</td>
                  <td>80.23 (1.32)</td>
                  <td>82.98 (0.4)</td>
                </tr>
                <tr valign="top">
                  <td>ALBERT<sup>d</sup></td>
                  <td>81.78 (0.45)</td>
                  <td>74.98 (0.98)</td>
                  <td>84.08 (0.82)</td>
                  <td>82.19 (0.61)</td>
                  <td>79.27 (0.5)</td>
                  <td>83.13 (0.45)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table3fn1">
                <p><sup>a</sup>AUC: area under the curve.</p>
              </fn>
              <fn id="table3fn2">
                <p><sup>b</sup>The best values for the performance metrics are in italics.</p>
              </fn>
              <fn id="table3fn3">
                <p><sup>c</sup>BERT: Bidirectional Encoder Representations from Transformers.</p>
              </fn>
              <fn id="table3fn4">
                <p><sup>d</sup>ALBERT: A Lite BERT.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <table-wrap position="float" id="table4">
            <label>Table 4</label>
            <caption>
              <p>The performance of different algorithms for the test set using the First Wave data set.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="230"/>
              <col width="130"/>
              <col width="120"/>
              <col width="140"/>
              <col width="140"/>
              <col width="120"/>
              <col width="120"/>
              <thead>
                <tr valign="top">
                  <td>Algorithm</td>
                  <td>Mean accuracy % (SD)</td>
                  <td>Mean precision % (SD)</td>
                  <td>Mean sensitivity % (SD)</td>
                  <td>Mean specificity % (SD)</td>
                  <td>Mean <italic>F</italic><sub>1</sub>-score % (SD)</td>
                  <td>Mean AUC<sup>a</sup> % (SD)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Logistic regression</td>
                  <td>75.13 (0.32)</td>
                  <td>61.97 (2.81)</td>
                  <td>76.55 (2.85)</td>
                  <td>74.39 (2.38)</td>
                  <td>68.43 (1.96)</td>
                  <td>82.21 (0.99)</td>
                </tr>
                <tr valign="top">
                  <td>Naïve Bayes</td>
                  <td>74.26 (0.48)</td>
                  <td>56.19 (3.39)</td>
                  <td>78.8 (3.91)</td>
                  <td>72.29 (2.65)</td>
                  <td>65.5 (2.65)</td>
                  <td>82.13 (1.16)</td>
                </tr>
                <tr valign="top">
                  <td>Multinomial Naïve Bayes</td>
                  <td>72.1 (2.67)</td>
                  <td>63.6 (4.26)</td>
                  <td>69.92 (4.52)</td>
                  <td>73.75 (1.92)</td>
                  <td>66.43 (2.78)</td>
                  <td>74.37 (1.87)</td>
                </tr>
                <tr valign="top">
                  <td>K-nearest neighbor</td>
                  <td>64.31 (1.79)</td>
                  <td>53.54 (3.98)</td>
                  <td>60.49 (4.74)</td>
                  <td>66.94 (1.97)</td>
                  <td>56.6 (2.5)</td>
                  <td>73.87 (2.72)</td>
                </tr>
                <tr valign="top">
                  <td>Support vector machine</td>
                  <td>74.75 (0.22)</td>
                  <td>62.04 (2.2)</td>
                  <td>75.65 (2.99)</td>
                  <td>74.26 (2.1)</td>
                  <td>68.13 (1.93)</td>
                  <td>82.09 (1.09)</td>
                </tr>
                <tr valign="top">
                  <td>Decision tree</td>
                  <td>66.49 (1.18)</td>
                  <td>56.03 (2.98)</td>
                  <td>62.97 (3.21)</td>
                  <td>68.73 (2.4)</td>
                  <td>59.25 (2.52)</td>
                  <td>69.64 (2.18)</td>
                </tr>
                <tr valign="top">
                  <td>Random forest</td>
                  <td>73.37 (0.42)</td>
                  <td>56.98 (2.4)</td>
                  <td>75.91 (3.79)</td>
                  <td>72.15 (2.06)</td>
                  <td>65.05 (2.46)</td>
                  <td>80.48 (1.82)</td>
                </tr>
                <tr valign="top">
                  <td>AdaBoost</td>
                  <td>68.13 (1.68)</td>
                  <td>67.98 (4.67)</td>
                  <td>62.44 (3.65)</td>
                  <td>73.54 (3.04)</td>
                  <td>64.94 (2.6)</td>
                  <td>73.67 (1.65)</td>
                </tr>
                <tr valign="top">
                  <td>XGBoost</td>
                  <td>70.32 (2.64)</td>
                  <td>54.6 (5.89)</td>
                  <td>70.75 (3.52)</td>
                  <td>70.29 (2.88)</td>
                  <td>61.4 (3.82)</td>
                  <td>76.82 (1.93)</td>
                </tr>
                <tr valign="top">
                  <td>BERT<sup>b</sup></td>
                  <td>81.03 (1.01)</td>
                  <td>73.87 (3.43)</td>
                  <td>80.21 (3.1)</td>
                  <td>77.61 (3.3)</td>
                  <td>77.87 (2.34)</td>
                  <td>80.89 (2.3)</td>
                </tr>
                <tr valign="top">
                  <td>ALBERT<sup>c</sup></td>
                  <td>
                    <italic>82.92 (0.82)<sup>d</sup></italic>
                  </td>
                  <td>
                    <italic>74.55 (8.91)</italic>
                  </td>
                  <td>
                    <italic>84.83 (5.86)</italic>
                  </td>
                  <td>
                    <italic>82.24 (5.06)</italic>
                  </td>
                  <td>
                    <italic>78.81 (4.93)</italic>
                  </td>
                  <td>
                    <italic>83.53 (3.14)</italic>
                  </td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table4fn1">
                <p><sup>a</sup>AUC: area under the curve.</p>
              </fn>
              <fn id="table4fn2">
                <p><sup>b</sup>BERT: Bidirectional Encoder Representations from Transformers.</p>
              </fn>
              <fn id="table4fn3">
                <p><sup>c</sup>ALBERT: A Lite BERT.</p>
              </fn>
              <fn id="table4fn4">
                <p><sup>d</sup>The best values for the performance metrics are in italics.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Receiver operating characteristic curves for competing machine learning algorithms for training (A) and test sets (B). The corresponding area under the curve values are given for each algorithm. ROC: receiver operating characteristic; LR: logistic regression; AUC: area under the curve; NB: Naïve Bayes; MNB: Multinomial Naïve Bayes; kNN: k-nearest neighbor; SVM: support vector machine; DT: decision tree; RF: random forest; BERT: Bidirectional Encoder Representations from Transformers; ALBERT: A Lite BERT.</p>
            </caption>
            <graphic xlink:href="formative_v8i1e49562_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Confusion matrices produced by the competing algorithms for the test set using the First Wave data set. LR: logistic regression; NB: Naïve Bayes; MNB: Multinomial Naïve Bayes; kNN: k-nearest neighbor; SVM: support vector machine; DT: decision tree; RF: random forest; BERT: Bidirectional Encoder Representations from Transformers; ALBERT: A Lite BERT.</p>
            </caption>
            <graphic xlink:href="formative_v8i1e49562_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Study 2</title>
          <p>To extend our study to other conditions and evaluate the reliability of the best-performing algorithm in identifying posts relevant to dementia and COVID-19, we applied it to the longitudinal data set. Because this data set shares similarities with the one used in study 1, the results of study 2 can indicate whether the same methods can be used for similar data sets involving mental health disorders, pandemics, and the like. The ALBERT algorithm was used to test the longitudinal data set, and the results, as shown in <xref ref-type="table" rid="table5">Table 5</xref>, confirmed its effectiveness in correctly identifying posts relevant to dementia and COVID-19 at the second stage of the pandemic.</p>
          <p>Although the ALBERT algorithm demonstrated promising performance in study 2, it was important to assess the generalizability and robustness of the methodology introduced by using a similar data set and search terms. Testing an algorithm on a data set with similar words as the training data can result in overfitting and may not generalize well to other data sets. Furthermore, a customized method for a given data set is not overly useful. Accordingly, study 3 was conducted to evaluate the performance of the ALBERT algorithm on a new and unrelated yet adjacent data set.</p>
          <table-wrap position="float" id="table5">
            <label>Table 5</label>
            <caption>
              <p>The performance of the ALBERT<sup>a</sup> algorithm using the longitudinal data set.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="100"/>
              <col width="160"/>
              <col width="160"/>
              <col width="170"/>
              <col width="170"/>
              <col width="110"/>
              <col width="130"/>
              <thead>
                <tr valign="top">
                  <td>Algorithm</td>
                  <td>Mean accuracy %</td>
                  <td>Mean precision %</td>
                  <td>Mean sensitivity %</td>
                  <td>Mean specificity %</td>
                  <td>Mean <italic>F</italic><sub>1</sub>-score %</td>
                  <td>Mean AUC<sup>b</sup> %</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>ALBERT</td>
                  <td>81.23</td>
                  <td>86.02</td>
                  <td>79.45</td>
                  <td>83.41</td>
                  <td>82.24</td>
                  <td>81.46</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table5fn1">
                <p><sup>a</sup>ALBERT: A Lite Bidirectional Encoder Representations from Transformers.</p>
              </fn>
              <fn id="table5fn2">
                <p><sup>b</sup>AUC: area under the curve.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Study 3</title>
          <p>Study 3 aimed to assess the generalizability of the ML algorithm in identifying posts relevant to different mental disorders and pandemics by examining the X discourse on dementia during Alzheimer’s Awareness Month in Canada. Given the variability of health issues and the possibility of different pandemics, it is highly beneficial to have a model that performs well across different contexts. To evaluate the performance of the ALBERT algorithm in this context, we retrained the algorithm using 10% (129/1289 labeled posts) of the Alzheimer’s Awareness Month data set (addressing different mental health disorders and containing different search terms from those in the First Wave and longitudinal data sets) and tested it on the remaining 90% (1160/1289 labeled posts). As shown in <xref ref-type="table" rid="table6">Table 6</xref>, using the Alzheimer’s Awareness Month data set, the ALBERT algorithm achieved a reliable and acceptable performance of 80% or higher in all the metrics considered. The results challenged any potential bias of the ALBERT algorithm toward the First Wave and longitudinal data sets and confirmed that it can be applied to identify and classify posts relevant to different mental health disorders during pandemics.</p>
          <table-wrap position="float" id="table6">
            <label>Table 6</label>
            <caption>
              <p>The performance of the ALBERT<sup>a</sup> algorithm for training and test sets using the Alzheimer’s Awareness Month data set.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="100"/>
              <col width="150"/>
              <col width="160"/>
              <col width="160"/>
              <col width="150"/>
              <col width="150"/>
              <col width="130"/>
              <thead>
                <tr valign="bottom">
                  <td>
                    <break/>
                  </td>
                  <td>Mean accuracy %</td>
                  <td>Mean precision %</td>
                  <td>Mean sensitivity %</td>
                  <td>Mean specificity %</td>
                  <td>Mean <italic>F</italic><sub>1</sub>-score %</td>
                  <td>Mean AUC<sup>b</sup> %</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Training set</td>
                  <td>86.39</td>
                  <td>89.21</td>
                  <td>84.98</td>
                  <td>87.24</td>
                  <td>82.77</td>
                  <td>86.24</td>
                </tr>
                <tr valign="top">
                  <td>Test set</td>
                  <td>80.62</td>
                  <td>83.40</td>
                  <td>79.98</td>
                  <td>80.62</td>
                  <td>81.65</td>
                  <td>80.30</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table6fn1">
                <p><sup>a</sup>ALBERT: A Lite Bidirectional Encoder Representations from Transformers.</p>
              </fn>
              <fn id="table6fn2">
                <p><sup>b</sup>AUC: area under the curve.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>The COVID-19 pandemic has highlighted the needs of vulnerable populations, including patients with dementia and their caregivers. Social media platforms, particularly X, provide a valuable source of data for health care researchers, governments, policy makers, and practitioners to understand the impact of the pandemic on this population. However, the sheer volume and high percentage of irrelevant social media data make it difficult to manually mine, analyze, and identify relevant posts for this purpose.</p>
        <p>In this study, we aimed to identify posts relevant to dementia and COVID-19 using a combination of NLP and ML algorithms. Our findings demonstrated that transfer learning algorithms such as ALBERT outperform traditional ML algorithms in identifying relevant posts. The results of study 1 showed the superiority of the ALBERT algorithm and demonstrated that it achieved the best performance for the task of identifying relevant posts. In addition, in study 2, this algorithm demonstrated its capability in identifying posts that share similar content with the data set used in study 1. This indicated that transfer learning algorithms can effectively identify posts relevant to disorders similar to those for which the algorithm was trained. Furthermore, study 3 revealed a high level of generalizability of these algorithms. This suggests that transfer learning algorithms can be trained for a specific disorder or pandemic and can be applied to different (especially adjacent) disorders or pandemics with comparable performance.</p>
        <p>Our study highlights the significant applicability and value of automated approaches in identifying posts relevant to COVID-19 and dementia, and the methodology is applicable to other studies involving the analysis of social media posts. A further application of this study is potential real-time monitoring of public health sentiment during pandemics or other public health crises. Tools can be developed to continuously analyze social media data to track the sentiments, concerns, and information needs of patients with dementia, their caregivers, and the general public. This real-time monitoring can provide invaluable insights to public health authorities, researchers, and policy makers, enabling them to decide about responses, allocate resources efficiently, and provide targeted support to vulnerable populations such as patients with dementia.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Limitations of this study mainly involve the inherent biases in using X data for research. First, X users represent a particular portion of the population, often more tech-savvy, and this bias can result in the underrepresentation of older or less internet-connected demographics. In addition, X data may not capture the perspectives of individuals who do not use this platform for discussing health-related topics, potentially leading to an inaccurate perspective on public sentiment. Furthermore, linguistic and cultural biases can exist in the language and topics discussed on X, and these biases should also be considered when interpreting the findings. Moreover, this study focused on posts filtered by the keywords “Coronavirus” and “dementia.” Although the methods and analysis were tested on different data sets, they are particularly relevant to data related to COVID-19 and dementia. To increase generalizability, future research could explore the use of various other search keywords. Another limitation is the unexplored possible effect of post length on algorithm performance. Because the study did not consider post length, future research could include sensitivity analysis to determine its impact on algorithm performance. Finally, although posts reflect people’s thoughts at a specific moment, textual features alone may not fully reflect overall sentiments. Various syntactic and semantic post features could be explored to enhance algorithm performance. These limitations do not undermine this study’s contributions.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>The COVID-19 pandemic has created a pressing need for policy makers to make timely decisions to address the needs of vulnerable populations, such as patients with dementia and their caregivers. Social media platforms, particularly X, offer a wealth of data that can provide invaluable insights into COVID-19 and its impact on patients with dementia and other vulnerable populations. However, the sheer volume and high fraction of irrelevant data make it difficult to extract and analyze relevant content. An automated tool that can accurately identify posts relevant to a given topic would be of great value to health care researchers, social scientists, policy makers, and practitioners in analyzing such posts.</p>
        <p>In this study, we explored the use of an automated approach to identify posts relevant to dementia and COVID-19 using various ML algorithms. Our study shows that transfer learning algorithms outperform traditional ML algorithms, with the ALBERT algorithm achieving the best performance among the algorithms tested. The reliability of the results was confirmed using independent data sets, highlighting the ability of these algorithms to be part of an automated methodology for identifying posts relevant to dementia and COVID-19. Such a methodology can also be applied to help facilitate other studies that involve analysis of social media posts, and it can aid in effective and timely decision-making during times of acute need.</p>
        <p>Future research could include the analysis of multiple features of posts to further increase identification performance and the reliability of the algorithms. In addition, as pandemic or other communicable disease-related data continue to become available on social media, the development of a range of analytics and ML-based solutions based on processing such data can lead to enhanced support for patients and their caregivers. Other future research directions include use of specific medical-based models to improve performance, application of the approach to nonmedical data to test its generality, and incorporation of cross-language recognition to gain insight from other cultures.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>The word cloud of the most frequently used words in each data set.</p>
        <media xlink:href="formative_v8i1e49562_app1.docx" xlink:title="DOCX File , 277 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ALBERT</term>
          <def>
            <p>A lite BERT</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AUC</term>
          <def>
            <p>area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">BERT</term>
          <def>
            <p>Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CV</term>
          <def>
            <p>cross-validation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">DT</term>
          <def>
            <p>decision tree</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">kNN</term>
          <def>
            <p>k-nearest neighbor</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">LR</term>
          <def>
            <p>logistic regression</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">ML</term>
          <def>
            <p>machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">MNB</term>
          <def>
            <p>multinomial naïve Bayes</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">NB</term>
          <def>
            <p>naïve Bayes</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">RF</term>
          <def>
            <p>random forest</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">TF-IDF</term>
          <def>
            <p>term frequency–inverse document frequency</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">XGBoost</term>
          <def>
            <p>Extreme Gradient Boosting</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors gratefully acknowledge the financial support from the Natural Sciences and Engineering Research Council of Canada under Discovery Grant (RGPIN-2020-04467), Mitacs, and Refresh Inc.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data sets generated or analyzed during this study are available from the corresponding author on reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>All authors were involved in the conceptualization of the idea and writing and reviewing the paper. MA designed and implemented the experiment and collected, curated, analyzed, and visualized the data. AAJ analyzed and visualized the data. RJS looked after the administration, supervised the study, and provided funding and resources.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Leung</surname>
              <given-names>PY</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>AS</given-names>
            </name>
          </person-group>
          <article-title>Effectiveness of lifestyle medicine on cognitive functions in mild cognitive impairments and dementia: a systematic review on randomized controlled trials</article-title>
          <source>Ageing Res Rev</source>
          <year>2023</year>
          <volume>86</volume>
          <fpage>101886</fpage>
          <pub-id pub-id-type="doi">10.1016/j.arr.2023.101886</pub-id>
          <pub-id pub-id-type="medline">36806378</pub-id>
          <pub-id pub-id-type="pii">S1568-1637(23)00045-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Masterson-Algar</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Allen</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>Hyde</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Keating</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Windle</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Exploring the impact of COVID-19 on the care and quality of life of people with dementia and their carers: a scoping review</article-title>
          <source>Dementia</source>
          <year>2021</year>
          <volume>21</volume>
          <issue>2</issue>
          <fpage>648</fpage>
          <lpage>676</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1177/14713012211053971"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/14713012211053971</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lindeza</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Rodrigues</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Costa</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Guerreiro</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rosa</surname>
              <given-names>MM</given-names>
            </name>
          </person-group>
          <article-title>Impact of dementia on informal care: a systematic review of family caregivers' perceptions</article-title>
          <source>BMJ Support Palliat Care</source>
          <year>2024</year>
          <volume>14</volume>
          <fpage>e38</fpage>
          <lpage>e49</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://spcare.bmj.com/content/14/e1/e38"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjspcare-2020-002242</pub-id>
          <pub-id pub-id-type="medline">33055092</pub-id>
          <pub-id pub-id-type="pii">bmjspcare-2020-002242</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>O'Connor</surname>
              <given-names>MK</given-names>
            </name>
            <name name-style="western">
              <surname>Nicholson</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Epstein</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Donley</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Salant</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>AH</given-names>
            </name>
            <name name-style="western">
              <surname>Shirk</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Stevenson</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Mittelman</surname>
              <given-names>MS</given-names>
            </name>
          </person-group>
          <article-title>Telehealth support for dementia caregivers during the COVID-19 pandemic: lessons learned from the NYU family support program</article-title>
          <source>Am J Geriatr Psychiatry</source>
          <year>2023</year>
          <volume>31</volume>
          <issue>1</issue>
          <fpage>14</fpage>
          <lpage>21</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36167652"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jagp.2022.08.005</pub-id>
          <pub-id pub-id-type="medline">36167652</pub-id>
          <pub-id pub-id-type="pii">S1064-7481(22)00497-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC9424119</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <article-title>ONS report shows 52% increase in excess deaths of people dying of dementia—Alzheimer's Society comments</article-title>
          <source>Alzheimer's Society</source>
          <year>2020</year>
          <access-date>2024-05-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.alzheimers.org.uk/news/2020-06-05/ons-report-shows-52-increase-excess-deaths-people-dying-dementia-alzheimers-society">https://www.alzheimers.org.uk/news/2020-06-05/ons-report-shows-52-increase-excess-deaths-people-dying-dementia-alzheimers-society</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>PB</given-names>
            </name>
            <name name-style="western">
              <surname>Gurney</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>COVID-19 and dementia: analyses of risk, disparity, and outcomes from electronic health records in the US</article-title>
          <source>Alzheimers Dement</source>
          <year>2021</year>
          <volume>17</volume>
          <issue>8</issue>
          <fpage>1297</fpage>
          <lpage>1306</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33559975"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/alz.12296</pub-id>
          <pub-id pub-id-type="medline">33559975</pub-id>
          <pub-id pub-id-type="pmcid">PMC8014535</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chayangkoon</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Srivihok</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Text classification model for methamphetamine-related tweets in Southeast Asia using dual data preprocessing techniques</article-title>
          <source>Int J Electr Comput Eng</source>
          <year>2021</year>
          <volume>11</volume>
          <issue>4</issue>
          <fpage>3617</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ijece.iaescore.com/index.php/IJECE/article/view/24615"/>
          </comment>
          <pub-id pub-id-type="doi">10.11591/ijece.v11i4.pp3617-3628</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khattak</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Batool</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Satti</surname>
              <given-names>FA</given-names>
            </name>
            <name name-style="western">
              <surname>Hussain</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>WA</given-names>
            </name>
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Hayat</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Tweets classification and sentiment analysis for personalized tweets recommendation</article-title>
          <source>Complexity</source>
          <year>2020</year>
          <volume>2020</volume>
          <fpage>1</fpage>
          <lpage>11</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.hindawi.com/journals/complexity/2020/8892552/"/>
          </comment>
          <pub-id pub-id-type="doi">10.1155/2020/8892552</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Daouadi</surname>
              <given-names>KE</given-names>
            </name>
            <name name-style="western">
              <surname>Rebaï</surname>
              <given-names>RZ</given-names>
            </name>
            <name name-style="western">
              <surname>Amous</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Optimizing semantic deep forest for tweet topic classification</article-title>
          <source>Inf Syst</source>
          <year>2021</year>
          <volume>101</volume>
          <fpage>101801</fpage>
          <pub-id pub-id-type="doi">10.1016/j.is.2021.101801</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sarsam</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Samarraie</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Alzahrani</surname>
              <given-names>AI</given-names>
            </name>
            <name name-style="western">
              <surname>Alnumay</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>AP</given-names>
            </name>
          </person-group>
          <article-title>A lexicon-based approach to detecting suicide-related messages on Twitter</article-title>
          <source>Biomed Signal Process Control</source>
          <year>2021</year>
          <volume>65</volume>
          <fpage>102355</fpage>
          <pub-id pub-id-type="doi">10.1016/j.bspc.2020.102355</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jamali</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Berger</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Spiteri</surname>
              <given-names>RJ</given-names>
            </name>
          </person-group>
          <article-title>Momentary depressive feeling detection using X (formerly Twitter) data: contextual language approach</article-title>
          <source>JMIR AI</source>
          <year>2023</year>
          <volume>2</volume>
          <fpage>e49531</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ai.jmir.org/2023/1/e49531"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/49531</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Roy</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ojha</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Twitter sentiment analysis using deep learning models</article-title>
          <year>2020</year>
          <conf-name>2020 IEEE 17th India Council International Conference (INDICON)</conf-name>
          <conf-date>December 10-13, 2020</conf-date>
          <conf-loc>New Delhi, India</conf-loc>
          <pub-id pub-id-type="doi">10.1109/indicon49873.2020.9342279</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chiroma</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Cocea</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Suicide related text classification with prism algorithm</article-title>
          <year>2018</year>
          <conf-name>2018 International Conference on Machine Learning and Cybernetics (ICMLC)</conf-name>
          <conf-date>July 15-18, 2018</conf-date>
          <conf-loc>Chengdu, China</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icmlc.2018.8527032</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gulati</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Boddu</surname>
              <given-names>RSK</given-names>
            </name>
            <name name-style="western">
              <surname>Sarvakar</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>DK</given-names>
            </name>
            <name name-style="western">
              <surname>Nomani</surname>
              <given-names>MZM</given-names>
            </name>
          </person-group>
          <article-title>Comparative analysis of machine learning-based classification models using sentiment classification of tweets related to COVID-19 pandemic</article-title>
          <source>Mater Today Proc</source>
          <year>2022</year>
          <volume>51</volume>
          <fpage>38</fpage>
          <lpage>41</lpage>
          <pub-id pub-id-type="doi">10.1016/j.matpr.2021.04.364</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Satu</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>MI</given-names>
            </name>
            <name name-style="western">
              <surname>Mahmud</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Uddin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Summers</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Quinn</surname>
              <given-names>JMW</given-names>
            </name>
            <name name-style="western">
              <surname>Moni</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>TClustVID: a novel machine learning classification model to investigate topics and sentiment in COVID-19 tweets</article-title>
          <source>Knowl Based Syst</source>
          <year>2021</year>
          <volume>226</volume>
          <fpage>107126</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33972817"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.knosys.2021.107126</pub-id>
          <pub-id pub-id-type="medline">33972817</pub-id>
          <pub-id pub-id-type="pii">S0950-7051(21)00389-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC8099549</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bacsu</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>O'Connell</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Cammer</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Azizi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Grewal</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Poole</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Green</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sivananthan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Spiteri</surname>
              <given-names>RJ</given-names>
            </name>
          </person-group>
          <article-title>Using Twitter to understand the COVID-19 experiences of people with dementia: infodemiology study</article-title>
          <source>J Med Internet Res</source>
          <year>2021</year>
          <volume>23</volume>
          <issue>2</issue>
          <fpage>e26254</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2021/2/e26254/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/26254</pub-id>
          <pub-id pub-id-type="medline">33468449</pub-id>
          <pub-id pub-id-type="pii">v23i2e26254</pub-id>
          <pub-id pub-id-type="pmcid">PMC7861035</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Shashi</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Vectorization of text documents for identifying unifiable news articles</article-title>
          <source>Int J Adv Comput Sci Appl</source>
          <year>2019</year>
          <volume>10</volume>
          <issue>7</issue>
          <fpage>305</fpage>
          <lpage>310</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://thesai.org/Publications/ViewPaper?Volume=10&amp;Issue=7&amp;Code=IJACSA&amp;SerialNo=42"/>
          </comment>
          <pub-id pub-id-type="doi">10.14569/ijacsa.2019.0100742</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yao</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Wilbik</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bouchon-Meunier</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Carvalho</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Reformat</surname>
              <given-names>MZ</given-names>
            </name>
            <name name-style="western">
              <surname>Lesot</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Yager</surname>
              <given-names>RR</given-names>
            </name>
            <name name-style="western">
              <surname>Vieira</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Feature extraction with TF-IDF and game-theoretic shadowed sets</article-title>
          <source>Information Processing and Management of Uncertainty in Knowledge-Based Systems</source>
          <year>2020</year>
          <publisher-loc>Cham, Switzerland</publisher-loc>
          <publisher-name>Springer International Publishing</publisher-name>
          <fpage>722</fpage>
          <lpage>733</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ramos</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Using TF-IDF to determine word relevance in document queries</article-title>
          <source>Proc First Instr Conf Mach Learn</source>
          <year>2003</year>
          <volume>242</volume>
          <issue>1</issue>
          <fpage>29</fpage>
          <lpage>48</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://citeseerx.ist.psu.edu/document?repid=rep1&amp;type=pdf&amp;doi=b3bf6373ff41a115197cb5b30e57830c16130c2c"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Riffenburgh</surname>
              <given-names>RH</given-names>
            </name>
            <name name-style="western">
              <surname>Gillen</surname>
              <given-names>DL</given-names>
            </name>
          </person-group>
          <article-title>Logistic regression for binary outcomes</article-title>
          <source>Statistics in Medicine, 4th Edition</source>
          <year>2020</year>
          <publisher-loc>London, United Kingdom</publisher-loc>
          <publisher-name>Academic Press</publisher-name>
          <fpage>437</fpage>
          <lpage>457</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Naive Bayes classification algorithm based on small sample set</article-title>
          <year>2011</year>
          <conf-name>2011 IEEE International Conference on Cloud Computing and Intelligence Systems</conf-name>
          <conf-date>September 15-17, 2011</conf-date>
          <conf-loc>Beijing, China</conf-loc>
          <pub-id pub-id-type="doi">10.1109/ccis.2011.6045027</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Structure extended multinomial naive Bayes</article-title>
          <source>Inf Sci</source>
          <year>2016</year>
          <volume>329</volume>
          <fpage>346</fpage>
          <lpage>356</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ins.2015.09.037</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kibriya</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Frank</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Pfahringer</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Holmes</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Webb</surname>
              <given-names>GI</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Multinomial naive Bayes for text categorization revisited</article-title>
          <source>AI 2004: Advances in Artificial Intelligence</source>
          <year>2005</year>
          <publisher-loc>Berlin, Heidelberg</publisher-loc>
          <publisher-name>Springer Berlin Heidelberg</publisher-name>
          <fpage>488</fpage>
          <lpage>499</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Taunk</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>De</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Verma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Swetapadma</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>A brief review of nearest neighbor algorithm for learning and classification</article-title>
          <year>2019</year>
          <conf-name>2019 International Conference on Intelligent Computing and Control Systems (ICCS)</conf-name>
          <conf-date>May 15-17, 2019</conf-date>
          <conf-loc>Madurai, India</conf-loc>
          <pub-id pub-id-type="doi">10.1109/iccs45141.2019.9065747</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alsagri</surname>
              <given-names>HS</given-names>
            </name>
            <name name-style="western">
              <surname>Ykhlef</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Machine learning-based approach for depression detection in Twitter using content and activity features</article-title>
          <source>IEICE Trans Inf Syst</source>
          <year>2020</year>
          <volume>E103.D</volume>
          <issue>8</issue>
          <fpage>1825</fpage>
          <lpage>1832</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jstage.jst.go.jp/article/transinf/E103.D/8/E103.D_2020EDP7023/_article"/>
          </comment>
          <pub-id pub-id-type="doi">10.1587/transinf.2020edp7023</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Müller</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Guido</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <source>Introduction to Machine Learning with Python: A Guide for Data Scientists</source>
          <year>2016</year>
          <publisher-loc>Sebastopol, California</publisher-loc>
          <publisher-name>O'Reilly Media, Inc</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Che</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Rasheed</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Decision tree and ensemble learning algorithms with their applications in bioinformatics</article-title>
          <source>Adv Exp Med Biol</source>
          <year>2011</year>
          <volume>696</volume>
          <fpage>191</fpage>
          <lpage>199</lpage>
          <pub-id pub-id-type="doi">10.1007/978-1-4419-7046-6_19</pub-id>
          <pub-id pub-id-type="medline">21431559</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Breiman</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Bagging predictors</article-title>
          <source>Mach Learn</source>
          <year>1996</year>
          <volume>24</volume>
          <issue>2</issue>
          <fpage>123</fpage>
          <lpage>140</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/article/10.1007/BF00058655"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/bf00058655</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kowsari</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Meimandi</surname>
              <given-names>KJ</given-names>
            </name>
            <name name-style="western">
              <surname>Heidarysafa</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mendu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Barnes</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Text classification algorithms: a survey</article-title>
          <source>Information</source>
          <year>2019</year>
          <volume>10</volume>
          <issue>4</issue>
          <fpage>150</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/2078-2489/10/4/150"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/info10040150</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Guestrin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>XGBoost: a scalable tree boosting system</article-title>
          <year>2016</year>
          <conf-name>KDD '16: The 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>
          <conf-date>August 13-17, 2016</conf-date>
          <conf-loc>San Francisco, California, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Gautam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Huson</surname>
              <given-names>DH</given-names>
            </name>
          </person-group>
          <article-title>On the application of advanced machine learning methods to analyze enhanced, multimodal data from persons infected with COVID-19</article-title>
          <source>Computation</source>
          <year>2021</year>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>4</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/2079-3197/9/1/4"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/computation9010004</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhuang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Qi</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Duan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Xi</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>A comprehensive survey on transfer learning</article-title>
          <source>Proc IEEE</source>
          <year>2021</year>
          <volume>109</volume>
          <issue>1</issue>
          <fpage>43</fpage>
          <lpage>76</lpage>
          <pub-id pub-id-type="doi">10.1109/jproc.2020.3004555</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Deepa</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Bidirectional Encoder Representations from Transformers (BERT) language model for sentiment analysis task</article-title>
          <source>Turk J Comput Math Educ</source>
          <year>2021</year>
          <volume>12</volume>
          <issue>7</issue>
          <fpage>1708</fpage>
          <lpage>1721</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://turcomat.org/index.php/turkbilmat/article/view/3055"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>González-Carvajal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Garrido-Merchán</surname>
              <given-names>EC</given-names>
            </name>
          </person-group>
          <article-title>Comparing BERT against traditional machine learning text classification</article-title>
          <source>Journal of Computational and Cognitive Engineering</source>
          <year>2020</year>
          <volume>2</volume>
          <issue>4</issue>
          <fpage>352</fpage>
          <lpage>356</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2005.13012"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Goodman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gimpel</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Soricut</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>ALBERT: a lite BERT for self-supervised learning of language representations</article-title>
          <source>International Conference on Learning Representations (ICLR)</source>
          <year>2020</year>
          <fpage>1</fpage>
          <lpage>17</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://iclr.cc/virtual_2020/poster_H1eA7AEtvS.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Perkins</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <source>Python Text Processing With NLTK 2.0 Cookbook</source>
          <year>2010</year>
          <publisher-loc>England</publisher-loc>
          <publisher-name>Packt Publishing</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pedregosa</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Varoquaux</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gramfort</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Michel</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Thirion</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Grisel</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Blondel</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Prettenhofer</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dubourg</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Vanderplas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Passos</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cournapeau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Brucher</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Perrot</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Duchesnay</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Scikit-learn: machine learning in Python</article-title>
          <source>J Mach Learn Res</source>
          <year>2011</year>
          <volume>12</volume>
          <fpage>2825</fpage>
          <lpage>2830</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmlr.org/papers/volume12/pedregosa11a/pedregosa11a.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Silaparasetty</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>The TensorFlow machine learning library</article-title>
          <source>Machine Learning Concepts with Python and the Jupyter Notebook Environment: Using Tensorflow 2.0</source>
          <year>2020</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>149</fpage>
          <lpage>171</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Burnap</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sloan</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Towards an ethical framework for publishing Twitter data in social research: taking into account users' views, online context and algorithmic estimation</article-title>
          <source>Sociology</source>
          <year>2017</year>
          <volume>51</volume>
          <issue>6</issue>
          <fpage>1149</fpage>
          <lpage>1168</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/abs/10.1177/0038038517708140?url_ver=Z39.88-2003&amp;rfr_id=ori:rid:crossref.org&amp;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/0038038517708140</pub-id>
          <pub-id pub-id-type="medline">29276313</pub-id>
          <pub-id pub-id-type="pii">10.1177_0038038517708140</pub-id>
          <pub-id pub-id-type="pmcid">PMC5718335</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
