<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i1e57395</article-id>
      <article-id pub-id-type="pmid"/>
      <article-id pub-id-type="doi">10.2196/57395</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Public Health Discussions on Social Media: Evaluating Automated Sentiment Analysis Methods</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Ashraf</surname>
            <given-names>Amir Reza</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Margolin</surname>
            <given-names>Drew B</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Gandy</surname>
            <given-names>Lisa M</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Computer Science</institution>
            <institution>College of Sciences and Liberal Arts</institution>
            <institution>Kettering University</institution>
            <addr-line>1700 University Ave</addr-line>
            <addr-line>2300 AB</addr-line>
            <addr-line>Flint, MI, 48504</addr-line>
            <country>United States</country>
            <phone>1 9898547001</phone>
            <email>lgandy@kettering.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6487-8064</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Ivanitskaya</surname>
            <given-names>Lana V</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4138-6646</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Bacon</surname>
            <given-names>Leeza L</given-names>
          </name>
          <degrees>DHA</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0000-8229-816X</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Bizri-Baryak</surname>
            <given-names>Rodina</given-names>
          </name>
          <degrees>DHA</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0003-8769-9597</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Computer Science</institution>
        <institution>College of Sciences and Liberal Arts</institution>
        <institution>Kettering University</institution>
        <addr-line>Flint, MI</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Health Administration</institution>
        <institution>The College of Health Professions</institution>
        <institution>Central Michigan University</institution>
        <addr-line>Mt Pleasant, MI</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Healthcare Management</institution>
        <institution>Northwood University</institution>
        <addr-line>Midland, MI</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Lisa M Gandy <email>lgandy@kettering.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>8</day>
        <month>1</month>
        <year>2025</year>
      </pub-date>
      <volume>9</volume>
      <elocation-id>e57395</elocation-id>
      <history>
        <date date-type="received">
          <day>15</day>
          <month>2</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>25</day>
          <month>6</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>31</day>
          <month>8</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>20</day>
          <month>9</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Lisa M Gandy, Lana V Ivanitskaya, Leeza L Bacon, Rodina Bizri-Baryak. Originally published in JMIR Formative Research (https://formative.jmir.org), 08.01.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2025/1/e57395" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Sentiment analysis is one of the most widely used methods for mining and examining text. Social media researchers need guidance on choosing between manual and automated sentiment analysis methods.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>Popular sentiment analysis tools based on natural language processing (NLP; VADER [Valence Aware Dictionary for Sentiment Reasoning], TEXT2DATA [T2D], and Linguistic Inquiry and Word Count [LIWC-22]), and a large language model (ChatGPT 4.0) were compared with manually coded sentiment scores, as applied to the analysis of YouTube comments on videos discussing the opioid epidemic. Sentiment analysis methods were also examined regarding ease of programming, monetary cost, and other practical considerations.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Evaluation methods included descriptive statistics, receiver operating characteristic (ROC) curve analysis, confusion matrices, Cohen κ, accuracy, specificity, precision, sensitivity (recall), <italic>F</italic><sub>1</sub>-score harmonic mean, and the Matthews correlation coefficient. An inductive, iterative approach to content analysis of the data was used to obtain manual sentiment codes.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>A subset of comments was analyzed by a second coder, producing good agreement between the 2 coders’ judgments (κ=0.734). YouTube social media about the opioid crisis had many more negative comments (4871/5533, 88%) than positive comments (662/5533, 12%), making it possible to evaluate the performance of sentiment analysis models in an unbalanced dataset. The tone summary measure from LIWC-22 performed better than other tools for estimating the prevalence of negative versus positive sentiment. According to the ROC curve analysis, VADER was best at classifying manually coded negative comments. A comparison of Cohen κ values indicated that NLP tools (VADER, followed by LIWC’s tone and T2D) showed only fair agreement with manual coding. In contrast, ChatGPT 4.0 had poor agreement and failed to generate binary sentiment scores in 2 out of 3 attempts. Variations in accuracy, specificity, precision, sensitivity, <italic>F</italic><sub>1</sub>-score, and MCC did not reveal a single superior model. <italic>F</italic><sub>1</sub>-score harmonic means were 0.34-0.38 (SD 0.02) for NLP tools and very low (0.13) for ChatGPT 4.0. None of the MCCs reached a strong correlation level.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Researchers studying negative emotions, public worries, or dissatisfaction with social media face unique challenges in selecting models suitable for unbalanced datasets. We recommend VADER, the only cost-free tool we evaluated, due to its excellent discrimination, which can be further improved when the comments are at least 100 characters long. If estimating the prevalence of negative comments in an unbalanced dataset is important, we recommend the tone summary measure from LIWC-22. Researchers using T2D must know that it may only score some data and, compared with other methods, be more time-consuming and cost-prohibitive. A general-purpose large language model, ChatGPT 4.0, has yet to surpass the performance of NLP models, at least for unbalanced datasets with highly prevalent (7:1) negative comments.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>ChatGPT</kwd>
        <kwd>VADER</kwd>
        <kwd>valence aware dictionary for sentiment reasoning</kwd>
        <kwd>LIWC-22</kwd>
        <kwd>machine learning</kwd>
        <kwd>social media</kwd>
        <kwd>sentiment analysis</kwd>
        <kwd>public health</kwd>
        <kwd>population health</kwd>
        <kwd>opioids</kwd>
        <kwd>drugs</kwd>
        <kwd>pharmacotherapy</kwd>
        <kwd>pharmaceuticals</kwd>
        <kwd>medications</kwd>
        <kwd>YouTube</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>The Pew Research Center [<xref ref-type="bibr" rid="ref1">1</xref>] reports that as of 2021, 72% of Americans used social media. On a global scale, as of 2019, social media platforms were used by 1 in 3 people worldwide and by more than two-thirds of all internet users [<xref ref-type="bibr" rid="ref2">2</xref>]. With social media users rapidly increasing, user-generated content has grown exponentially. Generated by a broad swath of global citizens, the data provides insights into a wide array of human experiences, for example, the effects of the opioid crisis on physical, mental, and social well-being.</p>
      <p>Analysis of the opioid epidemic revealed the struggles of opioid victims, their families, and communities. This information adds value to health policy analysis.</p>
      <p>This study examines the use of YouTube, a unique social media platform. It is a diverse medium because its primary purpose is content sharing and educating its users about relevant topics [<xref ref-type="bibr" rid="ref3">3</xref>]. Users seek out content of interest, comment, and react. When examining health care crises and the relay of medical information, sentiment analysis may be used to gauge the response to the stimulus. By analyzing the sentiments expressed in comments, researchers can perform qualitative content analyses to explore how these reactions influence reputation, potentially affect individuals, and impact the communities involved. Furthermore, understanding the sentiment toward health-related content or issues like human suffering and societal concerns can aid policy makers in developing strategies to enhance public health.</p>
      <p>Sentiment analysis is the most popular artificial intelligence used to mine and examine all text types in various fields of study. Sentiment analysis is a computational method that extracts sentiment from a text. Some sentiment analysis methods use rule-based lexicons such as Linguistic Inquiry and Word Count (LIWC)-22 [<xref ref-type="bibr" rid="ref4">4</xref>]. Other sentiment analysis methods use traditional machine learning approaches such as Support Vector Machines [<xref ref-type="bibr" rid="ref5">5</xref>] and Naive Bayes classification [<xref ref-type="bibr" rid="ref6">6</xref>], while others use deep learning [<xref ref-type="bibr" rid="ref7">7</xref>]. Sometimes, a hybrid approach is used between 2 or more sentiment analysis methods [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
      <p>This study compares 3 popular sentiment analysis methods on social media data: Valence Aware Dictionary for Sentiment Reasoning (VADER), TEXT2DATA (T2D), and LIWC, and also uses the ChatGPT large language model (LLM) for sentiment analysis. A total of 2 methods (VADER and LIWC) were picked due to their previous validation and use in many published studies by scholars from different disciplines; other methods were chosen because of their user-friendly interface and no requirement of prerequisite programming skills (LIWC-22 and T2D). ChatGPT was chosen as LLMs represent a game-changing technological leap in natural language processing (NLP), including sentiment analysis.</p>
      <p>VADER [<xref ref-type="bibr" rid="ref9">9</xref>] is a rule and lexicon-based sentiment analysis tool that, when analyzing text, returns a numeric valence (polarity) score between –1 (extremely negative) and +1 (extremely positive). The VADER lexicon is primarily built using pre-existing lexicons, but these lexicons were extended to include emoticons, acronyms, and slang commonly used in social media. Each lexical feature is assigned a score; then, the score is shifted based on the presence of punctuation marks, capital letters, and negations. The lexical scores are then averaged. VADER has been validated on multiple data types, including product and movie reviews, and has been used as a quality benchmark in numerous studies [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. It has been used for sentiment analysis in a wide variety of areas, such as customer reviews and opinion mining [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>], political discourse analysis [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>], and mental health studies [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Although VADER has been used extensively, concerns about VADER’s sensitivity to text length have been expressed [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. For example, as each word in a text is assigned a polarity score, words with strong sentiment can unduly influence the overall sentiment of a short text. There are proposed methods to mitigate this phenomenon [<xref ref-type="bibr" rid="ref9">9</xref>], such as considering the text of an entire document about a sentence. Another possible issue is that VADER is primarily used as a Python-based program, and users must be moderately proficient in Python programming to use its full capabilities.</p>
      <p>T2D [<xref ref-type="bibr" rid="ref20">20</xref>] is a Microsoft Excel or Google Sheets add-in with a sentiment analysis application programming interface (API). It classifies text into 5 categories: very negative, negative, neutral, positive, and very positive. The T2D website contains scant information about the corpus or the methodology used for sentiment classification. The website states that the API is “based on an NLP engine” and that the “system also contains specially prepared classification models for Twitter (rebranded as X) and other social media content, trained on billions of manually verified entries” [<xref ref-type="bibr" rid="ref20">20</xref>]. It is a paid subscription model, but users can make 1000 API calls a month for free. The next tier is 10,000 API calls for US $27 a month, then a variety of tiers exist, with the highest tier being Enterprise at US $351 a month and Unlimited API calls. As a Microsoft Excel add-in and a user-friendly tool for those familiar with Microsoft Excel, T2D has been applied to studies that examined tourist experiences [<xref ref-type="bibr" rid="ref21">21</xref>], compared Twitter comments with polling results [<xref ref-type="bibr" rid="ref22">22</xref>], and analyzed digital mental health interventions [<xref ref-type="bibr" rid="ref23">23</xref>].</p>
      <p>LIWC-22 is a dictionary-based sentiment analysis tool with a representative score in over 100 categories. The score in each category indicates the percentage of words in the text corresponding to the particular category and, therefore, can range from 0% to 100%. Many LIWC-22 categories are organized in a hierarchical structure. The same word may be categorized into multiple categories. For instance, “celebrate” is in both the positive emotion and achievement categories. Each category is represented internally in LIWC-22 by a dictionary with words and emoticons related to that category. A complete list of LIWC-22 categories can be found in [<xref ref-type="bibr" rid="ref24">24</xref>]. The creators of LIWC-22 selected a vast pool of words that represented a wide range of linguistic categories and psychological dimensions, such as emotions, cognitive processes, social terms, pronouns, prepositions, and other linguistic constructs. LIWC-22 is the fifth iteration of LIWC (LIWC 2001, 2007, 2015, and 2019). To create LIWC, experts in linguistics and psychology first gathered a large corpus of texts and tagged each word into a linguistic category. The resulting dictionaries underwent validation studies and then passed through a refinement phase when researchers added new vocabulary and, as meanings shifted, modified existing word-category pairings. LIWC-22 is a subscription model. Currently, users pay US $129.95 for a 3-year subscription, and shorter-term subscriptions are offered at lower prices. LIWC-22 has both a web-based app and an app for download. The users upload their file (JSON, CSV, and EXCEL), choose the LIWC-22 dictionary they would like to use, and then select the label in their data that denotes the text to be classified. Users also have the option to choose all or specific LIWC-22 categories to be evaluated.</p>
      <p>Similar to VADER and T2D, LIWC-22 has been used as a sentiment analysis tool in many studies, such as education [<xref ref-type="bibr" rid="ref25">25</xref>], public discourse analysis [<xref ref-type="bibr" rid="ref26">26</xref>], and brand perception [<xref ref-type="bibr" rid="ref27">27</xref>] among others. Many categories in LIWC-22 could represent sentiment, including positive emotion, negative emotion, anger, sadness, and anxiety. LIWC-22 includes positive tone and negative tone dimensions but also includes another Tone variable that merges the 2 dimensions into a single summary variable. The higher the number, the more positive the tone; numbers below 50 are counted as negative tone. This paper uses the LIWC-22 tone composite (summary) measure as its dictionary was more extensive than other sentiment-based LIWC-22 categories. We will refer to it as LIWC tone.</p>
      <p>Previous efforts to evaluate sentiment analysis tools did not compare LIWC tone, VADER, and T2D. For example, Boukes et al [<xref ref-type="bibr" rid="ref28">28</xref>] compared LIWC, SentiStrength, Pattern, Polyglot, and DANEW on economic news in Danish. These sentiment analysis tools were chosen as they were off the shelf and supported the Danish language. Hartmann et al [<xref ref-type="bibr" rid="ref29">29</xref>] compared LIWC-22 with a host of machine learning-based sentiment analysis methods (Support Vector Machines, Neural Networks, K Nearest Neighbors, and Random Forests) but did not include VADER or T2D and focused on the marketing aspect of social media data.</p>
      <p>Recent advances in NLP have led to the development of powerful language models, such as the GPT series, including ChatGPT (GPT–3.5 and GPT–4) [<xref ref-type="bibr" rid="ref30">30</xref>]. These models, pretrained on vast amounts of text data, have demonstrated strong performance across tasks like language translation, text summarization, and question-answering [<xref ref-type="bibr" rid="ref31">31</xref>]. The LLM ChatGPT, in particular, has shown promise in education, health care, reasoning, text generation, human-machine interaction, and scientific research [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. Despite these opportunities, challenges and ethical concerns remain, particularly regarding accuracy. The accuracy of these models depends heavily on the quality, diversity, and complexity of the training data, as well as the quality of user input [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. Previous research has highlighted the importance of developing higher-order thinking skills in education, but NLP systems may struggle with the nuances of human language, leading to potential errors [<xref ref-type="bibr" rid="ref30">30</xref>-<xref ref-type="bibr" rid="ref32">32</xref>].</p>
      <p>We compare the sentiment scores produced by VADER, T2D, LIWC tone, and ChatGPT 4.0 to manually created codes. The data source is YouTube comments on videos that discussed the opioid epidemic. This data source was chosen as it was easily accessible to the authors, but also because each of the sentiment analysis classifiers mentioned has been heavily used to classify social media data in the past. As social media (and the data it creates) grows, researchers will likely analyze it using the sentiment tools discussed in this paper.</p>
      <p>In the remainder of the paper, we discuss the methods by which data was collected and coded and will then compare the results given by VADER, T2D, LIWC tone, and ChatGPT 4.0 to human coding. This paper reports which methods give accurate sentiment scores and discusses other practical considerations when using these tools. We will end with a discussion of the results and considerations for future research.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Sources</title>
        <p>To evaluate sentiment analysis methods, we used secondary data from a qualitative study about the opioid crisis in the United States [<xref ref-type="bibr" rid="ref33">33</xref>], which included manually coded sentiment from YouTube comments. To collect YouTube comments (N=8761), the term “opioid epidemic” was searched on YouTube with a date range between January 1, 2017, and December 31, 2018. The majority of the videos collected were located on CNN’s (Cable News Network) YouTube channel and the Fox News YouTube Channel. Subsequently, videos were ranked by the number of views, and the 20 most watched videos by CNN (10 newscasts) and Fox News (10 newscasts) were kept for further analysis. As Google Trends indicates (<xref rid="figure1" ref-type="fig">Figure 1</xref>), the chosen dates coincide with a particularly high interest in the opioid epidemic. The comments for each video were downloaded using the Netlytic website. Comments were deidentified by deleting email addresses and assigning codes such as a1, a2, and so on, to track comments on comments. All other comment information was left unaltered.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Google Trends results for the search term “opioid epidemic.” The search was constrained to dates between February 2015 and December 2019. The y-axis indicates percent popularity.</p>
          </caption>
          <graphic xlink:href="formative_v9i1e57395_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Sentiment Measures</title>
        <p>An inductive, iterative approach to content analysis of the data was used to obtain manual sentiment codes. Bacon [<xref ref-type="bibr" rid="ref33">33</xref>] created a codebook to determine the sentiment of comments as positive, neutral, or negative based on the commenter’s attitude toward the video or another comment. The comments were sorted by time of post and video, then coded in the context of the broader discussion. Video transcripts also informed the coding. A subset of comments was analyzed by a second coder, producing good agreement between the 2 coders’ judgments (κ=0.734; 95% CI 0.57-0.89; <italic>P</italic>&lt;.001).</p>
        <p>To begin testing automated sentiment tools, comments with manual codes indicating neutral or unclear sentiment were excluded from the data. The remaining comments were scored using VADER, T2D’s Google Sheets add-in, LIWC, and ChatGPT 4.0. Specifically, the VADER compound score, T2D sentiment score, and LIWC-22 tone score (LIWC tone), a composite measure for positive and negative tone dimensions, were used. ChatGPT 4.0, VADER, and T2D were centered around 0 and ranged from –1 to 1. Negative values indicated a negative sentiment and positive values showed a positive sentiment. LIWC tone scores ranged from 1 to 100, representing percentiles based on standardized scores from large comparison corpora [<xref ref-type="bibr" rid="ref34">34</xref>]. They are calculated using a dictionary with words, word stems, phrases, and select emoticons built for text analysis. LIWC tone overall mean for 15 diverse corpora was 47.81 (SD 26.39). Bacon’s codebook was adapted to prompt the general-purpose LLM to generate positive and negative sentiment classification (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for the ChatGPT 4.0 prompt). The classification was generated on the third try after two failed attempts.</p>
      </sec>
      <sec>
        <title>Analyses</title>
        <p>We performed the receiver operating characteristic (ROC) curve analysis and computed descriptive statistics, confusion matrices, Cohen κ, accuracy, specificity, precision, sensitivity (recall), <italic>F</italic><sub>1</sub>-score harmonic mean, and the Matthews correlation coefficient (MCC). Relying solely on the ROC curve without considering precision and negative predictive value can lead to a misleading assessment of a model’s success [<xref ref-type="bibr" rid="ref35">35</xref>]. Although widely used in machine learning, the <italic>F</italic><sub>1</sub>-score also has limitations; it can vary when positive and negative classes are exchanged, potentially distorting its interpretation [<xref ref-type="bibr" rid="ref36">36</xref>]. In addition, the <italic>F</italic><sub>1</sub>-score does not account for correctly classified negative and positive samples, drawing criticism for diverging from more intuitive metrics like accuracy and losing significance when class labels are reversed [<xref ref-type="bibr" rid="ref36">36</xref>]. In contrast, MCC provides a more balanced evaluation, achieving its highest values of –1 or +1 only when the classifier performs well across all 4 key rates of the confusion matrix: sensitivity, specificity, precision, and negative predictive value [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>].</p>
        <p>Finally, to reveal misclassification patterns by model, we analyzed the content of the comments marked as false positives and false negatives from both NLP and LLM models and compared them with true positives and true negatives identified through manual coding. We summarized possible reasons behind misclassifications and provided representative comments as illustrations.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>Data were collected from a social media platform (YouTube) where data are publicly available. However, all sentiment evaluation methods were performed at a macro scale and not at the user level. In addition, social media profile information is not shared in the data provided in this article.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>Manual coding placed 63.2% of comments into either positive or negative categories; the remainder were neutral, or their sentiment could not be ascertained. Only positively or negatively classified comments were used for further analyses (N=5533). Positive comments were much less common (662/5533, 12%) than negative (4871/5533, 88%) comments.</p>
      <p>VADER, LIWC tone, and ChatGPT 4.0 were able to classify all comments. However, for unknown reasons, T2D sentiment scores could not be calculated for 514 comments, resulting in 9% of missing values. T2D is a “black box” system, and documentation has not been released. As shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>, LIWC tone analysis most closely matched the high prevalence of negative comments (88%) in manually coded data: 82% of LIWC tone scores fell below the mean of 47.81. ChatGPT 4.0 overestimated negative sentiment, classifying 98% of the comments as negative. In comparison, only 56% of VADER and 66% of T2D scores were negatively scored. VADER score distribution had a mode around 0 and was the most continuous compared with T2D and LIWC. VADER assigned near-0 scores to 15% of comments, T2D had 1% of near-0 data but did not score 9% of comments, and LIWC tone scores were very unevenly distributed with 3 modes (at scale’s endpoints and 23.23).</p>
      <fig id="figure2" position="float">
        <label>Figure 2</label>
        <caption>
          <p>A comparison of Valence Aware Dictionary for Sentiment Reasoning, TEXT2DATA, and Linguistic Inquiry and Word Count tone score distributions for 5533 comments classified as either negative or positive using manual coding. *LIWC tone scores below 47.81 are considered negative. T2D: TEXT2DATA.</p>
        </caption>
        <graphic xlink:href="formative_v9i1e57395_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <p>As an overall measure of discrimination, we used the ROC curve [<xref ref-type="bibr" rid="ref37">37</xref>]. Discrimination is the ability of a measure to discern between social media comments that are manually coded as negative or positive. We were interested in evaluating continuously measured VADER, T2D, and LIWC tone scores as predictors of positive and negative sentiment in YouTube comments; manual coding is an outcome against which the 3 models were assessed. ROC curve analysis was not performed for ChatGPT 4.0 because it was only generated as a binary measure. The area under the ROC curve is a measure of the overall discriminatory ability of the binomial logistic regression model, that is, the ability of a chosen sentiment scoring method to classify comments into the 2 groups of our dichotomous dependent variable, a manually coded sentiment where negative is assigned a value –1 and positive is given a value of 1. ROC curve analysis is most suitable for balanced analyses. While we are interested in correctly classifying positives and negatives, our dataset is unbalanced due to the high prevalence of negative comments.</p>
      <p>As shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>, ROC curves with red lines above the blue straight reference line indicate discrimination; the further above the reference line, the better. The area under the ROC curve is equivalent to the concordance probability [<xref ref-type="bibr" rid="ref38">38</xref>]. <xref rid="figure3" ref-type="fig">Figure 3</xref> shows an excellent level of discrimination, according to Hosmer et al [<xref ref-type="bibr" rid="ref39">39</xref>], for VADER’s ability to classify manually coded negative comments (the area under the ROC curve was 0.800, 95% CI 0.78-0.82), followed by T2D and LIWC tone. T2D and LIWC tone demonstrated acceptable discrimination, with areas under the curve of 0.770 and 0.747, respectively.</p>
      <p>LIWC tone, according to <xref rid="figure4" ref-type="fig">Figure 4</xref>, performs the same or worse across the entire range of true positive rates (sensitivity), except at the short stretch of higher sensitivity rates where it is superior to T2D. Overall, VADER performs better than the 2 other sentiment analysis systems. Next, we tested if VADER performed better for longer strings of text. <xref rid="figure4" ref-type="fig">Figure 4</xref> compares VADER results for all comments, regardless of length, and for longer comments (&gt;100 characters and &gt;200 characters long). VADER indeed performed better when short comments were excluded. Confusion matrices are given in <xref ref-type="table" rid="table1">Tables 1</xref>-<xref ref-type="table" rid="table4">4</xref>.</p>
      <p><xref ref-type="table" rid="table5">Table 5</xref> shows 7 measures of model performance compared with manual coding: Cohen κ, accuracy, specificity, precision, sensitivity (recall), <italic>F</italic><sub>1</sub>-score, and MCC.</p>
      <fig id="figure3" position="float">
        <label>Figure 3</label>
        <caption>
          <p>Receiver operating characteristic analyses: a comparison of Valence Aware Dictionary for Sentiment Reasoning, TEXT2DATA, and Linguistic Inquiry and Word Count tone scores’ ability to classify comments that were manually coded as either negative (–1) or positive (1).</p>
        </caption>
        <graphic xlink:href="formative_v9i1e57395_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <fig id="figure4" position="float">
        <label>Figure 4</label>
        <caption>
          <p>Receiver operating characteristic curve comparisons for Valence Aware Dictionary for Sentiment Reasoning, TEXT2DATA, and Linguistic Inquiry and Word Count Tone.</p>
        </caption>
        <graphic xlink:href="formative_v9i1e57395_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>A confusion matrix for binary coded Valence Aware Dictionary for Sentiment Reasoning: negative (–1) and positive (1) sentiments.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="260"/>
          <col width="300"/>
          <col width="280"/>
          <col width="0"/>
          <col width="160"/>
          <thead>
            <tr valign="top">
              <td>VADER<sup>a</sup></td>
              <td colspan="3">Manual coding, n</td>
              <td>Total, N</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>–1</td>
              <td>1</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>–1</td>
              <td>3693</td>
              <td>230</td>
              <td colspan="2">3923</td>
            </tr>
            <tr valign="top">
              <td>1</td>
              <td>1178</td>
              <td>432</td>
              <td colspan="2">1610</td>
            </tr>
            <tr valign="top">
              <td>Total</td>
              <td>4871</td>
              <td>662</td>
              <td colspan="2">5533</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table1fn1">
            <p><sup>a</sup>VADER: Valence Aware Dictionary for Sentiment Reasoning.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table2">
        <label>Table 2</label>
        <caption>
          <p>A confusion matrix for binary coded T2D: negative (–1) and positive (1) sentiment.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="260"/>
          <col width="300"/>
          <col width="280"/>
          <col width="160"/>
          <thead>
            <tr valign="top">
              <td>T2D<sup>a</sup></td>
              <td colspan="2">Manual coding, n</td>
              <td>Total, n</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>–1</td>
              <td>1</td>
              <td>
                <break/>
              </td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>–1.00</td>
              <td>3218</td>
              <td>145</td>
              <td>3363</td>
            </tr>
            <tr valign="top">
              <td>1.00</td>
              <td>1277</td>
              <td>379</td>
              <td>1656</td>
            </tr>
            <tr valign="top">
              <td>Total</td>
              <td>4495</td>
              <td>524</td>
              <td>5019</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table2fn1">
            <p><sup>a</sup>T2D: TEXT2DATA.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table3">
        <label>Table 3</label>
        <caption>
          <p>A confusion matrix for binary coded LIWC_tone: negative (–1) and positive (1) sentiment.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="260"/>
          <col width="300"/>
          <col width="280"/>
          <col width="0"/>
          <col width="160"/>
          <thead>
            <tr valign="top">
              <td>LIWC_tone<sup>a</sup></td>
              <td colspan="3">Manual coding, n</td>
              <td>Total, n</td>
            </tr>
            <tr valign="top">
              <td> </td>
              <td>–1</td>
              <td>1</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>–1</td>
              <td>4142</td>
              <td>373</td>
              <td colspan="2">4515</td>
            </tr>
            <tr valign="top">
              <td>1</td>
              <td>729</td>
              <td>289</td>
              <td colspan="2">1018</td>
            </tr>
            <tr valign="top">
              <td>Total</td>
              <td>4871</td>
              <td>662</td>
              <td colspan="2">5533</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table3fn1">
            <p><sup>a</sup>LIWC: Linguistic Inquiry and Word Count.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table4">
        <label>Table 4</label>
        <caption>
          <p>A confusion matrix for binary coded ChatGPT 4.0: negative (–1) and positive (1) sentiments.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="260"/>
          <col width="300"/>
          <col width="280"/>
          <col width="0"/>
          <col width="160"/>
          <thead>
            <tr valign="top">
              <td>ChatGPT 4.0</td>
              <td colspan="3">Manual coding, n, negative (–1); positive (1)</td>
              <td>Total, n</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>–1</td>
              <td>1</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>–1</td>
              <td>4814</td>
              <td>610</td>
              <td colspan="2">5424</td>
            </tr>
            <tr valign="top">
              <td>1</td>
              <td>57</td>
              <td>52</td>
              <td colspan="2">109</td>
            </tr>
            <tr valign="top">
              <td>Total</td>
              <td>4871</td>
              <td>662</td>
              <td colspan="2">5533</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <table-wrap position="float" id="table5">
        <label>Table 5</label>
        <caption>
          <p>A comparison of Valence Aware Dictionary for Sentiment Reasoning, TEXT2DATA, Linguistic Inquiry and Word Count tone, and ChatGPT 4.0 against manual coding.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="140"/>
          <col width="270"/>
          <col width="110"/>
          <col width="110"/>
          <col width="100"/>
          <col width="100"/>
          <col width="100"/>
          <col width="70"/>
          <thead>
            <tr valign="top">
              <td>Model</td>
              <td>Cohen κ (95% CI; <italic>P</italic> value)</td>
              <td>Accuracy</td>
              <td>Specificity</td>
              <td>Precision</td>
              <td>Sensitivity (recall)</td>
              <td><italic>F</italic><sub>1</sub>-score</td>
              <td>MCC<sup>a</sup></td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>VADER<sup>b</sup></td>
              <td>0.254 (95% CI 0.23-0.28; &lt;.001)</td>
              <td>0.74</td>
              <td>0.94</td>
              <td>0.26</td>
              <td>0.65</td>
              <td>0.38</td>
              <td>0.29</td>
            </tr>
            <tr valign="top">
              <td>T2D<sup>c</sup></td>
              <td>0.225 (95% CI 0.20-0.25; &lt;.001)</td>
              <td>0.72</td>
              <td>0.96</td>
              <td>0.23</td>
              <td>0.72</td>
              <td>0.35</td>
              <td>0.30</td>
            </tr>
            <tr valign="top">
              <td>LIWC<sup>d</sup>_tone</td>
              <td>0.233 (95% CI 0.20-0.26; &lt;.001)</td>
              <td>0.80</td>
              <td>0.92</td>
              <td>0.28</td>
              <td>0.47</td>
              <td>0.34</td>
              <td>0.24</td>
            </tr>
            <tr valign="top">
              <td>ChatGPT 4.0</td>
              <td>0.105 (95% CI 0.1-0.31; &lt;.001)</td>
              <td>0.88</td>
              <td>0.88</td>
              <td>0.50</td>
              <td>0.08</td>
              <td>0.13</td>
              <td>0.16</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table5fn1">
            <p><sup>a</sup>MCC: Matthews correlation coefficient.</p>
          </fn>
          <fn id="table5fn2">
            <p><sup>b</sup>VADER: Valence Aware Dictionary for Sentiment Reasoning.</p>
          </fn>
          <fn id="table5fn3">
            <p><sup>c</sup>T2D: TEXT2DATA.</p>
          </fn>
          <fn id="table5fn4">
            <p><sup>d</sup>LIWC: Linguistic Inquiry and Word Count.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <p>First, Cohen κ is used to examine the classification of YouTube comments into positive and negative sentiments. κ values show fair agreement of manual coding with VADER (κ=0.254, 95% CI 0.23-0.28; <italic>P</italic>&lt;.001), LIWC tone (κ=0.233, 95% CI 0.20-0.26; <italic>P</italic>&lt;.001), and T2D (κ=0.225, 95% CI 0.20-0.25; <italic>P</italic>&lt;.001). Overall, the NLP models demonstrated better agreement than the LLM: ChatGPT 4.0 (κ=0.105, 95% CI 0.1-0.31; <italic>P</italic>&lt;.001) showed especially poor agreement with manual coding [<xref ref-type="bibr" rid="ref40">40</xref>].</p>
      <p>Second, ChatGPT 4.0 achieved the highest accuracy score of 88%, followed by LIWC tone (80%), VADER (74%), and T2D (72%). Third, the proportion of accurately detected negatives or specificity varied from 96% for T2D to 94% for VADER, 92% for LIWC tone, and 88% for ChatGPT 4.0. Fourth, precision reveals how well a model accurately makes positive predictions [<xref ref-type="bibr" rid="ref41">41</xref>]. Precision values indicate that positive predictions were unlikely to be very accurate: ChatGPT 4.0 was at 50%, as compared with even lower values for LIWC tone (28%), VADER (26%), and T2D (23%).</p>
      <p>Fifth, sensitivity, recognized as recall, detects true positives within a confusion matrix, with T2D achieving 72%, VADER achieving 65%, LIWC tone reaching 47%, and ChatGPT 4.0 achieving only 8% [<xref ref-type="bibr" rid="ref41">41</xref>]. Sixth, the <italic>F</italic><sub>1</sub>-score is the harmonic mean of precision and sensitivity (recall), offering an evaluation of true positives and positive predictive values; it was below 50% for all models: VADER (at 0.38) was closely followed by T2D (0.35) and LIWC tone (0.34), whereas ChatGPT’s <italic>F</italic><sub>1</sub>-score stood out as the lowest (0.13). Finally, MCC was calculated by measuring the sensitivity, specificity, precision, and negative predictive value to evaluate the model performance [<xref ref-type="bibr" rid="ref36">36</xref>]. None of the MCCs reported in <xref ref-type="table" rid="table5">Table 5</xref> reached a strong correlation level; they varied from 0.30 (T2D) to 0.29 (VADER), 0.24 (LIWC tone), and the lowest value of 0.16 for ChatGPT 4.0.</p>
      <p>Large discrepancies observed between the automated classifications and human coding may be explained by specific linguistic features, contextual nuances, model limitations, and other factors. To uncover any underlying patterns contributing to misclassifications and determine whether specific errors are more prevalent in 1 model over another, we analyzed comments classified as false positives and false negatives by LLM and NLP models.</p>
      <p><xref ref-type="table" rid="table6">Table 6</xref> shows how sentiment misclassification varied by model. None of the models stood out as being excellent at avoiding both false positives and false negatives; however, ChatGPT 4.0 performed especially poorly. It classified even the most obviously positive comments as negative sentiment, leading to a very high rate (92%) of false negatives.</p>
      <p>An analysis of false positives did not produce model-specific patterns. Sarcastic comments expressing feigned empathy for drug users were frequently misidentified by all models as false positives, for example, when the commenters talked about people dying but also added phrases such as “<italic>White addiction, i truly love it. Now lets see what happens. Lmbao!</italic>” [comment ID: 756]. Other common false positives involved discussions about marijuana as a gateway drug, criticisms of both Democrats and Republicans, racist remarks, and the attribution of blame to drug users.</p>
      <p>Across all 4 models, discussions about the legalization of marijuana were misclassified as false negatives, as illustrated by this comment,</p>
      <disp-quote>
        <p>The opioid crisis was huge here in Florida. Look at what happened to it when they legalized marijuana for medical purposes, opioid deaths dropped! Look at the states that have it as recreational, opioid use and crime drop 20% in just the first year!</p>
        <attrib>comment ID: 3795</attrib>
      </disp-quote>
      <p>Our examination of false negatives also revealed model-specific, thematic patterns. VADER tended to misclassify political posts and comments on drug legalization as false negatives. T2D had difficulties with mentions of safe injection sites and support for Donald Trump. LIWC tone was prone to misclassifying short comments and those with emojis, as well as references to kratom, cannabidiol (CBD) oil, and marijuana.</p>
      <p>Finally, <xref ref-type="table" rid="table7">Table 7</xref> summarizes the observed differences between sentiment analysis methods applied to our unbalanced social media dataset.</p>
      <p>As shown in <xref ref-type="table" rid="table7">Table 7</xref>, T2D operates as a black box with minimal technical documentation, posing challenges for academic analysis. Unlike the other methods, VADER requires a moderate level of programming skill. All methods have low or no monetary cost (VADER is free) except for T2D, which charges per transaction, potentially making it cost-prohibitive for large datasets. Based on our findings, <xref ref-type="table" rid="table7">Table 7</xref> provides other important considerations when selecting a model.</p>
      <table-wrap position="float" id="table6">
        <label>Table 6</label>
        <caption>
          <p>Misclassified comments by model: false positives, false negatives, and representative comments.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="180"/>
          <col width="140"/>
          <col width="140"/>
          <col width="540"/>
          <thead>
            <tr valign="top">
              <td>Method, compared with manual coding</td>
              <td>False positives, %</td>
              <td>False negatives, %</td>
              <td>Representative comments</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>VADER<sup>a</sup></td>
              <td>30.02</td>
              <td>34.74</td>
              <td>FP<sup>b</sup>: <italic>Doctor’s have become legalized dope dealers, they tried to get my 17 nephew to take opioids. They gave him a 2 month supply. Luckily he didn’t take it because he’d seen what it can do. He’s still playing football no thanks to the doc and big pharma</italic> [comment ID: 70]; FN<sup>c</sup>: <italic>Yes! It [kratom] got me off pain meds... pain medication</italic> [comment ID: 2470]</td>
            </tr>
            <tr valign="top">
              <td>T2D<sup>d</sup></td>
              <td>28.40</td>
              <td>27.67</td>
              <td>FP: <italic>Big Phama! Big Insurance! Doctors get a cut for each pill script filled! We’re worth more dead, than alive!</italic> [comment ID: 71]; FN: <italic>Wish this war on opioids started earlier so many people gone. Very grateful for President Trump</italic> [comment ID: 1483]</td>
            </tr>
            <tr valign="top">
              <td>LIWC<sup>e</sup>_tone</td>
              <td>14.97</td>
              <td>56.34</td>
              <td>FP: <italic>Sue the opioid companies? What like the CIA<sup>f</sup>? LMFAO #CNNISFAKENEWS</italic> [comment ID: 5872]; FN: <italic>Kratom is the best maintenance program you could ever discover. Was on oxy for years, then methadone for years, then I found Kratom. It helps with pain, cravings, and has a bonus effect of reducing the craving for alcohol...</italic> [comment ID: 2478]</td>
            </tr>
            <tr valign="top">
              <td>ChatGPT 4.0</td>
              <td>1.17</td>
              <td>92.14</td>
              <td>FP: <italic>This isn’t helping anyone. The medical industry is already terrified to prescribe these drugs, you people who don’t take opioids have no idea of what damage you are doing to your medical system until you need to use it…</italic> [comment ID: 909]; FN: <italic>I have used MJ In the past to get off of Opioids and Xanax and Vicodin.. Yes, it does work and it also helps with Pain very well. It should be LEGAL even for NonMedical Use! It helps MANY problems and Various Ailments!</italic> [comment ID: 3071]</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table6fn1">
            <p><sup>a</sup>VADER: Valence Aware Dictionary for Sentiment Reasoning.</p>
          </fn>
          <fn id="table6fn2">
            <p><sup>b</sup>FP: false positive.</p>
          </fn>
          <fn id="table6fn3">
            <p><sup>c</sup>FN: false negative.</p>
          </fn>
          <fn id="table6fn4">
            <p><sup>d</sup>T2D: TEXT2DATA.</p>
          </fn>
          <fn id="table6fn5">
            <p><sup>e</sup>LIWC: Linguistic Inquiry and Word Count.</p>
          </fn>
          <fn id="table6fn6">
            <p><sup>f</sup>CIA: Central Intelligence Agency.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table7">
        <label>Table 7</label>
        <caption>
          <p>Considerations for selecting sentiment analysis methods when using social media datasets that are unbalanced toward negatives.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="130"/>
          <col width="150"/>
          <col width="120"/>
          <col width="140"/>
          <col width="230"/>
          <col width="230"/>
          <thead>
            <tr valign="top">
              <td>Method</td>
              <td>Implementation type</td>
              <td>Programming skill</td>
              <td>Cost</td>
              <td>Additional cons</td>
              <td>Additional pros</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Manual coding</td>
              <td>Coding by a trained researcher</td>
              <td>None</td>
              <td>Time for creating codebook, manual coding</td>
              <td>Retraining for intercoder reliability</td>
              <td>Reaches good agreement when 2 humans code comments, accurate coding of ambiguous comments (sarcasm, etc)</td>
            </tr>
            <tr valign="top">
              <td>VADER<sup>a</sup></td>
              <td>Rule-based dictionary</td>
              <td>Moderate</td>
              <td>Free</td>
              <td>Not user-friendly for beginners, may code sarcastic comments as false positives.</td>
              <td>Low runtime, fair agreement with manual codes, excellent discrimination, and performance can be improved by excluding short comments (&lt;100 characters)</td>
            </tr>
            <tr valign="top">
              <td>T2D<sup>b</sup></td>
              <td>Black box</td>
              <td>None</td>
              <td>Paid API<sup>c</sup>: Free up to 1000 transactions, US $27/month for 10,000 transactions.</td>
              <td>Might not code all data, implementation is not defined (black box), may code sarcastic comments as false positives.</td>
              <td>Low learning curve, self-contained within the same spreadsheet, high runtime, fair agreement with manual codes, acceptable discrimination</td>
            </tr>
            <tr valign="top">
              <td>LIWC<sup>d</sup>_tone</td>
              <td>Rule-based dictionary across multiple LIWC dimensions</td>
              <td>None</td>
              <td>Academic license: US $55 (1-year license) or US $129 (3-year license)</td>
              <td>May misclassify sarcastic comments as false positives and short comments as false negatives.</td>
              <td>Includes a contextualizer that highlights words in reported dimensions, low learning curve, low runtime, accurate estimation of prevalence of negative versus positive sentiment, fair agreement with manual codes, acceptable discrimination</td>
            </tr>
            <tr valign="top">
              <td>ChatGPT 4.0</td>
              <td>Large language model</td>
              <td>None</td>
              <td>US $20/month</td>
              <td>Requires prompt design, might not be consistent between iterations, may not be responsive due to high API usage, poor agreement with manual codes, inaccurate estimation of prevalence of negative vs positive sentiment, low MCC<sup>e</sup>, may code sarcastic comments as false positives, hallucination possible.</td>
              <td>Low runtime through the OpenAI API, can provide reasoning for classification</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table7fn1">
            <p><sup>a</sup>VADER: Valence Aware Dictionary for Sentiment Reasoning.</p>
          </fn>
          <fn id="table7fn2">
            <p><sup>b</sup>T2D: TEXT2DATA.</p>
          </fn>
          <fn id="table7fn3">
            <p><sup>c</sup>API: application programming interface.</p>
          </fn>
          <fn id="table7fn4">
            <p><sup>d</sup>LIWC: Linguistic Inquiry and Word Count.</p>
          </fn>
          <fn id="table7fn5">
            <p><sup>e</sup>MCC: Matthews correlation coefficient.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This study involved a comparison of manual sentiment coding to 4 automated sentiment analysis methods, namely VADER, T2D, LIWC_tone, and ChatGPT 4.0. We aimed to assess the efficacy of these sentiment analysis techniques in categorizing comments as either positive or negative sentiment in YouTube comments.</p>
        <p>YouTube and other social media platforms are valuable repositories of comments and reviews on topics relevant to various organizations and stakeholders, such as businesses, public policy analysis, and politicians. Our corpus, comments on the US opioid crisis, was manually analyzed to reveal the struggles of opioid epidemic victims, their families, and communities, the issues of value to health policy analysis [<xref ref-type="bibr" rid="ref33">33</xref>]. Like many other datasets of interest to social media researchers, it was skewed toward negative sentiment (7:1) and contained relatively few positive comments. Social media discussions often lean in the negative direction, such as Facebook posts on vaccine hesitancy [<xref ref-type="bibr" rid="ref42">42</xref>] or long COVID discussions on Twitter (rebranded as X) [<xref ref-type="bibr" rid="ref43">43</xref>]. Moreover, identifying social media content with negative effects is particularly valuable because it is more likely to be shared than positive content [<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref45">45</xref>].</p>
        <p>Our analysis showed that accurate positive sentiment classification can be important and policy-relevant when applied to negatively skewed data. The discussion of the opioid epidemic was primarily about human suffering but it was not all negative. Some YouTube users, for example, praised kratom as a means of overcoming opioid addiction. A subset of comments highlighted positive experiences of overcoming addiction and mixed reactions to health policies, such as safe injection sites and marijuana legalization. Marijuana, for example, can be discussed as a gateway drug (negative sentiment) and also from a harm reduction perspective as a substitute for opioids and other harmful street drugs (positive sentiment). Classifying the data by sentiment enables researchers to explore diverse perspectives of the digital public, potentially leading to health policy insights. Given the importance of sentiment analysis in unbalanced datasets, this study offers valuable guidance for social media researchers on the pros and cons of several available methods.</p>
        <p>Overall, VADER performed best on the ROC curve analysis, demonstrating excellent discriminatory capabilities compared with LIWC tone (acceptable) and T2D (acceptable). VADER’s performance improved when short comments were excluded, a finding that verified VADER’s sensitivity to text length suggested by Nair et al [<xref ref-type="bibr" rid="ref18">18</xref>] and Tymann et al [<xref ref-type="bibr" rid="ref19">19</xref>]. However, the ROC analysis has limitations when applied to data with an unbalanced count of negative and positive comments. Also, it could not be computed for binary ChatGPT 4.0 data. LIWC tone performed better than other automated models when estimating the prevalence of negative and positive sentiment in our data.</p>
        <p>A comparison of Cohen κ values indicated that the NLP models (VADER, followed by LIWC and T2D) showed only fair agreement with manual coding, whereas ChatGPT 4.0 had poor agreement. While all models performed better than chance when predicting the dominant class, leading to higher precision, their level of agreement with manual coding is not exceptionally high. Moreover, variations in accuracy, specificity, precision, and sensitivity (recall), <italic>F</italic><sub>1</sub>-score, and MCC did not suggest a single superior model. All models evaluated had relatively low <italic>F</italic><sub>1</sub>-scores, which serve as overall prediction performance measures that combine precision and recall. <italic>F</italic><sub>1</sub>-score was below 50% for all automated models and especially low for ChatGPT 4.0. The same pattern was observed for MCC: none of the models resulted in a significant correlation (T2D’s value was the highest at 0.30), but ChatGPT 4.0 performed the worst. While we cannot endorse a particular best model, our analysis suggests caution when using ChatGPT 4.0 to classify sentiment in an unbalanced dataset.</p>
        <p>A total of 5 sets of instructions were uploaded to train ChatGPT 4.0 to conduct sentiment analysis, incorporating the codebook used by manual coders. The file upload process could have been more convenient, and 2 out of 3 attempts resulted in errors during file reading. LLM sentiment analysis was only accessible through a paid subscription, as the free version could not handle uploading and coding thousands of comments or formatting the results for further statistical analysis and comparison. Despite this, ChatGPT 4.0, like NLP models, seemed to rely on identifying words labeled as positive or negative in the codebook to classify comments, which might have led to misclassifications of sarcastic comments.</p>
        <p>ChatGPT 4.0 outperformed the NLP models as an LLM in only 2 measures: accuracy and precision. It required specific coding instructions expressed as prompts, which we derived from the manually created codebook. Even though the same codebook was used for manual coding and designing ChatGPT 4.0 prompts, their level of agreement could have been better. Considering generation failures, poor agreement with manual coding, and the need for a paid subscription, ChatGPT 4.0 may not be the model of choice for social media researchers looking to perform sentiment analysis on unbalanced datasets. Our comparison of ChatGPT 4.0 to 3 NLP models indicated that the general-purpose LLM has yet to surpass the performance of traditional NLP models, at least for unbalanced datasets with highly prevalent (7:1) negative comments.</p>
        <p>Across all models, false negatives were associated with discussions on the legalization of marijuana and the observed reduction in mortality in states with more permissive drug policies. The primary issue with NLP models may stem from their reliance on pre-existing dictionaries to classify sentiment in a way that is not target specific. They cannot interpret the nuances of certain words within specific contexts the way a human can. To mitigate this, LIWC-22 and similar NLP models may require the creation of tailored dictionaries to better grasp the particular meanings of words in relevant contexts. Even then, NLP models may never be able to differentiate between negative statements of fact and negative sentiment with a specific target.</p>
        <p>Not only NLP models but perhaps also the LLM tended to link any drug-related vocabulary with negative sentiment, failing to consider the context or nuance, particularly in discussions about overcoming the opioid crisis. This highlights a significant limitation of dictionary-based NLP models: their inability to accurately classify positive comments or recognize positive aspects of complex, contentious discussions compared with the accuracy of manual coding. Misclassification was observed for comments with sarcasm, leading models to mistake feigned empathy for genuine concern. Sentiment classification is complex and requires a deep understanding of the issues to interpret social media discourse accurately.</p>
        <p>Manual coding remains the most reliable method for detecting sentiments when analyzing complex topics on social media, especially for unbalanced datasets. In addition to being time-intensive, it has other limitations. According to Krippendorff [<xref ref-type="bibr" rid="ref46">46</xref>], texts often have multiple meanings. Manual coding is an interpretive process that may only sometimes match the commenter’s intent, even when there is a good interrater agreement. On the other hand, automated sentiment classification may not consistently align with human judgments due to sentiment’s inherently subjective and context-dependent nature, lowering reliability [<xref ref-type="bibr" rid="ref47">47</xref>]. Fair reliability can be considered appropriate in cases involving complex or subjective tasks, exploratory research, resource constraints, qualitative analysis, or research contexts where a high level of agreement is not a primary objective [<xref ref-type="bibr" rid="ref47">47</xref>]. Researchers must carefully weigh the trade-offs between achieving higher reliability and the practical constraints specific to their research.</p>
        <p>Future research should explore whether general-purpose and fine-tuned LLMs and NLP models demonstrate comparable discriminatory performance in social media samples where positive comments are more prevalent than negative ones. Researchers should also test different methods for prompting general-purpose LLMs to follow complex manual codebook instructions.</p>
      </sec>
      <sec>
        <title>Conclusions and Recommendations</title>
        <p>We offer suggestions for VADER, T2D, LIWC tone, and ChatGPT 4.0 applications in the semantic classification of social media, specific to an unbalanced dataset with a high prevalence of negative comments. None of the automated models emerged as a clear leader. With caution, we recommend a no-cost tool, VADER, due to its excellent discrimination, according to our ROC curve analysis, which improves when the comments are at least 100 characters long. VADER requires some programming skills and may underestimate the prevalence of negative comments in unbalanced datasets. LIWC tone may be useful for social media researchers studying negative emotions, public worries, or dissatisfaction when they need to accurately estimate the prevalence of positive versus negative comments in their datasets. Researchers using T2D must know that it may only score some data and, compared with other NLP methods, can be time-consuming and cost prohibitive. ChatGPT 4.0 did not demonstrate superior performance. While the use of general-purpose LLMs is promising, it remains to be determined how best to translate manual codebook instructions into prompts to achieve superior classification results.</p>
      </sec>
    </sec>
  </body>
  <back>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>A codebook with representative comments has been added through <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, as well as a copy of the ChatGPT 4.0 prompts used in sentiment analysis, and a list of videos that evoked YouTube comments.</p>
      </sec>
    </notes>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Representative codebook and the prompt used for ChatGPT sentiment analysis. A list of videos from which YouTube comments were extracted is also included.</p>
        <media xlink:href="formative_v9i1e57395_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 228 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">LIWC</term>
          <def>
            <p>Linguistic Inquiry and Word Count</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">MCC</term>
          <def>
            <p>Matthews correlation coefficient</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">ROC</term>
          <def>
            <p>receiver operating characteristic</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">T2D</term>
          <def>
            <p>TEXT2DATA</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">VADER</term>
          <def>
            <p>Valence Aware Dictionary and Sentiment Reasoning</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <source>Social Media Fact Sheet</source>
          <access-date>2024-10-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.pewresearch.org/internet/fact-sheet/social-media/">https://www.pewresearch.org/internet/fact-sheet/social-media/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ortiz-Ospina</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>The rise of social media</article-title>
          <source>Our World in Data</source>
          <access-date>2024-10-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ourworldindata.org/rise-of-social-media">https://ourworldindata.org/rise-of-social-media</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Malik</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Researching Youtube: Methods, Tools, and Analytics</article-title>
          <source>The Sage Handbook of Social Media Research Methods</source>
          <year>2022</year>
          <publisher-loc>Los Angeles, Washington DC</publisher-loc>
          <publisher-name>Sage Publications</publisher-name>
          <fpage>651</fpage>
          <lpage>663</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lyu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Detecting depression of Chinese microblog users via text analysis: Combining Linguistic Inquiry Word Count (LIWC) with culture and suicide related lexicons</article-title>
          <source>Front Psychiatry</source>
          <year>2023</year>
          <volume>14</volume>
          <fpage>1121583</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36846219"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fpsyt.2023.1121583</pub-id>
          <pub-id pub-id-type="medline">36846219</pub-id>
          <pub-id pub-id-type="pmcid">PMC9947407</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>JX</given-names>
            </name>
            <name name-style="western">
              <surname>Leu</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Holst</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Stock price movement prediction based on Stocktwits investor sentiment using FinBERT and ensemble SVM</article-title>
          <source>PeerJ Comput Sci</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e1403</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37346695"/>
          </comment>
          <pub-id pub-id-type="doi">10.7717/peerj-cs.1403</pub-id>
          <pub-id pub-id-type="medline">37346695</pub-id>
          <pub-id pub-id-type="pii">cs-1403</pub-id>
          <pub-id pub-id-type="pmcid">PMC10280432</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>E-commerce product review sentiment classification based on a naïve Bayes continuous learning framework</article-title>
          <source>Information Processing &amp; Management</source>
          <year>2020</year>
          <volume>57</volume>
          <issue>5</issue>
          <fpage>102221</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1016/j.ipm.2020.102221"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.ipm.2020.102221</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Subramanian</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Easwaramoorthy Sathiskumar</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Deepalakshmi</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Manikandan</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>A survey on hate speech detection and sentiment analysis using machine learning and deep learning models</article-title>
          <source>Alexandria Engineering Journal</source>
          <year>2023</year>
          <volume>80</volume>
          <fpage>110</fpage>
          <lpage>121</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1016/j.aej.2023.08.038"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.aej.2023.08.038</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Appel</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Chiclana</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Carter</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fujita</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>A hybrid approach to the sentiment analysis problem at the sentence level</article-title>
          <source>Knowledge-Based Systems</source>
          <year>2016</year>
          <volume>108</volume>
          <fpage>110</fpage>
          <lpage>124</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1016/j.knosys.2016.05.040"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.knosys.2016.05.040</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hutto</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gilbert</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>VADER: A parsimonious rule-based model for sentiment analysis of social media text</article-title>
          <source>ICWSM</source>
          <year>2014</year>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>216</fpage>
          <lpage>225</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1609/icwsm.v8i1.14550"/>
          </comment>
          <pub-id pub-id-type="doi">10.1609/icwsm.v8i1.14550</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alaei</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Becken</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Stantic</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Sentiment analysis in tourism: capitalizing on big data</article-title>
          <source>J Travel Res</source>
          <year>2019</year>
          <volume>58</volume>
          <fpage>175</fpage>
          <lpage>191</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1177/0047287517747753"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/0047287517747753</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ribeiro</surname>
              <given-names>FN</given-names>
            </name>
            <name name-style="western">
              <surname>Araújo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gonçalves</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>André Gonçalves</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Benevenuto</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>SentiBench - a benchmark comparison of state-of-the-practice sentiment analysis methods</article-title>
          <source>EPJ Data Sci</source>
          <year>2016</year>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>29</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1140/epjds/s13688-016-0085-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1140/epjds/s13688-016-0085-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Botchway</surname>
              <given-names>RK</given-names>
            </name>
            <name name-style="western">
              <surname>Jibril</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Kwarteng</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Chovancova</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Oplatková</surname>
              <given-names>ZK</given-names>
            </name>
          </person-group>
          <article-title>A review of social media posts from UniCredit bank in Europe: a sentiment analysis approach</article-title>
          <year>2019</year>
          <conf-name>Proceedings of the 3rd International Conference on Business and Information Management</conf-name>
          <conf-date>2020 July 17th-19th</conf-date>
          <conf-loc>Ireland</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3361785.3361814</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fasha</surname>
              <given-names>EF</given-names>
            </name>
            <name name-style="western">
              <surname>Keikhosrokiani</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Asl</surname>
              <given-names>MP</given-names>
            </name>
          </person-group>
          <article-title>Opinion mining using sentiment analysis: a case study of readers’ response on Long Litt Woon’s The Way Through the Woods in Goodreads</article-title>
          <source>Advances on Intelligent Informatics and Computing</source>
          <year>2021</year>
          <publisher-loc>Cham</publisher-loc>
          <publisher-name>Springer International Publishing</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bestvater</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Monroe</surname>
              <given-names>BL</given-names>
            </name>
          </person-group>
          <article-title>Sentiment is not stance: target-aware opinion classification for political text analysis</article-title>
          <source>Polit Anal</source>
          <year>2022</year>
          <volume>31</volume>
          <issue>2</issue>
          <fpage>235</fpage>
          <lpage>256</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1017/pan.2022.10"/>
          </comment>
          <pub-id pub-id-type="doi">10.1017/pan.2022.10</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nithyanand</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Schaffner</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Gill</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Online political discourse in the Trump era</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on November 17, 2017</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1711.05303"/>
          </comment>
          <pub-id pub-id-type="doi">10.5860/choice.45-0602</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Loveys</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Crutchley</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Wyatt</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Coppersmith</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Small but mighty: affective micropatterns for quantifying mental health from social media language</article-title>
          <year>2017</year>
          <conf-name>Proceedings of the Fourth Workshop on Computational Linguistics and Clinical Psychology - from Linguistic Signal to Clinical Reality</conf-name>
          <conf-date>2016 June 16</conf-date>
          <conf-loc>California, USA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.18653/v1/w17-3110"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/w17-3110</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Valdez</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ten Thij</surname>
              <given-names>Marijn</given-names>
            </name>
            <name name-style="western">
              <surname>Bathina</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Rutter</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Bollen</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Social media insights into US mental health during the COVID-19 pandemic: longitudinal analysis of Twitter data</article-title>
          <source>J Med Internet Res</source>
          <year>2020</year>
          <volume>22</volume>
          <issue>12</issue>
          <fpage>e21418</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2020/12/e21418/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/21418</pub-id>
          <pub-id pub-id-type="medline">33284783</pub-id>
          <pub-id pub-id-type="pii">v22i12e21418</pub-id>
          <pub-id pub-id-type="pmcid">PMC7744146</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nair</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Veena</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Vinayak</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Comparative study of Twitter sentiment on COVID-19 tweets</article-title>
          <year>2021</year>
          <conf-name>Proceedings of 5th International Conference on Computing Methodologies and Communication</conf-name>
          <conf-date>2021 April 08-10</conf-date>
          <conf-loc>Erode, India</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1109/ICCMC51019.2021.9418320"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/iccmc51019.2021.9418320</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tymann</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Lutz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Palsbröker</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Gips</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>GerVADER-A German Adaptation of the VADER Sentiment Analysis tool for Social Media Texts</article-title>
          <year>2021</year>
          <conf-name>Lernen, Wissen, Daten, Analysen</conf-name>
          <conf-date>2024 September 23-25</conf-date>
          <conf-loc>Germany</conf-loc>
          <fpage>178</fpage>
          <lpage>189</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
          <source>TEXT2DATA</source>
          <access-date>2024-10-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://text2data.com/sentiment-analysis-api">https://text2data.com/sentiment-analysis-api</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>The application and comparison of web services for sentiment analysis in tourism</article-title>
          <year>2015</year>
          <conf-name>Proceedings of the 12th International Conference on Service Systems and Service Management (ICSSSM)</conf-name>
          <conf-date>2015 June 22-24</conf-date>
          <conf-loc>Guangzhou</conf-loc>
          <fpage>1</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1109/ICSSSM.2015.7170341"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/icsssm.2015.7170341</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ramesh</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>The correlation between Twitter sentiments and polling results for the 2016 Presidential race</article-title>
          <source>South Carolina Junior Academy of Science</source>
          <year>2016</year>
          <access-date>2024-10-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://scholarexchange.furman.edu/scjas/2016/all/273">https://scholarexchange.furman.edu/scjas/2016/all/273</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Qi</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Miao</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>A comparison of web services for sentiment analysis in digital mental health interventions</article-title>
          <year>2022</year>
          <conf-name>Proceedings of International Conference on Human-Computer Interaction</conf-name>
          <conf-date>2022 June 26–July 1</conf-date>
          <conf-loc>Virtual Event</conf-loc>
          <publisher-name>Springer International Publishing</publisher-name>
          <fpage>389</fpage>
          <lpage>407</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-031-05061-9_28</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>Ashokkumar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Seraj</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pennebaker</surname>
              <given-names>JW</given-names>
            </name>
          </person-group>
          <article-title>The development and psychometric properties of LIWC-22</article-title>
          <source>The University of Texas at Austin</source>
          <year>2022</year>
          <access-date>2024-10-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.liwc.app">https://www.liwc.app</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sell</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Farreras</surname>
              <given-names>IG</given-names>
            </name>
          </person-group>
          <article-title>LIWC-ing at a century of introductory college textbooks: have the sentiments changed?</article-title>
          <source>Procedia Computer Science</source>
          <year>2017</year>
          <volume>118</volume>
          <fpage>108</fpage>
          <lpage>112</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1016/j.procs.2017.11.151"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.procs.2017.11.151</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Smirnova</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Laranetto</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kolenda</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Ideology through sentiment analysis: a changing perspective on Russia and Islam in NYT</article-title>
          <source>Discourse &amp; Communication</source>
          <year>2017</year>
          <volume>11</volume>
          <issue>3</issue>
          <fpage>296</fpage>
          <lpage>313</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1177/1750481317699347"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/1750481317699347</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Olagunju</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Oyebode</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Orji</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Exploring key issues affecting African mobile eCommerce applications using sentiment and thematic analysis</article-title>
          <source>IEEE Access</source>
          <year>2020</year>
          <volume>8</volume>
          <fpage>114475</fpage>
          <lpage>114486</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1109/access.2020.3000093"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/access.2020.3000093</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Boukes</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>van de Velde</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Araujo</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Vliegenthart</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>What’s the Tone? Easy doesn’t do it: analyzing performance and agreement between off-the-shelf sentiment analysis tools</article-title>
          <source>Commun Methods Meas</source>
          <year>2019</year>
          <volume>14</volume>
          <issue>2</issue>
          <fpage>83</fpage>
          <lpage>104</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1080/19312458.2019.1671966"/>
          </comment>
          <pub-id pub-id-type="doi">10.1080/19312458.2019.1671966</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hartmann</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Huppertz</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schamp</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Heitmann</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Comparing automated text classification methods</article-title>
          <source>Int J Res Mark</source>
          <year>2019</year>
          <volume>36</volume>
          <issue>1</issue>
          <fpage>20</fpage>
          <lpage>38</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1016/j.ijresmar.2018.09.009"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.ijresmar.2018.09.009</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Qiang</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ge</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Summary of ChatGPT-related research and perspective towards the future of large language models</article-title>
          <source>Meta-Radiology</source>
          <year>2023</year>
          <volume>1</volume>
          <issue>2</issue>
          <fpage>100017</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1016/j.metrad.2023.100017"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.metrad.2023.100017</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Baskara</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Mukarto</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Exploring the implications of ChatGPT for language learning in higher education</article-title>
          <source>IJELTAL</source>
          <year>2023</year>
          <volume>7</volume>
          <fpage>343</fpage>
          <lpage>358</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.21093/ijeltal.v7i2.1387"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fuchs</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Exploring the opportunities and challenges of NLP models in higher education: is Chat GPT a blessing or a curse?</article-title>
          <source>Front Educ</source>
          <year>2023</year>
          <volume>8</volume>
          <fpage>1166682</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.3389/feduc.2023.1166682"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/feduc.2023.1166682</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bacon</surname>
              <given-names>LL</given-names>
            </name>
          </person-group>
          <source>Opioid epidemic: CNN and Fox News Youtube viewers' stories, claims, and solutions</source>
          <access-date>2024-10-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.proquest.com/openview/af08077924c109cc16ab7be4b4c42e5b/1.pdf?pq-origsite=gscholar&amp;cbl=2026366&amp;diss=y">https://tinyurl.com/yemxye4f</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cohn</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Mehl</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Pennebaker</surname>
              <given-names>JW</given-names>
            </name>
          </person-group>
          <article-title>Linguistic markers of psychological change surrounding September 11, 2001</article-title>
          <source>Psychol Sci</source>
          <year>2004</year>
          <volume>15</volume>
          <issue>10</issue>
          <fpage>687</fpage>
          <lpage>693</lpage>
          <pub-id pub-id-type="doi">10.1111/j.0956-7976.2004.00741.x</pub-id>
          <pub-id pub-id-type="medline">15447640</pub-id>
          <pub-id pub-id-type="pii">PSCI741</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chicco</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Jurman</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>The Matthews correlation coefficient (MCC) should replace the ROC AUC as the standard metric for assessing binary classification</article-title>
          <source>BioData Min</source>
          <year>2023</year>
          <volume>16</volume>
          <issue>1</issue>
          <fpage>4</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://biodatamining.biomedcentral.com/articles/10.1186/s13040-023-00322-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13040-023-00322-4</pub-id>
          <pub-id pub-id-type="medline">36800973</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13040-023-00322-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC9938573</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chicco</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Jurman</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>The advantages of the Matthews correlation coefficient (MCC) over F1 score and accuracy in binary classification evaluation</article-title>
          <source>BMC Genomics</source>
          <year>2020</year>
          <volume>21</volume>
          <issue>1</issue>
          <fpage>6</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-019-6413-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12864-019-6413-7</pub-id>
          <pub-id pub-id-type="medline">31898477</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12864-019-6413-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC6941312</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hilbe</surname>
              <given-names>JM</given-names>
            </name>
          </person-group>
          <source>Logistic Regression Models</source>
          <year>2009</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>Chapman and Hall/CRC</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gonen</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>Analyzing receiver operating characteristic curves with SAS</source>
          <year>2007</year>
          <publisher-loc>United States</publisher-loc>
          <publisher-name>SAS Institute</publisher-name>
          <fpage>362</fpage>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hosmer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Lemeshow</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sturdivant</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <source>Applied Logistic Regression</source>
          <year>2013</year>
          <publisher-loc>Chichester, UK</publisher-loc>
          <publisher-name>Wiley</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="web">
          <source>Laerd Statistics</source>
          <access-date>2024-10-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://statistics.laerd.com/premium/spss/ck/cohens-kappa-in-spss-7.php">https://statistics.laerd.com/premium/spss/ck/cohens-kappa-in-spss-7.php</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>DK</given-names>
            </name>
            <name name-style="western">
              <surname>Chatterjee</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kaur</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Vavilala</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Deep learning applications for disease diagnosis</article-title>
          <source>Deep Learning for Medical Applications with Unique Data</source>
          <year>2022</year>
          <volume>1</volume>
          <fpage>31</fpage>
          <lpage>51</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1016/b978-0-12-824145-5.00005-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/b978-0-12-824145-5.00005-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Aleksandric</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Anderson</surname>
              <given-names>HI</given-names>
            </name>
            <name name-style="western">
              <surname>Melcher</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nilizadeh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Spanish Facebook posts as an indicator of COVID-19 vaccine hesitancy in Texas</article-title>
          <source>Vaccines (Basel)</source>
          <year>2022</year>
          <volume>10</volume>
          <issue>10</issue>
          <fpage>1713</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=vaccines10101713"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/vaccines10101713</pub-id>
          <pub-id pub-id-type="medline">36298580</pub-id>
          <pub-id pub-id-type="pii">vaccines10101713</pub-id>
          <pub-id pub-id-type="pmcid">PMC9609763</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chepo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Deom</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Khalid</surname>
              <given-names>AF</given-names>
            </name>
            <name name-style="western">
              <surname>Vindrola-Padros</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>#LongCOVID affects children too: a Twitter analysis of healthcare workers sentiment and discourse about Long COVID in children and young people in the UK</article-title>
          <source>medRxiv</source>
          <comment>Preprint posted online on July 25, 2022</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1101/2022.07.20.22277865"/>
          </comment>
          <pub-id pub-id-type="doi">10.1101/2022.07.20.22277865</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schöne</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Garcia</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Parkinson</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Goldenberg</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Negative expressions are shared more on Twitter for public figures than for ordinary users</article-title>
          <source>PNAS Nexus</source>
          <year>2023</year>
          <volume>2</volume>
          <issue>7</issue>
          <fpage>pgad219</fpage>
          <pub-id pub-id-type="doi">10.1093/pnasnexus/pgad219</pub-id>
          <pub-id pub-id-type="medline">37457891</pub-id>
          <pub-id pub-id-type="pii">pgad219</pub-id>
          <pub-id pub-id-type="pmcid">PMC10338895</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schöne</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Parkinson</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Goldenberg</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Negativity spreads more than positivity on Twitter after both positive and negative political situations</article-title>
          <source>Affect Sci</source>
          <year>2020</year>
          <volume>2</volume>
          <issue>4</issue>
          <fpage>379</fpage>
          <lpage>390</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1007/s42761-021-00057-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s42761-021-00057-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krippendorff</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Commentary: A dissenting view on so-called paradoxes of reliability coefficients</article-title>
          <source>Annals of the International Communication Association</source>
          <year>2016</year>
          <volume>36</volume>
          <issue>1</issue>
          <fpage>481</fpage>
          <lpage>499</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1080/23808985.2013.11679143"/>
          </comment>
          <pub-id pub-id-type="doi">10.1080/23808985.2013.11679143</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Provoost</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ruwaard</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>van Breda</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Riper</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Bosse</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Validating automated sentiment analysis of online cognitive behavioral therapy patient texts: an exploratory study</article-title>
          <source>Front Psychol</source>
          <year>2019</year>
          <volume>10</volume>
          <fpage>1065</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31156504"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fpsyg.2019.01065</pub-id>
          <pub-id pub-id-type="medline">31156504</pub-id>
          <pub-id pub-id-type="pmcid">PMC6530336</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
