<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e64723</article-id><article-id pub-id-type="doi">10.2196/64723</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Evaluating Large Language Models for Sentiment Analysis and Hesitancy Analysis on Vaccine Posts From Social Media: Qualitative Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Annan</surname><given-names>Augustine</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Eiden</surname><given-names>Amanda L</given-names></name><degrees>MBA, MPH, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Dong</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Du</surname><given-names>Jingcheng</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Rastegar-Mojarad</surname><given-names>Majid</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nomula</surname><given-names>Varun Kumar</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Xiaoyan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>IMO Health</institution><addr-line>Rosemont</addr-line><addr-line>IL</addr-line><country>United States</country></aff><aff id="aff2"><institution>Merck &#x0026; Co, Inc</institution><addr-line>126 East Lincoln Avenue</addr-line><addr-line>Rahway</addr-line><addr-line>NJ</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Zammit</surname><given-names>Alban</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Pang</surname><given-names>Yiran</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Amanda L Eiden, MBA, MPH, PhD, Merck &#x0026; Co, Inc, 126 East Lincoln Avenue, Rahway, NJ, 07065, United States, 1 267-305-0672; <email>amanda.eiden@merck.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date 
pub-type="epub"><day>15</day><month>10</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e64723</elocation-id><history><date date-type="received"><day>24</day><month>07</month><year>2024</year></date><date date-type="rev-recd"><day>19</day><month>12</month><year>2024</year></date><date date-type="accepted"><day>17</day><month>08</month><year>2025</year></date></history><copyright-statement>&#x00A9; Augustine Annan, Amanda L Eiden, Dong Wang, Jingcheng Du, Majid Rastegar-Mojarad, Varun Kumar Nomula, Xiaoyan Wang. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 15.10.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e64723"/><abstract><sec><title>Background</title><p>In the digital age, social media has become a crucial platform for public discourse on diverse health-related topics, including vaccines. Efficient sentiment analysis and hesitancy detection are essential for understanding public opinions and concerns. 
Large language models (LLMs) offer advanced capabilities for processing complex linguistic patterns, potentially providing valuable insights into vaccine-related discourse.</p></sec><sec><title>Objective</title><p>This study aims to evaluate the performance of various LLMs in sentiment analysis and hesitancy detection related to vaccine discussions on social media and identify the most efficient, accurate, and cost-effective model for detecting vaccine-related public sentiment and hesitancy trends.</p></sec><sec sec-type="methods"><title>Methods</title><p>We used several LLMs&#x2014;generative pretrained transformer (GPT-3.5), GPT-4, Claude-3 Sonnet, and Llama 2&#x2014;to process and classify complex linguistic data related to human papillomavirus; measles, mumps, and rubella; and vaccines overall from X (formerly known as Twitter), Reddit, and YouTube. The models were tested across different learning paradigms: zero-shot, 1-shot, and few-shot to determine their adaptability and learning efficiency with varying amounts of training data. We evaluated the models&#x2019; performance using accuracy, <italic>F</italic><sub>1</sub>-score, precision, and recall. In addition, we conducted a cost analysis focused on token usage to assess the computational efficiency of each approach.</p></sec><sec sec-type="results"><title>Results</title><p>GPT-4 (<italic>F</italic><sub>1</sub>-score=0.85 and accuracy=0.83) outperformed GPT-3.5, Llama 2, and Claude-3 Sonnet across various metrics, regardless of the sentiment type or learning paradigm. Few-shot learning did not significantly enhance performance compared with the zero-shot paradigm. Moreover, the increased computational costs and token usage associated with few-shot learning did not justify its application, given the marginal improvement in model performance. 
The analysis highlighted challenges in classifying neutral sentiments and convenience, correctly interpreting sarcasm, and accurately identifying indirect expressions of vaccine hesitancy, emphasizing the need for model refinement.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>GPT-4 emerged as the most accurate model, excelling in sentiment and hesitancy analysis. Performance differences between learning paradigms were minimal, making zero-shot learning preferable for its balance of accuracy and computational efficiency. However, the zero-shot GPT-4 model is not the most cost-effective compared with traditional machine learning. A hybrid approach, using LLMs for initial annotation and traditional models for training, could optimize cost and performance. Despite reliance on specific LLM versions and a limited focus on certain vaccine types and platforms, our findings underscore the capabilities and limitations of LLMs in vaccine sentiment and hesitancy analysis, highlighting the need for ongoing evaluation and adaptation in public health communication strategies.</p></sec></abstract><kwd-group><kwd>vaccine sentiment</kwd><kwd>vaccine hesitancy</kwd><kwd>large language models</kwd><kwd>GPT4</kwd><kwd>social media platforms</kwd><kwd>public health communication</kwd><kwd>LLMs</kwd><kwd>NLP</kwd><kwd>machine learning</kwd><kwd>artificial intelligence</kwd><kwd>language models</kwd><kwd>sentiment analysis</kwd><kwd>hesitancy analysis</kwd><kwd>vaccine posts</kwd><kwd>social media</kwd><kwd>vaccine</kwd><kwd>public opinion</kwd><kwd>vaccine-related</kwd><kwd>public sentiment</kwd><kwd>computational efficiency</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>In the era of digital communication, social media platforms have become central to the dissemination and exchange of public opinions on health-related topics, including vaccination. 
The vast and dynamic nature of these platforms offers a rich dataset for analyzing public sentiment and hesitancy toward vaccines, which is critical for developing effective health communication strategies. The advent of advanced artificial intelligence (AI) technologies, particularly large language models (LLMs) such as OpenAI&#x2019;s generative pretrained transformer (GPT-4) [<xref ref-type="bibr" rid="ref1">1</xref>] and GPT-3.5 [<xref ref-type="bibr" rid="ref2">2</xref>], Anthropic&#x2019;s Claude 3 Sonnet [<xref ref-type="bibr" rid="ref3">3</xref>], and Meta&#x2019;s Llama 2 [<xref ref-type="bibr" rid="ref4">4</xref>], represents a significant leap forward in our capacity to understand and interpret extensive amounts of unstructured text data [<xref ref-type="bibr" rid="ref5">5</xref>]. These models have achieved impressive results in a variety of applications; for example, they significantly aid in diagnostics and personalizing treatment approaches and prove successful in medical licensing examinations and specialized medical tasks [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref12">12</xref>], thereby showcasing the expansive range of their capabilities [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>The use of LLMs extends beyond traditional medical applications. These technologies are set to transform public health research by offering insights into public sentiments expressed on the web, particularly on complex topics such as vaccine hesitancy. Defined as reluctance or delay in accepting vaccines despite the availability of vaccination services [<xref ref-type="bibr" rid="ref16">16</xref>], vaccine hesitancy is a barrier to global health initiatives, especially in ongoing campaigns against diseases such as human papillomavirus (HPV) and measles, mumps, and rubella (MMR). 
Addressing this challenge requires a nuanced understanding of public opinions, concerns, and misinformation patterns, which LLMs are uniquely positioned to facilitate [<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>The rapid evolution of LLMs, from earlier models such as GPT-2 and BERT to more advanced iterations, has brought substantial improvements in language comprehension and task-solving abilities [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>]. Other organizations, including Anthropic with its Claude models [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref21">21</xref>] and Meta AI with the Llama series [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref22">22</xref>], have also contributed significantly to this field.</p><p>Despite these advancements, research comparing different LLMs and learning paradigms in public health sentiment analysis remains limited. This study aims to address this gap by evaluating the efficacy of various LLMs in analyzing sentiments and hesitancy related to HPV, MMR, and general vaccines across 3 social media platforms: X (formerly Twitter), Reddit, and YouTube.</p><p>In this work, we explored the adaptability and effectiveness of LLMs in processing health-related sentiments by using zero-shot, 1-shot, and few-shot learning paradigms. A statistical comparison of performance across these paradigms reveals significant insights into optimizing model efficiency and resource allocation for large-scale sentiment analysis. 
Specifically, our analysis delves into the practical implications of selecting learning paradigms based on their computational costs and accuracy in detecting nuanced expressions of vaccine sentiment and hesitancy.</p><p>By pinpointing the optimal combinations of LLMs and learning paradigms for robust sentiment and hesitancy analysis, our research directly informs the strategic deployment of AI in crafting targeted public health messaging. This endeavor not only enhances our understanding of public discourse around vaccines but also provides a foundation for addressing vaccine hesitancy more effectively, thereby contributing to the advancement of public health communication strategies.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data Collection and Annotation</title><p>We analyzed English language social media posts from X (formerly Twitter), Reddit, and YouTube, focusing on public sentiment toward general vaccines, HPV, and MMR vaccines. Data collection spanned from January 1, 2011, to October 31, 2021. We selected these platforms because they provide robust application programming interface (API) access and represent a significant share of vaccine-related discourse on the web. English language posts were prioritized due to their widespread usage in large markets and accessibility. The detailed data collection and annotation procedures follow the methodology described in a previous study [<xref ref-type="bibr" rid="ref23">23</xref>]. Data were retrieved using the APIs provided by the platforms, following their data privacy and ethical guidelines.</p><p>To ensure the relevance and quality of the data, we tailored our search queries for each platform, as variations in text format and query logic required customization. 
Inclusion keywords such as &#x201C;vaccine,&#x201D; &#x201C;immunization,&#x201D; &#x201C;HPV vaccine,&#x201D; and &#x201C;MMR vaccine&#x201D; were used, while exclusion keywords (eg, &#x201C;software updates&#x201D; or &#x201C;sports immunities&#x201D;) filtered out irrelevant content. Duplicate posts and spam were removed during preprocessing.</p><p>The data collection period includes the COVID-19 pandemic, a significant global event that likely influenced public sentiment and hesitancy discussions on vaccines. With the widespread availability of COVID-19 vaccines during this time, public discourse around vaccines may have become more polarized, amplifying safety concerns and misinformation. Although this study does not focus on COVID-19&#x2013;specific sentiments, the pandemic-era context may have shaped the overall tone and topics of vaccine-related discussions. Posts unrelated to COVID-19 vaccines were also included to capture general vaccine sentiment trends, ensuring that the analysis remained relevant to MMR, HPV, and general vaccines.</p><p>Our study involved a dual-layered annotation approach for 10,485 social media posts. Posts were first categorized for sentiment (positive, neutral, or negative) and then for vaccine hesitancy. The hesitancy annotation involved a 2-step process: initially determining whether a post was hesitant or nonhesitant, and for those identified as hesitant, further categorizing them based on the World Health Organization&#x2019;s (WHO) 3Cs model (confidence, complacency, and convenience) [<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>To ensure balance, we evenly annotated 1165 posts across 3 platforms (X, Reddit, and YouTube) and vaccine types (general, HPV, and MMR). The final dataset comprised 36% positive, 26% neutral, and 38% negative sentiments, with 39% of posts identified as hesitant. 
Among hesitant posts, 58% reflected confidence issues, 25% complacency, and 17% convenience concerns.</p><p>Posts were annotated for sentiment as positive (favorable opinions or experiences regarding vaccines), neutral (informational or mixed sentiment posts), or negative (adverse opinions or potentially deterring information). Hesitancy was annotated based on the WHO&#x2019;s 3Cs model: confidence (distrust in vaccine efficacy, safety, or delivery systems), complacency (low perceived risk of vaccine-preventable diseases), and convenience (barriers to vaccine accessibility or difficulties in obtaining vaccination) [<xref ref-type="bibr" rid="ref23">23</xref>].</p></sec><sec id="s2-2"><title>Few-Shot Learning and Evaluation</title><p>For evaluation, 7515 annotated posts were used to assess the LLMs&#x2019; performance. <xref ref-type="table" rid="table1">Table 1</xref> provides the distribution of these posts across sentiment and hesitancy categories, proportional to the overall annotated dataset. 
A detailed distribution of annotated posts in each sentiment and 3Cs construct for each platform and vaccine topic group is shown in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Distribution of evaluation posts by sentiment and hesitancy categories.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category</td><td align="left" valign="bottom">Count of posts</td><td align="left" valign="bottom">Percentage</td></tr></thead><tbody><tr><td align="left" valign="top">Sentiment</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Positive</td><td align="left" valign="top">2705</td><td align="left" valign="top">36</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Neutral</td><td align="left" valign="top">1954</td><td align="left" valign="top">26</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Negative</td><td align="left" valign="top">2856</td><td align="left" valign="top">38</td></tr><tr><td align="left" valign="top">Hesitancy</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nonhesitant</td><td align="left" valign="top">4584</td><td align="left" valign="top">61</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hesitant</td><td align="left" valign="top">2931</td><td align="left" valign="top">39</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;Confidence</td><td align="left" 
valign="top">1850</td><td align="left" valign="top">63% of hesitant</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;Complacency</td><td align="left" valign="top">1020</td><td align="left" valign="top">35% of hesitant</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;Convenience</td><td align="left" valign="top">720</td><td align="left" valign="top">25% of hesitant</td></tr></tbody></table></table-wrap><p>Few-shot learning experiments were conducted separately for each vaccine type and each platform. For each experiment, posts were selected using a balanced sampling approach to ensure balanced representation across sentiment categories (positive, neutral, and negative). For each configuration (eg, 5-shot), we sampled an equal number of examples per sentiment category. Specifically, for 5-shot learning, we included 5 examples from each sentiment category (positive, neutral, and negative) per vaccine type per platform. This resulted in a total of 15 examples per vaccine type per platform for 5-shot learning. The same sampling approach was applied to other configurations, ensuring consistency and diversity in training examples across platforms and vaccine topics.</p><p>A similar methodology was used for hesitancy analysis. 
Posts were sampled to represent hesitant and nonhesitant categories as well as the WHO&#x2019;s 3Cs constructs (confidence, complacency, and convenience), ensuring balanced representation across platforms and vaccine types.</p><p>To maintain consistency and enable fair comparisons, the same few-shot examples were reused across experiments for a given configuration, minimizing variability in training data and ensuring a controlled evaluation of model performance.</p><sec id="s2-2-1"><title>Sentiment Annotation</title><p>Sentiment annotation was a multiclass classification task, assigning 1 of 3 sentiment labels to each post:</p><list list-type="bullet"><list-item><p>Positive: Posts that mention, report, or share positive opinions, news, or experiences about vaccines or vaccination.</p><list list-type="bullet"><list-item><p><italic>Example</italic>: &#x201C;Getting vaccinated is a vital step to safeguard your health and future.&#x201D;</p></list-item></list></list-item><list-item><p>Neutral: Posts that relate to vaccines or vaccination topics but contain no clear sentiment, express mixed sentiments, or do not explicitly advocate for or against vaccination.</p><list list-type="bullet"><list-item><p><italic>Example</italic>: &#x201C;HPV has multiple strains, and vaccines cover some of them but not all.&#x201D;</p></list-item></list></list-item><list-item><p>Negative: Posts that mention, report, or share negative opinions, news, or experiences about vaccines or vaccination, which may discourage vaccination.</p><list list-type="bullet"><list-item><p><italic>Example</italic>: &#x201C;The government is pushing vaccines despite the growing number of adverse reactions.&#x201D;</p></list-item></list></list-item></list></sec><sec id="s2-2-2"><title>3Cs Vaccine Hesitancy Annotation</title><p>The annotation of vaccine hesitancy was based on the WHO&#x2019;s 3Cs model, with each construct evaluated independently. 
Posts labeled as &#x201C;Lack of confidence,&#x201D; &#x201C;Complacent,&#x201D; or &#x201C;Inconvenient&#x201D; were considered hesitant, while posts without any of these constructs were labeled as nonhesitant. Definitions and an example for each construct are as follows:</p><list list-type="bullet"><list-item><p>Lack of confidence: Posts reflecting mistrust in vaccine efficacy, safety, vaccine delivery system, or motivations of policy makers.</p><list list-type="bullet"><list-item><p><italic>Example</italic>: &#x201C;Vaccinated individuals are still catching the disease, so how effective are these shots really?&#x201D;</p></list-item></list></list-item><list-item><p>Complacency: Posts where the perceived risks of vaccine-preventable diseases are low and vaccination is deemed unnecessary.</p><list list-type="bullet"><list-item><p><italic>Example</italic>: &#x201C;I never got vaccinated as a kid and I turned out to be fine, so vaccines aren&#x2019;t critical or essential.&#x201D;</p></list-item></list></list-item><list-item><p>Inconvenience: Posts highlighting physical, geographical, financial, or systemic barriers to vaccination, including issues related to health literacy and service accessibility.</p><list list-type="bullet"><list-item><p><italic>Example</italic>: &#x201C;I wanted to get the vaccine, but the nearest clinic is three hours away, and I cannot take time off work.&#x201D;</p></list-item></list></list-item></list><p>The annotation was performed by trained annotators with a medical background. All annotators underwent training with a guideline developed for this study. Annotators achieved a high interannotator agreement score (Cohen &#x03BA;=0.93), indicating strong reliability. Discrepancies were resolved through discussion and consensus. 
Examples of annotated posts and the full annotation framework are detailed in a previous paper [<xref ref-type="bibr" rid="ref23">23</xref>].</p></sec></sec><sec id="s2-3"><title>Implementation of LLMs</title><p>In this study, we leveraged 4 advanced LLMs: OpenAI&#x2019;s GPT-3.5 and GPT-4, Anthropic&#x2019;s Claude 3 Sonnet, and Meta&#x2019;s Llama 2. These models were chosen for their effectiveness in natural language processing (NLP) tasks such as text generation, sentiment analysis, and question answering, making them ideal for analyzing sentiments and hesitancy in social media discourse about vaccination.</p><p>GPT-3.5, with its 175 billion parameters, offers substantial text generation and understanding capabilities [<xref ref-type="bibr" rid="ref25">25</xref>]. GPT-4 builds on this foundation with enhanced nuanced text interpretation and more sophisticated training techniques, making it particularly effective for detailed analysis of complex topics such as vaccine sentiment in social media conversations [<xref ref-type="bibr" rid="ref26">26</xref>]. Claude 3 Sonnet was selected for its advanced processing capabilities, improved steerability, and the ability to handle up to 200,000 tokens, making it particularly suitable for comprehensive sentiment analysis [<xref ref-type="bibr" rid="ref27">27</xref>]. 
Meta&#x2019;s Llama 2 was chosen for its open-source nature, flexibility, and robust performance in detecting nuanced sentiments and hesitancies, providing a strategic advantage for academic research.</p></sec><sec id="s2-4"><title>Model Evaluation Through Learning Paradigms</title><p>We evaluated the LLMs using 3 learning paradigms&#x2014;zero-shot, 1-shot, and few-shot learning&#x2014;to test their adaptability and efficiency with varying amounts of prior information, simulating real-world scenarios where labeled data may be scarce.</p><sec id="s2-4-1"><title>Zero-Shot Learning</title><p>In the zero-shot learning paradigm, the model analyzes tasks without any prior examples or specific training related to the task [<xref ref-type="bibr" rid="ref28">28</xref>]. It relies entirely on its pretrained knowledge to infer the correct output. For our study, models classified vaccine sentiment or hesitancy from social media posts using only a carefully crafted prompt, without any sample posts or classifications. This setup tests the model&#x2019;s ability to generalize from unrelated training data to new, unseen tasks.</p></sec><sec id="s2-4-2"><title>One-Shot Learning</title><p>In 1-shot learning, the model is provided with a single example post for each category before making predictions. This minimal context helps the model understand the task with just 1 reference point per sentiment class or hesitancy construct.</p></sec><sec id="s2-4-3"><title>Few-Shot Learning</title><p>Few-shot learning involves presenting the model with a set of multiple examples (<italic>k</italic> examples) to enhance its understanding and classification accuracy [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. This methodology includes variations such as 5-shot and 10-shot settings. For sentiment analysis, <italic>k</italic> examples were provided for each category (positive, neutral, and negative). 
For hesitancy analysis, <italic>k</italic> examples were provided for each of the WHO&#x2019;s 3Cs model dimensions (confidence, complacency, and convenience). The few-shot learning approach allowed us to explore how models perform with an increasing number of examples, testing their precision in detecting and classifying sentiments and hesitancies with minimal and slightly more information.</p><p>This structured approach confirms the flexibility and effectiveness of the models in adapting to different levels of available information. It provides a comprehensive perspective on their use in analyzing a broad spectrum of public opinions. By evaluating these models in scenarios that mirror real-world data availability, we aim to demonstrate the potential of LLMs to swiftly and accurately analyze complex vaccine-related social media discourse, potentially supporting enhanced public health communication strategies.</p></sec></sec><sec id="s2-5"><title>Prompt Schema Design</title><p>The prompts used in this study were carefully crafted to align with the definitions and instructions provided to human annotators for vaccine sentiment and hesitancy terms. This approach, known as prompt engineering [<xref ref-type="bibr" rid="ref31">31</xref>], ensured that the models&#x2019; interpretation matched human understanding, thereby enhancing prediction relevance and accuracy.</p><p>Prompts included explicit definitions and criteria for categorizing sentiments (eg, &#x201C;positive,&#x201D; &#x201C;neutral,&#x201D; and &#x201C;negative&#x201D;) and hesitancy constructs based on the WHO&#x2019;s 3Cs model (confidence, complacency, and convenience) [<xref ref-type="bibr" rid="ref15">15</xref>]. For example, in analyzing HPV vaccination&#x2013;related posts, the prompt specified that quoted content should take precedence in cases of mixed sentiment and provided guidance for handling ambiguous language (<xref ref-type="fig" rid="figure1">Figure 1</xref>). 
Prompts also included requests for the models to provide explanations for their classifications, which enhanced interpretability and supported iterative refinement during the development phase [<xref ref-type="bibr" rid="ref32">32</xref>].</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Prompt example for sentiment analysis. The text inside the dashed box is demonstrations of the few-shot setting and would be removed under the zero-shot setting. HPV: human papillomavirus; JSON: JavaScript Object Notation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e64723_fig01.png"/></fig><p>Preliminary testing compared multiple prompt versions to evaluate clarity, task alignment, and interpretability. For instance, variations in linguistic framing, the level of detail provided, and the structure of schema-specific instructions were tested using a subset of the dataset. Prompts that introduced ambiguity or failed to consistently elicit accurate classifications were refined or excluded. The final schema was selected based on its ability to achieve robust and consistent performance across platforms, vaccine types, and sentiment categories.</p><p>The final schema used in this study has been disclosed in the supplementary material for transparency and reproducibility (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Future research could explore the impact of alternative prompt designs, including variations in linguistic framing, level of detail, and schema-specific instructions, on model performance. 
Such studies would be particularly valuable for tasks requiring the interpretation of complex or ambiguous language.</p><p>By incorporating these detailed and context-specific instructions into our prompts, we ensured that the LLMs were well equipped to interpret and analyze the sentiment of vaccine-related social media posts accurately [<xref ref-type="bibr" rid="ref33">33</xref>]. This approach underscores the critical role of prompt engineering in leveraging the full capabilities of LLMs for nuanced tasks such as sentiment analysis, where the subtleties of language and expression necessitate a high degree of precision and clarity in the instructions provided to the models.</p></sec><sec id="s2-6"><title>Hyperparameter Settings</title><p>We predefined 2 key hyperparameters to optimize the LLMs&#x2019; performance: temperature and maximum tokens. The temperature parameter, which controls output randomness, was set to zero to ensure deterministic responses. This setting is crucial in the context of health-related sentiment analysis, where consistency and reproducibility are paramount. The maximum token (words and subwords) limit was set to 512, allowing models to provide detailed reasoning for their classifications, which is valuable for understanding the nuances of vaccine-related sentiments.</p><p>To ensure reliability, each social media post was processed through each LLM 3 times. We achieved a 97% consistency rate in categorizations across iterations. For the 3% of inconsistent cases, we used a majority vote approach to determine the final category. 
This rigorous process enhances the credibility of our results in the context of public health communication research.</p></sec><sec id="s2-7"><title>Evaluation Metrics</title><p>To assess the effectiveness of the LLMs within the framework of our study, we relied on a suite of established NLP evaluation metrics: accuracy, precision, recall, and the <italic>F</italic><sub>1</sub>-score [<xref ref-type="bibr" rid="ref34">34</xref>]. These metrics collectively offer a comprehensive snapshot of the models&#x2019; performance across various learning paradigms, ensuring a well-rounded assessment of their capabilities. These metrics were calculated for each sentiment and hesitancy category. Moreover, to provide a granular understanding of the models&#x2019; performance, these evaluations were conducted separately for each vaccine type under discussion and across the various social media platforms included in our study.</p></sec><sec id="s2-8"><title>Statistical Analysis for Model Comparison</title><p>To rigorously compare LLM performance across zero-shot, 1-shot, and few-shot learning paradigms, we conducted ANOVA tests [<xref ref-type="bibr" rid="ref35">35</xref>]. We focused on <italic>F</italic><sub>1</sub>-scores and accuracy as key performance indicators. This analysis helps determine the most effective learning approach for sentiment and hesitancy analysis in vaccine-related social media discussions, a critical aspect of modern public health monitoring.</p><p>We set the significance level at <italic>P</italic>&#x003C;.05 for identifying statistically significant differences. All statistical analyses were performed using R (version 4.2.2; R Core Team), providing a rigorous and replicable framework for our evaluations. 
This comprehensive evaluation framework allows us to assess the scalability and applicability of these AI models in real-world public health scenarios, potentially informing strategies for efficient, large-scale analysis of vaccine sentiments and hesitancy in diverse digital environments.</p></sec><sec id="s2-9"><title>Ethical Considerations</title><p>All data used in this study were publicly available, and ethical guidelines for social media research were followed. Data privacy policies of the platforms (X, Reddit, and YouTube) were adhered to, and all posts were deidentified using unique random IDs. To further safeguard privacy, no actual posts are directly quoted or reproduced in this manuscript. Instead, illustrative examples provided in the text are entirely theoretical and were crafted to reflect the general themes and patterns observed in the data. These examples ensure that no identifiable user information is disclosed, while still providing clarity on the study&#x2019;s methodology and findings.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Sentiment Analysis Overview</title><p>This section provides an overview of the sentiment analysis results, comparing the performance of the LLMs based on accuracy, <italic>F</italic><sub>1</sub>-score, precision, and recall. 
We focused on the models&#x2019; abilities to classify sentiments (positive, neutral, and negative) in a zero-shot learning setting (<xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Large language models' performance (measured in accuracy, recall, precision, and <italic>F</italic><sub>1</sub>-score) on vaccine sentiment analysis.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Sentiment</td><td align="left" valign="bottom">Accuracy (%)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (%)</td><td align="left" valign="bottom">Precision (%)</td><td align="left" valign="bottom">Recall (%)</td></tr></thead><tbody><tr><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Negative</td><td align="left" valign="top">0.801</td><td align="left" valign="top">0.753</td><td align="left" valign="top">0.672</td><td align="left" valign="top">0.834</td></tr><tr><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Neutral</td><td align="left" valign="top">0.821</td><td align="left" valign="top">0.633</td><td align="left" valign="top">0.612</td><td align="left" valign="top">0.659</td></tr><tr><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Positive</td><td align="left" valign="top">0.819</td><td align="left" valign="top">0.781</td><td align="left" valign="top">0.711</td><td align="left" valign="top">0.861</td></tr><tr><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Negative</td><td align="left" valign="top">0.940</td><td align="left" valign="top">0.932</td><td align="left" valign="top">0.910</td><td align="left" valign="top">0.950</td></tr><tr><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Neutral</td><td align="left" valign="top">0.932</td><td align="left" 
valign="top">0.901</td><td align="left" valign="top">0.891</td><td align="left" valign="top">0.910</td></tr><tr><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Positive</td><td align="left" valign="top">0.921</td><td align="left" valign="top">0.930</td><td align="left" valign="top">0.920</td><td align="left" valign="top">0.940</td></tr><tr><td align="left" valign="top">Claude-3s</td><td align="left" valign="top">Negative</td><td align="left" valign="top">0.882</td><td align="left" valign="top">0.801</td><td align="left" valign="top">0.841</td><td align="left" valign="top">0.760</td></tr><tr><td align="left" valign="top">Claude-3s</td><td align="left" valign="top">Neutral</td><td align="left" valign="top">0.821</td><td align="left" valign="top">0.690</td><td align="left" valign="top">0.701</td><td align="left" valign="top">0.681</td></tr><tr><td align="left" valign="top">Claude-3s</td><td align="left" valign="top">Positive</td><td align="left" valign="top">0.870</td><td align="left" valign="top">0.820</td><td align="left" valign="top">0.780</td><td align="left" valign="top">0.862</td></tr><tr><td align="left" valign="top">Llama 2</td><td align="left" valign="top">Negative</td><td align="left" valign="top">0.749</td><td align="left" valign="top">0.641</td><td align="left" valign="top">0.691</td><td align="left" valign="top">0.590</td></tr><tr><td align="left" valign="top">Llama 2</td><td align="left" valign="top">Neutral</td><td align="left" valign="top">0.719</td><td align="left" valign="top">0.140</td><td align="left" valign="top">0.110</td><td align="left" valign="top">0.221</td></tr><tr><td align="left" valign="top">Llama 2</td><td align="left" valign="top">Positive</td><td align="left" valign="top">0.721</td><td align="left" valign="top">0.421</td><td align="left" valign="top">0.411</td><td align="left" valign="top">0.418</td></tr></tbody></table></table-wrap><p>GPT-4 exhibited the highest performance across all sentiment types, with 
accuracy ranging from 92% to 94%, <italic>F</italic><sub>1</sub>-scores from 90% to 93%, precision from 89% to 92%, and recall from 91% to 95%. Claude-3 followed, with accuracy between 82% and 88%, <italic>F</italic><sub>1</sub>-scores from 69% to 82%, precision from 70% to 84%, and recall from 68% to 86%. GPT-3.5 showed comparable performance with Claude-3, with accuracy ranging from 80% to 82%, <italic>F</italic><sub>1</sub>-scores from 63% to 78%, precision from 61% to 71%, and recall from 66% to 86%. Llama 2 exhibited the lowest performance, with accuracy between 72% and 75%, <italic>F</italic><sub>1</sub>-scores from 14% to 64%, precision from 11% to 69%, and recall from 22% to 59%.</p></sec><sec id="s3-2"><title>Insights From Shot-Learning Paradigms</title><p>We evaluated the performance of the 4 LLMs across zero-shot, 1-shot, 5-shot, and 10-shot learning paradigms. The results represent the averaged performance across vaccine types (HPV, MMR, and general) and platforms (X, Reddit, and YouTube), ensuring a comprehensive evaluation. The models&#x2019; accuracy and <italic>F</italic><sub>1</sub>-scores remained relatively consistent across different paradigms for each sentiment type (<xref ref-type="fig" rid="figure2">Figures 2</xref> and <xref ref-type="fig" rid="figure3">3</xref>). 
This consistency suggests that the models can effectively learn and generalize from few examples, maintaining their relative performance.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p><italic>F</italic><sub>1</sub>-score performance of large language models in sentiment analysis across learning paradigms.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e64723_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Accuracy performance of large language models in sentiment analysis across learning paradigms.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e64723_fig03.png"/></fig><p>For GPT-4, accuracy scores for positive sentiment started at 92% in the zero-shot setting and increased marginally to 93.0% in the 10-shot setting. Neutral sentiment accuracy remained stable, ranging between 92.0% and 93%, while for negative sentiment, accuracy improved slightly from 92.0% in the 5-shot setting to 93.0% in the 10-shot setting. <italic>F</italic><sub>1</sub>-scores for GPT-4 reflected similar stability, with the highest values observed for negative sentiment, reaching 93% in both zero-shot and 10-shot settings.</p><p>GPT-3.5 demonstrated stable performance across paradigms. Accuracy for positive sentiment rose slightly from 82% in the zero-shot setting to 83% in the 10-shot setting, while neutral sentiment accuracy fluctuated around 80%. Negative sentiment accuracy, however, decreased from 80% in the zero-shot setting to 78% in the 10-shot setting. <italic>F</italic><sub>1</sub>-scores followed a similar pattern, with positive sentiment improving to 78% in the 5-shot setting before slightly declining to 77% in the 10-shot setting.</p><p>Claude-3s showed consistent performance across paradigms. 
Accuracy for neutral sentiment increased from 82% in the zero-shot setting to 83% in the 10-shot setting, while for positive sentiment, accuracy decreased slightly from 87% in the zero-shot setting to 85% in the 10-shot setting. <italic>F</italic><sub>1</sub>-scores for negative sentiment remained stable across all paradigms, with values around 80%.</p><p>Llama 2 exhibited modest improvements with additional examples. Accuracy for positive sentiment improved slightly from 71% in the zero-shot setting to 72% in the 1-shot setting, stabilizing at 71% in the 5- and 10-shot settings. <italic>F</italic><sub>1</sub>-scores for neutral sentiment remained low, reaching a maximum of 15% in the 10-shot setting. However, for negative sentiment, <italic>F</italic><sub>1</sub>-scores improved from 64% in the zero-shot setting to 67% in the 5- and 10-shot settings.</p><p>Overall, the results highlight GPT-4&#x2019;s superior performance in sentiment analysis across all paradigms, with consistent improvements observed as the number of examples increased. While GPT-3.5 and Claude-3s also exhibited stable and moderate performance, Llama2 showed the greatest need for further optimization despite modest gains in certain settings.</p></sec><sec id="s3-3"><title>Performance Across Vaccine Types and Social Media Platforms in Zero-Shot Settings</title><p>GPT-4 demonstrated the highest <italic>F</italic><sub>1</sub>-scores, ranging from 87% to 91%, showcasing its strong capability to generalize and accurately classify sentiment across all vaccine categories. Claude-3 followed closely with <italic>F</italic><sub>1</sub>-scores between 78% and 82%, indicating its robust performance in the zero-shot setting. GPT-3.5 showed moderate performance with <italic>F</italic><sub>1</sub>-scores ranging from 65% to 85%. 
Llama 2 had the lowest scores, between 22% and 42%, highlighting the variability in performance among the models (<xref ref-type="fig" rid="figure4">Figure 4</xref>).</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Zero-shot <italic>F</italic><sub>1</sub>-scores across models and vaccine types. HPV: human papillomavirus; MMR: measles, mumps, and rubella.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e64723_fig04.png"/></fig><p>Furthermore, we present the performance of the 4 language models in sentiment analysis across 3 social media platforms: Reddit, X, and YouTube (<xref ref-type="table" rid="table3">Table 3</xref>). While the overall performance trend remains consistent across platforms, there are minor variations worth noting. GPT-3.5 exhibited a slightly higher <italic>F</italic><sub>1</sub>-score on YouTube (71.9%) than Reddit (71.4%) and X (70.6%). Claude-3 showed a higher <italic>F</italic><sub>1</sub>-score on YouTube (78.9%) than X (78.6%) and Reddit (77.5%). Llama 2 had a higher <italic>F</italic><sub>1</sub>-score on YouTube (41.5%) than X (39.9%) and Reddit (38.8%). GPT-4 maintained consistent performance across platforms with <italic>F</italic><sub>1</sub>-scores ranging from 91.7% to 91.8%.</p></sec><sec id="s3-4"><title>Hesitancy Analysis Overview</title><p>GPT-4 led in both sentiment and hesitancy analyses, although its accuracy scores in hesitancy analysis (83%-96%) were more variable than in sentiment analysis (92%-94%; <xref ref-type="table" rid="table2">Table 2</xref>). This suggests that accurately identifying and classifying hesitancy types might be marginally more challenging than sentiment analysis, even for the best-performing model (<xref ref-type="table" rid="table4">Table 4</xref>). Claude-3 showed consistent performance across both tasks, with accuracy scores in hesitancy analysis (80%-87%) comparable with sentiment analysis (82%-88%). 
GPT-3.5 and Llama 2 exhibited slight improvements in accuracy for some hesitancy types compared with their sentiment analysis performance. For instance, GPT-3.5 achieved an accuracy of 84.9% for convenience hesitancy, higher than its sentiment analysis accuracy scores (80%-82%), while Llama 2&#x2019;s accuracy for confidence hesitancy (70%) was close to its performance in negative sentiment analysis (75%).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Large language models' performance (measured in accuracy and <italic>F</italic><sub>1</sub>-score) across social media platforms on vaccine sentiment analysis.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2">Metric</td><td align="left" valign="bottom" colspan="4">Reddit</td><td align="left" valign="bottom" colspan="4">X, formerly known as Twitter</td><td align="left" valign="bottom" colspan="4">YouTube</td></tr><tr><td align="left" valign="bottom">GPT-3.5</td><td align="left" valign="bottom">GPT-4</td><td align="left" valign="bottom">Claude3s</td><td align="left" valign="bottom">Llama2</td><td align="left" valign="bottom">GPT-3.5</td><td align="left" valign="bottom">GPT-4</td><td align="left" valign="bottom">Claude3s</td><td align="left" valign="bottom">Llama2</td><td align="left" valign="bottom">GPT-3.5</td><td align="left" valign="bottom">GPT-4</td><td align="left" valign="bottom">Claude3s</td><td align="left" valign="bottom">Llama2</td></tr></thead><tbody><tr><td align="left" valign="top">Accuracy</td><td align="left" valign="top">0.809</td><td align="left" valign="top">0.929</td><td align="left" valign="top">0.852</td><td align="left" valign="top">0.731</td><td align="left" valign="top">0.809</td><td align="left" valign="top">0.929</td><td align="left" valign="top">0.851</td><td align="left" valign="top">0.729</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.928</td><td align="left" 
valign="top">0.854</td><td align="left" valign="top">0.722</td></tr><tr><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">0.714</td><td align="left" valign="top">0.917</td><td align="left" valign="top">0.775</td><td align="left" valign="top">0.388</td><td align="left" valign="top">0.706</td><td align="left" valign="top">0.918</td><td align="left" valign="top">0.786</td><td align="left" valign="top">0.399</td><td align="left" valign="top">0.719</td><td align="left" valign="top">0.918</td><td align="left" valign="top">0.789</td><td align="left" valign="top">0.415</td></tr></tbody></table></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Large Language models' performance (measured in accuracy, recall, precision, and <italic>F</italic><sub>1</sub>-score) on vaccine hesitancy analysis.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Metric and hesitancy type</td><td align="left" valign="bottom">GPT-3.5</td><td align="left" valign="bottom">GPT-4</td><td align="left" valign="bottom">Claude 3s</td><td align="left" valign="bottom">Llama 2</td></tr></thead><tbody><tr><td align="left" valign="top">Accuracy</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Confidence</td><td align="left" valign="top">0.789</td><td align="left" valign="top">0.832</td><td align="left" valign="top">0.801</td><td align="left" valign="top">0.700</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Convenience</td><td align="left" valign="top">0.849</td><td align="left" valign="top">0.961</td><td align="left" valign="top">0.871</td><td align="left" valign="top">0.729</td></tr><tr><td 
align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Complacency</td><td align="left" valign="top">0.823</td><td align="left" valign="top">0.864</td><td align="left" valign="top">0.844</td><td align="left" valign="top">0.699</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hesitancy</td><td align="left" valign="top">0.762</td><td align="left" valign="top">0.878</td><td align="left" valign="top">0.844</td><td align="left" valign="top">0.668</td></tr><tr><td align="left" valign="top">Precision</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Confidence</td><td align="left" valign="top">0.855</td><td align="left" valign="top">0.957</td><td align="left" valign="top">0.796</td><td align="left" valign="top">0.614</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Convenience</td><td align="left" valign="top">0.764</td><td align="left" valign="top">0.849</td><td align="left" valign="top">0.819</td><td align="left" valign="top">0.532</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Complacency</td><td align="left" valign="top">0.621</td><td align="left" valign="top">0.696</td><td align="left" valign="top">0.679</td><td align="left" valign="top">0.286</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hesitancy</td><td align="left" valign="top">0.755</td><td align="left" valign="top">0.851</td><td align="left" valign="top">0.811</td><td align="left" valign="top">0.548</td></tr><tr><td align="left" 
valign="top">Recall</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Confidence</td><td align="left" valign="top">0.679</td><td align="left" valign="top">0.759</td><td align="left" valign="top">0.849</td><td align="left" valign="top">0.575</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Convenience</td><td align="left" valign="top">0.799</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.790</td><td align="left" valign="top">0.492</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Complacency</td><td align="left" valign="top">0.784</td><td align="left" valign="top">0.867</td><td align="left" valign="top">0.897</td><td align="left" valign="top">0.135</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hesitancy</td><td align="left" valign="top">0.677</td><td align="left" valign="top">0.856</td><td align="left" valign="top">0.849</td><td align="left" valign="top">0.479</td></tr><tr><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Confidence</td><td align="left" valign="top">0.757</td><td align="left" valign="top">0.847</td><td align="left" valign="top">0.757</td><td align="left" valign="top">0.594</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Convenience</td><td 
align="left" valign="top">0.781</td><td align="left" valign="top">0.824</td><td align="left" valign="top">0.804</td><td align="left" valign="top">0.511</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Complacency</td><td align="left" valign="top">0.693</td><td align="left" valign="top">0.772</td><td align="left" valign="top">0.773</td><td align="left" valign="top">0.183</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hesitancy</td><td align="left" valign="top">0.714</td><td align="left" valign="top">0.853</td><td align="left" valign="top">0.823</td><td align="left" valign="top">0.511</td></tr></tbody></table></table-wrap></sec><sec id="s3-5"><title>Impact of Learning Paradigms on Hesitancy Analysis</title><p>We explored the impact of zero-shot, 1-shot, 5-shot, and 10-shot learning paradigms on the models&#x2019; hesitancy analysis capabilities, focusing on accuracy and <italic>F</italic><sub>1</sub>-scores. The results indicate that GPT-4 continued to excel, with accuracy scores ranging from 83.2% for confidence in the zero-shot setting to 96% for convenience in the zero-shot setting (<xref ref-type="fig" rid="figure5">Figure 5</xref>). Notably, GPT-4&#x2019;s performance improves as the number of shots increases, particularly for overall hesitancy, reaching an accuracy of 92% in the 10-shot setting. This showcases GPT-4&#x2019;s exceptional ability to learn from additional examples and accurately detect overall vaccine hesitancy.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Model accuracy performance by hesitancy type and number of shots.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e64723_fig05.png"/></fig><p>The <italic>F</italic><sub>1</sub>-scores tell a similar story. 
GPT-4 maintained its lead, with <italic>F</italic><sub>1</sub>-scores ranging from 77.2% for complacency in the zero-shot setting to 89.5% for overall hesitancy in the 10-shot setting (<xref ref-type="fig" rid="figure6">Figure 6</xref>). The increase in <italic>F</italic><sub>1</sub>-scores as the number of shots grows highlights GPT-4&#x2019;s ability to not only accurately classify hesitancy but also comprehensively identify relevant instances.</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Model <italic>F</italic><sub>1</sub>-score performance by hesitancy type and number of shots.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e64723_fig06.png"/></fig><p>Claude-3 showed substantial improvement in accuracy for convenience hesitancy as the number of shots increased, reaching 91.2% in the 10-shot setting. This underscores Claude-3&#x2019;s adaptability and potential to capture specific aspects of hesitancy when provided with more examples. GPT-3.5 exhibited an improvement in accuracy for convenience hesitancy in the 10-shot setting (87.9%). Similarly, its <italic>F</italic><sub>1</sub> scores for complacency hesitancy increased from 69.3% in the zero-shot setting to 73.9% in the 10-shot setting, indicating GPT-3.5&#x2019;s capacity for growth when given more examples. Llama 2, despite lower overall performance, showed consistent improvement in both accuracy and <italic>F</italic><sub>1</sub>-scores across all hesitancy types as the number of shots increased, suggesting its potential to learn and adapt.</p></sec><sec id="s3-6"><title>Statistical Analysis for Model Comparative Analysis</title><p>We conducted a statistical comparison of GPT-4&#x2019;s <italic>F</italic><sub>1</sub>-scores across different learning paradigms for both sentiment and hesitancy analysis. 
In sentiment analysis, an ANOVA test indicated no significant differences (F-statistic=0.021; <italic>P</italic>=.981) in GPT-4&#x2019;s <italic>F</italic><sub>1</sub>-score performance across learning paradigms. Similarly, in hesitancy analysis, the ANOVA test showed no significant differences (F-statistic=0.835; <italic>P</italic>=.476) in GPT-4&#x2019;s <italic>F</italic><sub>1</sub>-score performance across learning paradigms. These results suggest that GPT-4&#x2019;s <italic>F</italic><sub>1</sub>-score performance remains consistent regardless of the number of training examples provided.</p></sec><sec id="s3-7"><title>Token Usage Analysis</title><p>We analyzed the number of tokens required to run the LLMs for sentiment analysis and hesitancy detection across different vaccine categories, social media platforms, and learning paradigms. Understanding token usage patterns is crucial for estimating computational costs associated with using these models. For GPT-4, the cost is US $0.03 per 1000 tokens, while for GPT-3.5, it is US $0.0005 per 1000 tokens [<xref ref-type="bibr" rid="ref36">36</xref>]. For Claude-3 Sonnet, the cost is approximately US $0.003 per 1000 tokens [<xref ref-type="bibr" rid="ref37">37</xref>]. Unlike these models, Llama 2 does not have a direct cost per token; instead, its cost is related to the infrastructure required to host and run the model, such as the cost of GPU servers. The cost varies based on the instance type and configuration used for deployment [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>].</p><p>Analyzing the average number of tokens required per post for each combination of vaccine type, social media platform, and learning paradigm (<xref ref-type="fig" rid="figure7">Figure 7</xref>), the results demonstrate that the number of tokens required increases with the number of training examples provided to the models. 
Posts related to HPV generally require more tokens than those related to MMR and general vaccines, particularly on Reddit. This suggests that discussions surrounding HPV may be more complex or lengthy, leading to higher token usage.</p><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>Token usage across learning paradigms and platforms for different vaccine discussions. HPV: human papillomavirus; MMR: measles, mumps, and rubella.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e64723_fig07.png"/></fig><p>Regarding social media platforms, posts from Reddit consistently require more tokens than those from YouTube and X, across all vaccine types and learning paradigms. This indicates that the longer and more detailed nature of discussions on Reddit contributes to higher token usage compared with the other platforms.</p></sec><sec id="s3-8"><title>Case-Based Evaluation of Model Performance</title><p>This section provides a case-based evaluation of misclassified posts, highlighting examples of both correct and incorrect predictions made by 4 LLMs: GPT-3.5, Llama2, Claude-3s, and GPT-4. The evaluation focuses on complex sentiments, sarcasm, and implicit support for vaccines, which often challenge LLMs in distinguishing nuanced opinions. Each post is analyzed based on its true label, the models&#x2019; predictions, and their explanations, with an assessment of where and why misclassifications occurred. 
To safeguard privacy, all examples presented in this section are theoretical and were crafted to reflect the types of posts analyzed in the study, rather than quoting or reproducing actual user-generated content (<xref ref-type="other" rid="box1">Textboxes 1</xref><xref ref-type="other" rid="box2"/><xref ref-type="other" rid="box3"/><xref ref-type="other" rid="box4"/><xref ref-type="other" rid="box5"/>-<xref ref-type="other" rid="box6">6</xref>).</p><boxed-text id="box1"><title> Post analysis: explicit sentiment with mixed vaccine views.</title><p>Post:</p><p>&#x201C;You are false. The disease has a high recovery rate for healthy individuals with no underlying conditions. If I want to avoid measles, I trust the measles vaccine&#x2014;it works. But the COVID-19 vaccine? It&#x2019;s ineffective. Stop spreading unnecessary fear!&#x201D;</p><list list-type="bullet"><list-item><p>True Label: Positive (regarding MMR vaccine)</p></list-item><list-item><p>Predictions:</p><list list-type="bullet"> <list-item><p>GPT-3.5: Negative</p></list-item> <list-item><p>Llama2: Negative</p></list-item> <list-item><p>Claude-3s: Positive</p></list-item> <list-item><p>GPT-4: Positive</p></list-item> </list><p>Explanations:</p><p>GPT-3.5 and Llama2 focused on the aggressive tone and negative statements about the COVID-19 vaccine, overlooking the implicit endorsement of the measles vaccine. 
Claude-3s and GPT-4 identified the positive sentiment toward the measles vaccine despite the broader negative tone.</p><p>Error assessment:</p></list-item><list-item><p>Contextual interpretation: Models struggled with distinguishing sentiment toward different vaccines within the same post.</p></list-item><list-item><p>Sarcasm and irony: Negative phrasing (&#x201C;Stop spreading unnecessary fear!&#x201D;) misled some models into assigning an overall negative sentiment.</p></list-item></list></boxed-text><boxed-text id="box2"><title> Post analysis: complex sentiment with implicit support.</title><p>Post:</p><p>&#x201C;Someone compared getting vaccinated to making a noble sacrifice&#x2014;seriously? Saying vaccines are pointless because only older people are at risk is both wrong and ridiculous. But whatever, it&#x2019;s your choice. I&#x2019;m done caring.&#x201D;</p><list list-type="bullet"><list-item><p>True label: Positive</p></list-item><list-item><p>Predictions:</p><list list-type="bullet"><list-item><p>GPT-3.5: Negative</p></list-item><list-item><p>Llama2: Negative</p></list-item><list-item><p>Claude-3s: Positive</p></list-item><list-item><p>GPT-4: Positive</p></list-item></list></list-item></list><p>Explanations:</p><p>GPT-3.5 and Llama2 misclassified the sentiment as negative due to the critical tone and the phrase &#x201C;I&#x2019;m done caring,&#x201D; which implies indifference. 
Claude-3s and GPT-4 correctly identified the critique of antivaccine reasoning as implicit support for vaccination.</p><p>Error assessment:</p><list list-type="bullet"><list-item><p>Complex sentiment and implicit support: The post critiques antivaccine arguments, implicitly supporting vaccination, but uses sarcasm and frustration, which confused some models.</p></list-item><list-item><p>Challenge in parsing ambiguity: Phrases such as &#x201C;It&#x2019;s your choice&#x201D; were interpreted differently across models, with some seeing them as neutral or dismissive.</p></list-item></list></boxed-text><boxed-text id="box3"><title> Post analysis: critique with ambiguous tone.</title><p>Post:</p><p>&#x201C;Oh, so being smart now means trusting pharmaceutical companies completely? Here&#x2019;s the reality: those shouting about personal freedoms when asked to wear a mask seem more likely to try bizarre &#x2018;natural remedies&#x2019; than to actually take a scientifically-proven vaccine.&#x201D;</p><list list-type="bullet"><list-item><p>True label: Positive</p></list-item><list-item><p>Predictions:</p><list list-type="bullet"> <list-item><p>GPT-3.5: Negative</p></list-item> <list-item><p>Llama2: Neutral</p></list-item> <list-item><p>Claude-3s: Negative</p></list-item> <list-item><p>GPT-4: Neutral</p></list-item> </list> </list-item></list><p>Explanations:</p><p>All models struggled to classify this post correctly due to the sarcastic tone and critical phrasing. 
Some interpreted the critique of antivaccine rhetoric as a negative sentiment rather than implicit support for vaccination.</p><p>Error assessment:</p><list list-type="bullet"><list-item><p>Complex sentiment and implicit support: The sarcastic tone and indirect language posed challenges, leading to difficulty recognizing the underlying provaccine stance.</p></list-item><list-item><p>Misinterpretation of irony: The critique of intelligence and behavior in the context of antivaccine arguments led models to overemphasize the negative sentiment, missing the implicit support for vaccination.</p></list-item></list></boxed-text><boxed-text id="box4"><title> Post analysis: concern over measles outbreak.</title><p>Post:</p><p>&#x201C;Measles isn&#x2019;t just back in the US; we&#x2019;re seeing the same issues here in Europe. It&#x2019;s getting out of hand, and the number of people refusing to vaccinate their kids is shocking. %VIDEO%: The return of measles&#x2014;what&#x2019;s causing it?&#x201D;</p><list list-type="bullet"><list-item><p>True label: Positive</p></list-item><list-item><p>Predictions:</p><list list-type="bullet"> <list-item><p>GPT-3.5: Negative</p></list-item> <list-item><p>Llama2: Neutral</p></list-item> <list-item><p>Claude-3s: Negative</p></list-item> <list-item><p>GPT-4: Positive</p></list-item> </list> </list-item></list><p>Explanations:</p><p>GPT-3.5 and Claude-3s misclassified the example due to phrases such as &#x201C;out of hand&#x201D; and &#x201C;refusing to vaccinate,&#x201D; interpreting them as indicators of negative sentiment. 
GPT-4 correctly identified the post as expressing concern over measles outbreaks, which implicitly supports vaccination efforts to address the issue.</p><p>Error assessment:</p><list list-type="bullet"><list-item><p>Complexity in contextual signals: The use of emotionally charged phrases such as &#x201C;out of hand&#x201D; misled some models into perceiving the sentiment as negative, despite the post&#x2019;s implicit advocacy for vaccination to control outbreaks.</p></list-item></list></boxed-text><boxed-text id="box5"><title> Post analysis: sarcastic critique of herd immunity arguments.</title><p>Post:</p><p>&#x201C;Be sure to tell everyone you&#x2019;re skipping vaccines for diseases like polio, rubella, or whooping cough. Herd immunity will handle it, right? A few thousand dead kids is just the cost of freedom. Moo!&#x201D;</p><list list-type="bullet"><list-item><p>True label: Positive</p></list-item><list-item><p>Predictions:</p><list list-type="bullet"><list-item><p>GPT-3.5: Negative</p></list-item><list-item><p>Llama2: Negative</p></list-item><list-item><p>Claude-3s: Negative</p></list-item><list-item><p>GPT-4: Positive</p></list-item></list></list-item></list><p>Explanations:</p><p>GPT-3.5, Llama2, and Claude-3s struggled with interpreting the sarcastic tone, focusing on phrases such as &#x201C;dead kids&#x201D; as indicators of negative sentiment. GPT-4 correctly understood the sarcasm as a critique of antivaccine reasoning, identifying the post&#x2019;s implicit support for vaccination.</p><p>Error assessment:</p><list list-type="bullet"><list-item><p>Failure to recognize sarcasm: Most models struggled with the heavily sarcastic tone, focusing on surface-level negative phrases rather than the implicit provaccine stance.</p></list-item></list></boxed-text><boxed-text id="box6"><title> Post analysis for constructs.</title><p>Post:</p><p>&#x201C;I thought about getting the HPV vaccine, but then I found out it doesn&#x2019;t cover all strains. 
What&#x2019;s the point if I could still catch something? Plus, the nearest clinic is too far, and I can&#x2019;t miss work just for a shot. Maybe I&#x2019;ll wait and see.&#x201D;</p><list list-type="bullet"><list-item><p>True label: Hesitant (Confidence + Convenience)</p></list-item><list-item><p>Predictions:</p><list list-type="bullet"><list-item><p>GPT-4: Hesitant (confidence)</p></list-item><list-item><p>Claude-3s: Hesitant (complacency)</p></list-item><list-item><p>GPT-3.5: Nonhesitant</p></list-item><list-item><p>Llama2: Nonhesitant</p></list-item></list></list-item></list><p>Explanations:</p><p>GPT-4 successfully identified the confidence issues (concerns about the vaccine&#x2019;s incomplete coverage) but overlooked the logistical barrier related to clinic distance and work constraints. Claude-3s misclassified the post as complacency, interpreting &#x201C;wait and see&#x201D; as indicating low-perceived disease risk, rather than logistical or confidence-related barriers. GPT-3.5 and Llama2 both classified the post as nonhesitant, failing to recognize the implicit skepticism and practical challenges expressed in the post.</p><p>Error assessment:</p><list list-type="bullet"><list-item><p>Difficulty in addressing multiconstruct hesitancy: The intertwined nature of confidence (concerns about vaccine efficacy) and convenience (logistical challenges) constructs posed challenges for all models.</p></list-item><list-item><p>Oversimplification of constructs: Models such as Claude-3s and GPT-4 overemphasized individual constructs, leading to incomplete classification.</p></list-item><list-item><p>Missed implicit barriers: GPT-3.5 and Llama2 failed to interpret the implicit logistical challenges as indicative of hesitancy, instead categorizing the post as nonhesitant.</p></list-item></list></boxed-text></sec><sec id="s3-9"><title>Arguments</title><p>This case-based evaluation highlights the challenges faced by LLMs in accurately classifying nuanced and complex 
sentiments. Models often misclassify posts due to sarcasm, indirect language, or the presence of mixed sentiments targeting different aspects of vaccination. While GPT-4 demonstrated the highest accuracy in identifying implicit support, other models frequently struggled with contextual interpretation. Future work could focus on improving models&#x2019; ability to handle sarcasm, irony, and subtle expressions of support or critique.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study evaluated the effectiveness of LLMs in sentiment analysis and hesitancy detection regarding vaccine-related posts on social media, focusing on models including GPT-3.5, GPT-4, Claude-3 Sonnet, and Llama 2. Our results indicate that GPT-4 outperforms other models across various metrics, consistent with recent advancements in LLM capabilities for processing complex linguistic patterns [<xref ref-type="bibr" rid="ref40">40</xref>].</p></sec><sec id="s4-2"><title>Sentiment and Hesitancy Analysis Insights</title><p>GPT-4 consistently outperformed other models across all sentiment types and learning paradigms, attributable to its larger parameter size and extensive pretraining on diverse datasets, enabling it to capture subtle nuances and contextual information more effectively [<xref ref-type="bibr" rid="ref41">41</xref>]. However, our analysis also revealed that the performance of smaller LLMs, such as Claude 3s and GPT-3.5, was not far behind GPT-4, particularly in the few-shot learning paradigm. Although fine-tuning was not conducted in this study, previous research has shown that fine-tuned smaller LLMs can achieve performance levels similar to state-of-the-art models in some domain-specific applications, such as financial sentiment analysis [<xref ref-type="bibr" rid="ref41">41</xref>]. Exploring fine-tuning as a future direction could further enhance these models&#x2019; task-specific performance. 
From a cost perspective, smaller models can be a more viable and budget-friendly alternative. For instance, GPT-3.5 costs approximately US $0.0005 per 1000 tokens, while GPT-4 costs around US $0.03 per 1000 tokens [<xref ref-type="bibr" rid="ref36">36</xref>]. This substantial difference in cost highlights the economic advantage of using smaller LLMs for large-scale applications where budget constraints are a factor. Moreover, the slightly reduced performance of smaller LLMs in comparison with larger models might be offset by their significantly lower operational costs, making them a practical choice for many real-world applications.</p><p>Our study found that GPT-4 handles discussions about HPV vaccines better than those about general vaccines or MMR, likely due to the more consistent language used in HPV discussions. In contrast, conversations about general vaccines or MMR cover a broader range of topics, making it harder for models to perform consistently well. In addition, the platform impacts model performance, with Reddit allowing for longer, more complex discussions, and YouTube comments often using informal language and slang.</p><p>The performance of LLMs in classifying neutral sentiments revealed notable challenges, highlighting the inherent difficulties due to their less expressive nature and greater reliance on contextual understanding. Enhanced model training that better captures the subtleties of neutral language and incorporates contextual cues is necessary for improving neutral sentiment classification.</p><p>In hesitancy analysis, GPT-4 and Claude-3 Sonnet led the pack. Interestingly, the models generally achieved higher accuracy in hesitancy analysis than in sentiment analysis, likely due to the more specific nature of hesitancy categories. However, lower <italic>F</italic><sub>1</sub>-scores for the complacency hesitancy type indicate challenges in capturing nuanced and context-dependent expressions of hesitancy. 
Complacency, often expressed through passive language that neither outright rejects nor endorses vaccination, requires a nuanced detection approach that current models struggle to provide. This issue highlights the need for improvements in model training, suggesting that integrating more complex, contextually rich datasets could enhance model sensitivity to the passive expressions typical of complacency.</p></sec><sec id="s4-3"><title>Consistency Across Learning Paradigms</title><p>Our analysis revealed that increasing the number of shots did not always lead to improved performance, aligning with findings that few-shot learning&#x2019;s effectiveness depends on the quality and relevance of the examples provided [<xref ref-type="bibr" rid="ref41">41</xref>]. GPT-4 showed consistent performance across zero-shot, 1-shot, and few-shot learning paradigms, with no significant differences in performance enhancements with more training examples. This consistency makes zero-shot learning a cost-effective strategy due to its balance of accuracy and reduced computational resources [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>].</p><p>Our token usage analysis indicated that the number of tokens required increased with the number of training examples, with the zero-shot paradigm requiring the least tokens and the 10-shot paradigm requiring the most. Token usage also varied across vaccine types and social media platforms, suggesting that the complexity and length of discussions impact computational costs.</p><p>While LLMs such as GPT-4 offer superior performance metrics, these benefits come with higher computational expenses and longer processing times. For instance, GPT-4 achieved an accuracy of 88% and an <italic>F</italic><sub>1</sub>-score of 0.85, outperforming traditional machine learning methods [<xref ref-type="bibr" rid="ref23">23</xref>]. 
However, the associated usage costs and processing times pose challenges for large-scale, real-time deployment.</p></sec><sec id="s4-4"><title>Hybrid Approach for Improved Efficiency</title><p>To address these challenges, we propose a hybrid approach leveraging both LLMs and traditional machine learning methods, combined with manual annotation. LLMs such as GPT-4 can handle a significant portion of the annotation workload, processing large volumes of data efficiently and providing consistent, high-quality annotations based on predefined criteria, reducing the burden of manual annotation and minimizing variability associated with human annotators.</p><p>A subset of the data should undergo manual annotation by human experts to ensure that nuanced and complex cases are accurately labeled, providing a benchmark to validate and refine LLM-generated annotations. Once annotated, traditional machine learning models can be trained using this hybrid-annotated dataset. These models, being more computationally efficient, can be deployed for real-time monitoring and analysis of social media data.</p><p>By integrating automated and manual annotation methods, we optimize both performance and cost, ensuring accurate and timely insights into vaccine sentiment and hesitancy. This approach enhances the feasibility of using advanced NLP techniques in public health and sets a precedent for future research in leveraging hybrid models for complex analytical tasks.</p></sec><sec id="s4-5"><title>Insights From Case-Based Evaluation</title><p>The case-based evaluation underscores the complexities faced by LLMs when classifying nuanced sentiments and vaccine hesitancy constructs in social media posts. Posts containing sarcasm, indirect phrasing, and implicit sentiments were particularly challenging for models. 
For instance, sarcastic critiques of antivaccine rhetoric were frequently misclassified as negative sentiment, highlighting the limitations of LLMs in interpreting figurative language and irony. This aligns with previous findings that sarcasm and indirect expressions pose significant challenges in text classification tasks [<xref ref-type="bibr" rid="ref44">44</xref>].</p><p>GPT-4 consistently exhibited better contextual understanding, particularly in recognizing implicit support for vaccination. However, even GPT-4 struggled with multiconstruct hesitancy posts, such as those intertwining confidence issues and logistical barriers. Other models, such as GPT-3.5, Llama2, and Claude-3s, often misclassified posts by overrelying on surface-level cues, failing to capture the deeper sentiment or hesitancy constructs. These observations reflect broader limitations in LLMs&#x2019; ability to handle multidimensional health-related discourse.</p><p>The evaluation highlights the need for improved training and fine-tuning strategies that incorporate domain-specific linguistic features and multilabel constructs. Future research should focus on enhancing models&#x2019; ability to parse complex sentiment, handle mixed tone, and interpret multiconstruct hesitancy more effectively.</p></sec><sec id="s4-6"><title>Limitations</title><sec id="s4-6-1"><title>Dataset Size and Representation</title><p>A key limitation of this study is the size of the evaluation dataset, comprising 7515 annotated social media posts. While efforts were made to ensure balance across sentiment and hesitancy categories, certain subcategories, such as complacency and convenience within hesitancy, may still contain relatively small counts. This could limit the robustness of conclusions drawn for these specific subcategories. 
Furthermore, the reuse of the same training posts across few-shot learning experiments was a deliberate choice to maintain consistency and enable a controlled comparison of model performance. However, this may limit the diversity of training examples, which could impact the generalizability of the findings. Future studies could address these limitations by expanding the dataset, particularly within less-represented categories, and exploring the effects of more diverse training samples on LLM performance.</p></sec><sec id="s4-6-2"><title>Contextual Influences and Pandemic Polarization</title><p>The data used in this study span up until October 2021, encompassing the period of the COVID-19 pandemic. The COVID-19 pandemic created a highly polarized environment around vaccine discussions on social media, amplifying both supportive and skeptical voices. This polarization likely influenced the data composition, with heightened expressions of hesitancy driven by safety concerns, misinformation, and politicized discourse. These patterns may have shaped the ability of LLMs to classify sentiments and hesitancy constructs, particularly when handling mixed tones or ambiguous expressions. For instance, the increased prevalence of emotionally charged or politicized language during the pandemic might have led to overemphasis on negative cues in posts that were otherwise supportive of vaccination. Addressing these nuances requires models capable of disentangling context-dependent sentiments, a challenge underscored in this study&#x2019;s findings. 
Future research could investigate how temporal factors, such as the pandemic&#x2019;s progression, impact sentiment and hesitancy patterns.</p></sec><sec id="s4-6-3"><title>Limitations of API-Based Models</title><p>While our study offers valuable insights into the performance of LLMs in analyzing vaccine-related sentiments and hesitancy on social media, it is important to acknowledge several limitations that future research should address. First, our analysis was limited to the models available through Azure and Amazon Bedrock&#x2019;s API services at the time of our research. These included GPT-3.5, GPT-4, Claude-3 Sonnet, and Llama 2. While these models were accessible and suitable, given our computational resources, they were not necessarily the most advanced versions available. The rapid development in the field of LLMs means that more recent and potentially more effective models could offer enhanced performance on similar tasks. Also, the reliance on API versions available through Azure and Amazon Web Services means that the models and tools we used may not have been the most optimized for our specific research purposes. API versions are frequently updated to include improvements and new features that could significantly enhance model performance. Consequently, subsequent research should strive to use the most recent API versions to ensure that findings are based on the best possible technological tools and to maintain comparability with other contemporary research.</p></sec><sec id="s4-6-4"><title>Limited Vaccine Types and Platforms</title><p>Our analysis focused on a limited set of vaccine types and social media platforms. While these choices were based on data availability and relevance to our research questions, they may not capture the full spectrum of vaccine-related discussions on social media. 
Future research could expand the scope to include a wider range of vaccine types and platforms to assess the generalizability of our findings and identify potential differences in sentiment and hesitancy patterns across various contexts.</p></sec></sec><sec id="s4-7"><title>Practical Integration and Ethical Considerations</title><p>This study focused on the technical aspects of using LLMs for vaccine sentiment analysis and hesitancy detection, but it did not delve into the practical implications and challenges of integrating these models into real-world public health initiatives. Future research should explore the ethical, social, and organizational factors that may influence the adoption and effectiveness of LLMs in addressing vaccine hesitancy. This could include investigating issues such as data privacy, algorithmic fairness, and the potential unintended consequences of using LLMs in public health communication strategies.</p><p>While our study highlights the potential of LLMs in identifying trends in vaccine hesitancy, it is important to recognize that these models are just one piece of the puzzle. Combating vaccine hesitancy requires a multifaceted approach that combines data-driven insights with effective communication strategies, community engagement, and trust-building efforts. Future research should explore how LLMs can be integrated into comprehensive public health interventions that address the complex and multidimensional nature of vaccine hesitancy.</p></sec><sec id="s4-8"><title>Future Directions</title><sec id="s4-8-1"><title>Advanced Prompting Techniques</title><p>Future work could explore innovative prompting methods such as chain of thought reasoning and clue and reasoning prompting. These approaches could enhance the reasoning capabilities of LLMs, particularly in tasks requiring interpretation of sarcasm, indirect language, or multiconstruct hesitancy. 
Additionally, retrieval-augmented generation could be used to retrieve contextually relevant examples for improved performance during inference.</p></sec><sec id="s4-8-2"><title>Domain-Specific Fine-Tuning</title><p>While fine-tuning was not conducted in this study, its potential to improve performance in domain-specific tasks warrants exploration. For instance, fine-tuning smaller models such as GPT-3.5 on datasets with domain-specific linguistic features or domain-specific datasets enriched with figurative language, sarcasm, and implicit expressions could enhance their ability to capture vaccine-related nuances while maintaining cost efficiency.</p></sec><sec id="s4-8-3"><title>Dataset Expansion and Diversification</title><p>Increasing the size and diversity of datasets, particularly within underrepresented categories such as complacency and convenience, could provide more robust training and evaluation data. In addition, temporal analyses comparing pre- and postpandemic datasets could help identify shifts in public sentiment and hesitancy patterns.</p></sec><sec id="s4-8-4"><title>Cross-Platform and Vaccine Type Analyses</title><p>Expanding the study to include additional social media platforms and vaccine types could improve generalizability and provide deeper insights into context-specific discourse patterns. Platforms such as Instagram or TikTok, which cater to different user demographics, may reveal distinct sentiment trends.</p></sec><sec id="s4-8-5"><title>Ethical and Practical Considerations</title><p>Future research should explore the ethical implications of deploying LLMs in public health communication. 
This includes addressing algorithmic biases, ensuring data privacy, and minimizing unintended consequences, such as the spread of misinformation.</p></sec><sec id="s4-8-6"><title>Development of Novel Metrics</title><p>Traditional metrics, such as <italic>F</italic><sub>1</sub>-scores, are insufficient for capturing performance in nuanced tasks such as sarcasm detection and implicit sentiment classification. Future studies should develop novel metrics, such as a &#x201C;sarcasm sensitivity index&#x201D; or an &#x201C;implicit sentiment alignment score,&#x201D; to better evaluate LLMs in these complex areas.</p></sec><sec id="s4-8-7"><title>Explainability and Interpretability Research</title><p>Investigating explainability methods to understand LLMs&#x2019; decision-making processes is a promising direction. For example, using attention heatmaps to visualize how models focus on specific words or phrases could reveal their interpretation mechanisms. This could help refine models to better handle mixed or ambiguous language constructs, such as multiconstruct hesitancy posts.</p></sec></sec><sec id="s4-9"><title>Conclusions</title><p>Our study provides a foundation for understanding the performance and limitations of LLMs in vaccine sentiment analysis and hesitancy detection tasks. However, there are several limitations and opportunities for future research that should be addressed to fully harness the potential of these models in promoting vaccine acceptance and uptake. As the field of LLMs continues to evolve rapidly, researchers and public health professionals should remain vigilant in evaluating the latest developments and adapting their approaches to ensure the most effective and responsible use of these powerful tools in addressing the critical challenge of vaccine hesitancy.</p></sec></sec></body><back><ack><p>This work was funded by Merck Sharp &#x0026; Dohme LLC, a subsidiary of Merck &#x0026; Co, Inc. 
The content is the sole responsibility of the authors and does not necessarily represent the official views of Merck &#x0026; Co, Inc, or IMO Health Inc.</p></ack><fn-group><fn fn-type="con"><p>JD, ALE, and AA contributed to study concept and design. AA, ALE, DW, and MRM performed the experiments. AA, ALE, DW, and VKN drafted the manuscript. AA, ALE, and DW contributed to acquisition, analysis, or interpretation of data. JD, ALE, and XW contributed to study supervision. All authors contributed to critical revision of the manuscript for important intellectual content.</p></fn><fn fn-type="conflict"><p>ALE and DW are current employees and VKN is a contractor of Merck Sharp &#x0026; Dohme LLC, a subsidiary of Merck &#x0026; Co, Inc, who may own stock or stock options in Merck &#x0026; Co, Inc. All the other authors were employed by IMO Health, Inc, and, therefore, IMO Health, Inc, was compensated for activities related to execution of the study.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">GPT</term><def><p>generative pretrained transformer</p></def></def-item><def-item><term id="abb4">HPV</term><def><p>human papillomavirus</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">MMR</term><def><p>measles, mumps, and rubella</p></def></def-item><def-item><term id="abb7">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb8">3Cs</term><def><p>confidence, complacency, and convenience</p></def></def-item><def-item><term id="abb9">WHO</term><def><p>World Health Organization</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>GPT-4 
technical report</article-title><source>OpenAI</source><year>2023</year><month>03</month><day>15</day><access-date>2025-09-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cdn.openai.com/papers/gpt-4.pdf">https://cdn.openai.com/papers/gpt-4.pdf</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><article-title>OpenAI API documentation (GPT-3.5 series)</article-title><source>OpenAI</source><year>2022</year><month>03</month><day>15</day><access-date>2025-09-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://platform.openai.com/docs/api-reference/introduction">https://platform.openai.com/docs/api-reference/introduction</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="web"><article-title>Introducing the next generation of Claude</article-title><source>Anthropic</source><year>2024</year><access-date>2025-09-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://docs.claude.com/en/home">https://docs.claude.com/en/home</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="web"><article-title>Llama 2</article-title><source>Meta AI</source><access-date>2025-09-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.llama.com/llama2/">https://www.llama.com/llama2/</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>P ran</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J yao</given-names> </name><name name-style="western"><surname>Huo</surname><given-names>T tong</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S 
xiang</given-names> </name><name name-style="western"><surname>Ye</surname><given-names>Z wei</given-names> </name></person-group><article-title>Application of artificial intelligence in medicine: An overview</article-title><source>Curr Med Sci</source><year>2021</year><month>12</month><volume>41</volume><issue>6</issue><fpage>1105</fpage><lpage>1115</lpage><pub-id pub-id-type="doi">10.1007/s11596-021-2474-3</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guerra</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Hofmann</surname><given-names>H</given-names> </name><name name-style="western"><surname>Sobhani</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 artificial intelligence model outperforms ChatGPT, medical students, and neurosurgery residents on neurosurgery written board-like questions</article-title><source>World Neurosurg</source><year>2023</year><month>11</month><volume>179</volume><fpage>e160</fpage><lpage>e165</lpage><pub-id 
pub-id-type="doi">10.1016/j.wneu.2023.08.042</pub-id><pub-id pub-id-type="medline">37597659</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Marshall</surname><given-names>C</given-names> </name><name name-style="western"><surname>Gauthier</surname><given-names>C</given-names> </name><name name-style="western"><surname>Gonzalez</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Jackson</surname><given-names>JB</given-names> </name></person-group><article-title>Evaluating ChatGPT performance on the orthopaedic in-training examination</article-title><source>JB JS Open Access</source><year>2023</year><volume>8</volume><issue>3</issue><pub-id pub-id-type="doi">10.2106/JBJS.OA.23.00056</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oh</surname><given-names>N</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>WY</given-names> </name></person-group><article-title>ChatGPT goes to the operating room: evaluating GPT-4 performance and its potential in surgical education and training in the era of large language models</article-title><source>Ann Surg Treat Res</source><year>2023</year><month>05</month><volume>104</volume><issue>5</issue><fpage>269</fpage><lpage>273</lpage><pub-id pub-id-type="doi">10.4174/astr.2023.104.5.269</pub-id><pub-id pub-id-type="medline">37179699</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brin</surname><given-names>D</given-names> 
</name><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Vaid</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Comparing ChatGPT and GPT-4 performance in USMLE soft skill assessments</article-title><source>Sci Rep</source><year>2023</year><month>10</month><day>1</day><volume>13</volume><issue>1</issue><fpage>16492</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-43436-9</pub-id><pub-id pub-id-type="medline">37779171</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tracy</surname><given-names>BM</given-names> </name><name name-style="western"><surname>Hazen</surname><given-names>BJ</given-names> </name><name name-style="western"><surname>Ward</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Winer</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Pettitt</surname><given-names>BJ</given-names> </name></person-group><article-title>Sustained clinical performance during surgical rotations predicts NBME shelf exam outcomes</article-title><source>J Surg Educ</source><year>2020</year><volume>77</volume><issue>6</issue><fpage>e116</fpage><lpage>e120</lpage><pub-id pub-id-type="doi">10.1016/j.jsurg.2020.06.033</pub-id><pub-id pub-id-type="medline">32651118</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kossoff</surname><given-names>EH</given-names> </name><name name-style="western"><surname>Hubbard</surname><given-names>TW</given-names> </name><name name-style="western"><surname>Gowen</surname><given-names>CW</given-names>  <suffix>Jr</suffix></name></person-group><article-title>Early clinical experience enhances third-year pediatrics clerkship 
performance</article-title><source>Acad Med</source><year>1999</year><month>11</month><volume>74</volume><issue>11</issue><fpage>1238</fpage><lpage>1241</lpage><pub-id pub-id-type="doi">10.1097/00001888-199911000-00019</pub-id><pub-id pub-id-type="medline">10587688</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Garcia-Vidal</surname><given-names>C</given-names> </name><name name-style="western"><surname>Sanjuan</surname><given-names>G</given-names> </name><name name-style="western"><surname>Puerta-Alcalde</surname><given-names>P</given-names> </name><name name-style="western"><surname>Moreno-Garc&#x00ED;a</surname><given-names>E</given-names> </name><name name-style="western"><surname>Soriano</surname><given-names>A</given-names> </name></person-group><article-title>Artificial intelligence to support clinical decision-making processes</article-title><source>EBioMedicine</source><year>2019</year><month>08</month><volume>46</volume><fpage>27</fpage><lpage>29</lpage><pub-id pub-id-type="doi">10.1016/j.ebiom.2019.07.019</pub-id><pub-id pub-id-type="medline">31303500</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ellahham</surname><given-names>S</given-names> </name></person-group><article-title>Artificial intelligence: the future for diabetes care</article-title><source>Am J Med</source><year>2020</year><month>08</month><volume>133</volume><issue>8</issue><fpage>895</fpage><lpage>900</lpage><pub-id pub-id-type="doi">10.1016/j.amjmed.2020.03.033</pub-id><pub-id pub-id-type="medline">32325045</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> 
</name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>Vaccine hesitancy: a growing challenge for immunization programmes</article-title><source>World Health Organization</source><year>2015</year><access-date>2025-09-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.who.int/news/item/18-08-2015-vaccine-hesitancy-a-growing-challenge-for-immunization-programmes">https://www.who.int/news/item/18-08-2015-vaccine-hesitancy-a-growing-challenge-for-immunization-programmes</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Naveed</surname><given-names>H</given-names> </name><name name-style="western"><surname>Khan</surname><given-names>AU</given-names> </name><name name-style="western"><surname>Qiu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>L</given-names> </name><name name-style="western"><surname>Haq</surname><given-names>AU</given-names> </name><name name-style="western"><surname>Ullah</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A comprehensive overview of large language models</article-title><access-date>2024-02-11</access-date><comment>Preprint posted online on 2023</comment><comment><ext-link ext-link-type="uri" 
xlink:href="https://arxiv.org/abs/2307.06435">https://arxiv.org/abs/2307.06435</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Child</surname><given-names>R</given-names> </name><name name-style="western"><surname>Luan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Amodei</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name></person-group><article-title>Language models are unsupervised multitask learners</article-title><source>OpenAI blog</source><year>2019</year><access-date>2025-09-29</access-date><volume>1</volume><issue>8</issue><fpage>9</fpage><comment><ext-link ext-link-type="uri" xlink:href="https://storage.prod.researchhub.com/uploads/papers/2020/06/01/language-models.pdf">https://storage.prod.researchhub.com/uploads/papers/2020/06/01/language-models.pdf</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Narasimhan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Salimans</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name></person-group><article-title>Improving language understanding by generative pre-training</article-title><year>2018</year><access-date>2024-01-11</access-date><publisher-name>OpenAI</publisher-name><comment><ext-link ext-link-type="uri" 
xlink:href="https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf">https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Raffel</surname><given-names>C</given-names> </name><name name-style="western"><surname>Shazeer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Roberts</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Narang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Matena</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Exploring the limits of transfer learning with a unified text-to-text transformer</article-title><source>J Mach Learn Res</source><year>2020</year><access-date>2025-09-29</access-date><volume>21</volume><issue>140</issue><fpage>1</fpage><lpage>67</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.jmlr.org/papers/v21/20-074.html">https://www.jmlr.org/papers/v21/20-074.html</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><article-title>Claude: a next-generation AI assistant</article-title><source>Anthropic Blog</source><year>2023</year><access-date>2024-07-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.anthropic.com/claude">https://www.anthropic.com/claude</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Lavril</surname><given-names>T</given-names> </name><name name-style="western"><surname>Izacard</surname><given-names>G</given-names> </name><name name-style="western"><surname>Martinet</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lachaux</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Lacroix</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Llama: open and efficient foundation language models</article-title><source>arXiv</source><access-date>2024-05-20</access-date><comment>Preprint posted online on 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2302.13971">https://arxiv.org/abs/2302.13971</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Eiden</surname><given-names>AL</given-names> </name><name name-style="western"><surname>He</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Natural language processing-powered real-time monitoring solution for vaccine sentiments and hesitancy on social media: system development and validation</article-title><source>JMIR Med Inform</source><year>2024</year><month>06</month><day>21</day><volume>12</volume><issue>4</issue><fpage>e57164</fpage><pub-id pub-id-type="doi">10.2196/57164</pub-id><pub-id pub-id-type="medline">38904984</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><article-title>Report of the SAGE working group on vaccine hesitancy</article-title><source>World Health Organization</source><year>2014</year><access-date>2025-09-29</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.asset-scienceinsociety.eu/sites/default/files/sage_working_group_revised_report_vaccine_hesitancy.pdf">https://www.asset-scienceinsociety.eu/sites/default/files/sage_working_group_revised_report_vaccine_hesitancy.pdf</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Alarcon</surname><given-names>N</given-names> </name></person-group><article-title>OpenAI presents GPT-3, a 175 billion parameters language model</article-title><source>NVIDIA Technical Blog</source><year>2020</year><month>04</month><day>7</day><access-date>2024-04-10</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://developer.nvidia.com/blog/openai-presents-gpt-3-a-175-billion-parameters-language-model/">https://developer.nvidia.com/blog/openai-presents-gpt-3-a-175-billion-parameters-language-model/</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Bastian</surname><given-names>M</given-names> </name></person-group><article-title>GPT-3.5 might be a strong example of the efficiency potential of large AI models</article-title><source>The Decoder</source><year>2023</year><month>03</month><day>31</day><access-date>2024-03-10</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://the-decoder.com/gpt-3-5-might-be-a-strong-example-of-the-efficiency-potential-of-large-ai-models/">https://the-decoder.com/gpt-3-5-might-be-a-strong-example-of-the-efficiency-potential-of-large-ai-models/</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><article-title>Anthropic&#x2019;s Claude 3 Sonnet Model now on Amazon Bedrock</article-title><source>Amazon Web 
Services</source><year>2024</year><access-date>2024-05-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://aws.amazon.com/about-aws/whats-new/2024/03/anthropics-claude-3-sonnet-model-amazon-bedrock/">https://aws.amazon.com/about-aws/whats-new/2024/03/anthropics-claude-3-sonnet-model-amazon-bedrock/</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Bergmann</surname><given-names>D</given-names> </name></person-group><article-title>What is zero-shot learning?</article-title><source>IBM</source><year>2024</year><access-date>2025-09-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.ibm.com/think/topics/zero-shot-learning">https://www.ibm.com/think/topics/zero-shot-learning</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><article-title>What is zero-shot classification?</article-title><source>Hugging Face</source><year>2022</year><month>11</month><day>16</day><access-date>2024-02-05</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/tasks/zero-shot-classification">https://huggingface.co/tasks/zero-shot-classification</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>T</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><name name-style="western"><surname>Subbiah</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kaplan</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Dhariwal</surname><given-names>P</given-names> 
</name><etal/></person-group><article-title>Language models are few-shot learners</article-title><source>Adv Neural Inf Process Syst</source><year>2020</year><access-date>2025-09-29</access-date><volume>33</volume><fpage>1877</fpage><lpage>1901</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2020/hash/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html">https://proceedings.neurips.cc/paper_files/paper/2020/hash/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>W</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Hayashi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Neubig</surname><given-names>G</given-names> </name></person-group><article-title>Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing</article-title><source>ACM Comput Surv</source><year>2023</year><month>09</month><day>30</day><volume>55</volume><issue>9</issue><fpage>1</fpage><lpage>35</lpage><pub-id pub-id-type="doi">10.1145/3560815</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Polak</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Morgan</surname><given-names>D</given-names> </name></person-group><article-title>Extracting accurate materials data from research papers with conversational language models and prompt 
engineering</article-title><source>Nat Commun</source><year>2024</year><month>02</month><day>21</day><volume>15</volume><issue>1</issue><fpage>1569</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-45914-8</pub-id><pub-id pub-id-type="medline">38383556</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Gao</surname><given-names>T</given-names> </name><name name-style="western"><surname>Fisch</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>D</given-names> </name></person-group><article-title>Making pre-trained language models better few-shot learners</article-title><year>2021</year><conf-name>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1)</conf-name><conf-loc>Online</conf-loc><fpage>3816</fpage><lpage>3830</lpage><pub-id pub-id-type="doi">10.18653/v1/2021.acl-long.295</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Walker</surname><given-names>SM</given-names></name></person-group><article-title>F-score: what are accuracy, precision, recall, and F1 score?</article-title><source>Klu</source><year>2023</year><month>07</month><day>4</day><access-date>2024-03-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://klu.ai/glossary/accuracy-precision-recall-f1">https://klu.ai/glossary/accuracy-precision-recall-f1</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Roberts</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Russo</surname><given-names>R</given-names> </name></person-group><source>A Student&#x2019;s Guide to Analysis of Variance</source><year>1999</year><edition>1</edition><publisher-name>Routledge</publisher-name></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="web"><article-title>Pricing</article-title><source>OpenAI</source><access-date>2024-05-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/api/pricing/">https://openai.com/api/pricing/</ext-link></comment></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="web"><article-title>Claude API</article-title><source>Anthropic</source><access-date>2025-09-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://docs.claude.com/en/home">https://docs.claude.com/en/home</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="web"><article-title>Introducing Llama 2 on Azure</article-title><source>Microsoft</source><access-date>2024-02-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://techcommunity.microsoft.com/blog/azure-ai-foundry-blog/introducing-llama-2-on-azure/3881233">https://techcommunity.microsoft.com/blog/azure-ai-foundry-blog/introducing-llama-2-on-azure/3881233</ext-link></comment></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="web"><article-title>Fine-tune and deploy Llama 2 models cost-effectively in Amazon SageMaker JumpStart with AWS Inferentia and AWS Trainium</article-title><source>Amazon Web Services</source><year>2024</year><access-date>2024-05-21</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://aws.amazon.com/blogs/machine-learning/fine-tune-and-deploy-llama-2-models-cost-effectively-in-amazon-sagemaker-jumpstart-with-aws-inferentia-and-aws-trainium/">https://aws.amazon.com/blogs/machine-learning/fine-tune-and-deploy-llama-2-models-cost-effectively-in-amazon-sagemaker-jumpstart-with-aws-inferentia-and-aws-trainium/</ext-link></comment></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Baktash</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Dawodi</surname><given-names>M</given-names> </name></person-group><article-title>Gpt-4: a review on advancements and opportunities in natural language processing</article-title><source>arXiv</source><access-date>2024-05-13</access-date><comment>Preprint posted online on 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2305.03195">https://arxiv.org/abs/2305.03195</ext-link></comment></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Fatemi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name></person-group><article-title>A comparative analysis of fine-tuned LLMs and few-shot learning of LLMs for financial sentiment analysis</article-title><source>arXiv</source><access-date>2024-05-14</access-date><comment>Preprint posted online on 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2312.08725">https://arxiv.org/abs/2312.08725</ext-link></comment></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>YH</given-names> </name><name 
name-style="western"><surname>Deng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>B</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Bing</surname><given-names>L</given-names> </name></person-group><article-title>Understanding LLMs: a comprehensive overview from training to inference</article-title><source>arXiv</source><comment>Preprint posted online on 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2401.02038">https://arxiv.org/abs/2401.02038</ext-link></comment></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>B</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Bing</surname><given-names>L</given-names> </name></person-group><article-title>Sentiment analysis in the era of large language models: a reality check</article-title><source>arXiv</source><comment>Preprint posted online on 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2305.15005</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ghosh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Veale</surname><given-names>T</given-names> </name></person-group><article-title>Fracking sarcasm using neural network</article-title><year>2016</year><conf-name>Proceedings of the 7th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media 
Analysis</conf-name><conf-date>Jun 16-17, 2016</conf-date><conf-loc>San Diego, CA</conf-loc><fpage>161</fpage><lpage>169</lpage><pub-id pub-id-type="doi">10.18653/v1/W16-0425</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Number of annotated posts for each social media platform, categorized by vaccine sentiment and World Health Organization&#x2019;s 3Cs (confidence, complacency, and convenience) model vaccine hesitancy group. HPV: human papillomavirus vaccines; MMR: measles, mumps, and rubella vaccines; General: general or unspecified vaccines.</p><media xlink:href="formative_v9i1e64723_app1.docx" xlink:title="DOCX File, 18 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Prompt schema.</p><media xlink:href="formative_v9i1e64723_app2.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material></app-group></back></article>