<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e72815</article-id><article-id pub-id-type="doi">10.2196/72815</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>AI in Qualitative Health Research Appraisal: Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Landerholm</surname><given-names>August</given-names></name><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Physiotherapy Department, Healthscience Faculty, M&#x00E4;lardalen University</institution><addr-line>Avdelningen f&#x00F6;r Fysioterapi Akademin f&#x00F6;r H&#x00E4;lsa, V&#x00E5;rd Och V&#x00E4;lf&#x00E4;rd M&#x00E4;lardalens Universitet</addr-line><addr-line>V&#x00E4;ster&#x00E5;s</addr-line><country>Sweden</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Mohanadas</surname><given-names>Sadhasivam</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>nomula</surname><given-names>varun kumar</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to August Landerholm, Physiotherapy Department, Healthscience Faculty, M&#x00E4;lardalen University, Avdelningen f&#x00F6;r Fysioterapi Akademin f&#x00F6;r H&#x00E4;lsa, V&#x00E5;rd Och V&#x00E4;lf&#x00E4;rd M&#x00E4;lardalens Universitet, V&#x00E4;ster&#x00E5;s, 721 21, Sweden, 46 702129863; <email>august.landerholm@mdu.se</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>8</day><month>7</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e72815</elocation-id><history><date date-type="received"><day>18</day><month>02</month><year>2025</year></date><date date-type="rev-recd"><day>08</day><month>05</month><year>2025</year></date><date date-type="accepted"><day>15</day><month>05</month><year>2025</year></date></history><copyright-statement>&#x00A9; August Landerholm. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 8.7.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e72815"/><abstract><sec><title>Background</title><p>Qualitative research appraisal is crucial for ensuring credible findings but faces challenges due to human variability. Artificial intelligence (AI) models have the potential to enhance the efficiency and consistency of qualitative research assessments.</p></sec><sec><title>Objective</title><p>This study aims to evaluate the performance of 5 AI models (GPT-3.5, Claude 3.5, Sonar Huge, GPT-4, and Claude 3 Opus) in assessing the quality of qualitative research using 3 standardized tools: Critical Appraisal Skills Programme (CASP), Joanna Briggs Institute (JBI) checklist, and Evaluative Tools for Qualitative Studies (ETQS).</p></sec><sec sec-type="methods"><title>Methods</title><p>AI-generated assessments of 3 peer-reviewed qualitative papers in health and physical activity&#x2013;related research were analyzed. The study examined systematic affirmation bias, interrater reliability, and tool-dependent disagreements across the AI models. Sensitivity analysis was conducted to evaluate the impact of excluding specific models on agreement levels.</p></sec><sec sec-type="results"><title>Results</title><p>Results revealed a systematic affirmation bias across all AI models, with &#x201C;Yes&#x201D; rates ranging from 75.9% (145/191; Claude 3 Opus) to 85.4% (164/192; Claude 3.5). GPT-4 diverged significantly, showing lower agreement (&#x201C;Yes&#x201D;: 115/192, 59.9%) and higher uncertainty (&#x201C;Cannot tell&#x201D;: 69/192, 35.9%). 
Proprietary models (GPT-3.5 and Claude 3.5) demonstrated near-perfect alignment (Cramer <italic>V</italic>=0.891; <italic>P</italic>&#x003C;.001), while open-source models showed greater variability. Interrater reliability varied by assessment tool, with CASP achieving the highest baseline consensus (Krippendorff &#x03B1;=0.653), followed by JBI (&#x03B1;=0.477), and ETQS scoring lowest (&#x03B1;=0.376). Sensitivity analysis revealed that excluding GPT-4 increased CASP agreement by 20% (&#x03B1;=0.784), while removing Sonar Huge improved JBI agreement by 18% (&#x03B1;=0.561). ETQS showed marginal improvements when excluding GPT-4 or Claude 3 Opus (+9%, &#x03B1;=0.409). Tool-dependent disagreements were evident, particularly in ETQS criteria, highlighting AI&#x2019;s current limitations in contextual interpretation.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The findings demonstrate that AI models exhibit both promise and limitations as evaluators of qualitative research quality. While they enhance efficiency, AI models struggle with reaching consensus in areas requiring nuanced interpretation, particularly for contextual criteria. The study underscores the importance of hybrid frameworks that integrate AI scalability with human oversight, especially for contextual judgment. Future research should prioritize developing AI training protocols that emphasize qualitative epistemology, benchmarking AI performance against expert panels to validate accuracy thresholds, and establishing ethical guidelines for disclosing AI&#x2019;s role in systematic reviews. 
As qualitative methodologies evolve alongside AI capabilities, the path forward lies in collaborative human-AI workflows that leverage AI&#x2019;s efficiency while preserving human expertise for interpretive tasks.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>qualitative research appraisal</kwd><kwd>systematic reviews</kwd><kwd>interrater agreement</kwd><kwd>CASP checklist</kwd><kwd>Critical Appraisal Skills Programme</kwd><kwd>JBI checklist</kwd><kwd>Joanna Briggs Institute</kwd><kwd>ETQS</kwd><kwd>Evaluative Tools for Qualitative Studies</kwd><kwd>large language models</kwd><kwd>affirmation bias</kwd><kwd>human-AI collaboration</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Importance of Quality Assessment in Qualitative Research</title><p>Systematic quality assessment is foundational for establishing the credibility, dependability, and transferability of qualitative research findings [<xref ref-type="bibr" rid="ref1">1</xref>]. Rigorous appraisal enables readers to evaluate the trustworthiness of study conclusions and their applicability to real-world contexts. Unlike quantitative methodologies, qualitative research prioritizes contextual richness and interpretive depth, necessitating frameworks that account for methodological diversity across paradigms (eg, phenomenology and ethnography).</p><p>Cross-study comparison is crucial for building a robust evidence base in any given field; yet, qualitative studies have seen little attention in systematic reviews. Facilitating and synthesizing research in qualitative methodologies is challenging because these approaches are based on diverse philosophical foundations, such as phenomenology, ethnography, and grounded theory, as well as a wide variety of analytical methods [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. 
The chosen analysis method of each research group may have varying degrees of subjective data interpretation, leading to diverse findings and conclusions [<xref ref-type="bibr" rid="ref4">4</xref>]. Further complicating matters is the contextual nature of each study, where the findings may be highly dependent on its unique context [<xref ref-type="bibr" rid="ref3">3</xref>]. The richness of data in qualitative datasets also encourages attempts at cohesive summaries, which can be challenging without losing important details or context [<xref ref-type="bibr" rid="ref1">1</xref>]. Qualitative studies typically use smaller sample sizes than their quantitative methodological counterparts and use purposively selected samples [<xref ref-type="bibr" rid="ref5">5</xref>]. Depending on the degree of rigor in the selection criteria, it becomes difficult to determine transferability or comparability [<xref ref-type="bibr" rid="ref6">6</xref>]. Facilitating comparison and research synthesis becomes easier through standardized quality assessment criteria.</p></sec><sec id="s1-2"><title>Systematic Assessment for Qualitative Research</title><p>Various assessment tools exist for systematically addressing qualitative study quality and have all been developed to address the unique challenges of evaluating qualitative research [<xref ref-type="bibr" rid="ref7">7</xref>]. If a research group wishes to systematically address a field that has been given qualitative attention, their choice of assessment tool will provide different weight to studies in the final synthesis, potentially affecting the review&#x2019;s conclusions. 
There are also potential limitations of an overly systematic approach to addressing qualitative study quality, as overly rigid quality criteria may not capture the diversity of qualitative research approaches more broadly [<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>There are methodological issues with systematic quality assessment that endanger the potential credibility of a systematic review. Quality assessment of qualitative research is often time-consuming and labor-intensive, limiting the number of studies that systematic reviews tend to include. Human reviewers may have varying interpretations of quality criteria, leading to inconsistencies [<xref ref-type="bibr" rid="ref8">8</xref>]. Different research groups might resolve this in different ways, with bias present [<xref ref-type="bibr" rid="ref9">9</xref>]. Humans&#x2019; ability to recognize patterns is ever-present in systematic reviews, as a reviewer can detect recurring themes or quality indicators across supposedly independent quality assessments. These aspects may be augmented using another, nonhuman, reviewer [<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>In the context of health research, the systematic appraisal of qualitative studies is particularly critical, as it directly shapes the evidence base used to inform health care practice, policy, and patient care [<xref ref-type="bibr" rid="ref11">11</xref>]. Qualitative research in health not only helps to understand patient experiences, barriers, and preferences but also guides the development of interventions and health technologies that are responsive to real-world needs. 
Therefore, the rigor and consistency of quality assessment tools determine whether findings are deemed trustworthy and ultimately influence clinical decision-making, health policy recommendations, and the quality of care delivered to diverse patient populations.</p></sec><sec id="s1-3"><title>The Role of Artificial Intelligence in Research Quality Assessment</title><p>Artificial intelligence (AI) has received attention in the qualitative research field, offering new possibilities for many steps of the research process [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. AI&#x2019;s role in systematic reviews has been theorized and tested in the quantitative space, but its potential in the qualitative field is given less attention [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. AI&#x2019;s ability to work efficiently and at scale holds the potential for systematic appraisal of study quality, and the system&#x2019;s consistency could reduce human bias and variability [<xref ref-type="bibr" rid="ref16">16</xref>]. Assessments done by AI could also enable the inclusion or exclusion of pattern recognition, as an AI can be instructed to complete a task independently or dependent on previous tasks. However, while AI can process complex data, it may struggle with the nuanced, context-dependent nature of qualitative research.</p><p>Large language models are among the most popular AI tools for researchers today [<xref ref-type="bibr" rid="ref17">17</xref>]. These advanced AI systems offer capabilities that could potentially revolutionize the process of qualitative research quality assessment, as they are pretrained to analyze vast amounts of textual data to enable nuanced analysis [<xref ref-type="bibr" rid="ref18">18</xref>]. Furthermore, these systems are multilingual, potentiating the inclusion of a paper written in a language unknown to the human researcher. 
Finally, these specific systems can be fine-tuned or prompted (post training) to follow specific assessment criteria, suitable for the systematic quality assessment of qualitative studies [<xref ref-type="bibr" rid="ref13">13</xref>]. The reliability and validity testing of the tools is problematic, as new versions or updates of the tools push their capabilities faster than the scientific community can assess their usefulness. The need to test these tools still exists, as the prevalence of their use is growing [<xref ref-type="bibr" rid="ref19">19</xref>].</p></sec><sec id="s1-4"><title>Aims of the Study</title><p>Given the complex landscape of qualitative research quality assessment and the emerging potential of augmenting AI in research processes, this study aims (1) to evaluate and compare the performance of different AI models in assessing the quality of qualitative research studies using various assessment tools; (2) to compare the ratings given by 5 AI models (GPT-3.5, Claude 3.5, Sonar Huge, GPT-4, and Claude 3 Opus) when assessing qualitative studies; (3) to evaluate the interrater agreement among these AI models using 3 different assessment tools: Critical Appraisal Skills Programme (CASP), Joanna Briggs Institute (JBI), and Evaluative Tools for Qualitative Studies (ETQS); (4) to analyze how the exclusion of individual AI models affects the overall interrater agreement for each assessment tool; and (5) to identify specific items or criteria within these assessment tools that lead to prominent disagreements among the AI raters.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>These models are chosen based on their diverse architectures and capabilities, which are crucial for a comprehensive analysis of AI augmentation in qualitative research. 
The selected models are included in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Overview of artificial intelligence (AI) models used for qualitative research appraisal.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Developer</td><td align="left" valign="bottom">Release date</td><td align="left" valign="bottom">Size (B parameters)</td></tr></thead><tbody><tr><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">OpenAI</td><td align="left" valign="top">2022</td><td align="left" valign="top">175</td></tr><tr><td align="left" valign="top">Claude 3.5</td><td align="left" valign="top">Anthropic</td><td align="left" valign="top">2024</td><td align="left" valign="top">Not disclosed</td></tr><tr><td align="left" valign="top">Sonar Huge</td><td align="left" valign="top">Perplexity AI, based on Llama 3.1</td><td align="left" valign="top">2024</td><td align="left" valign="top">405</td></tr><tr><td align="left" valign="top">GPT-4</td><td align="left" valign="top">OpenAI</td><td align="left" valign="top">2023</td><td align="left" valign="top">Not disclosed</td></tr><tr><td align="left" valign="top">Claude 3 Opus</td><td align="left" valign="top">Anthropic</td><td align="left" valign="top">2024</td><td align="left" valign="top">Not disclosed</td></tr></tbody></table></table-wrap><p>This diverse selection of models detailed in <xref ref-type="table" rid="table1">Table 1</xref> aims to identify which AI performance metrics are most beneficial for qualitative research quality assessment and to highlight areas where AI may complement or challenge each other. To evaluate the performance of these models, specific criteria will be used, including accuracy in coding, contextual understanding, and bias detection. 
The evaluation will use a standardized dataset of qualitative research papers to ensure a robust comparison across models.</p></sec><sec id="s2-2"><title>Quality Assessment Tools for Qualitative Research</title><sec id="s2-2-1"><title>Overview</title><p>The AI models will be instructed to use 3 widely recognized quality assessment tools for qualitative research. These tools have been comparatively analyzed in previous studies [<xref ref-type="bibr" rid="ref20">20</xref>]. The CASP checklist was chosen for its widespread use and accessibility in various research fields and has been previously used in qualitative assessments and syntheses [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. The JBI Critical Appraisal Checklist for Qualitative Research was chosen for its focus on the alignment between research objectives and methodological choices [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. Finally, the ETQS was selected for its comprehensive approach to evaluating qualitative research integrity, offering a more nuanced assessment of methodological rigor. Each tool offers perspectives on qualitative research quality [<xref ref-type="bibr" rid="ref20">20</xref>], which will facilitate a multifaceted assessment of the AI models&#x2019; ability to understand and evaluate different aspects of qualitative studies. The combination of these tools will provide a robust framework for comparing AI performance across various dimensions of qualitative research quality.</p></sec><sec id="s2-2-2"><title>Assessment Process</title><p>Three peer-reviewed qualitative research papers have been selected as the source material for this study. 
These papers represent diverse topics within health and physical activity research, providing a robust basis for evaluating the AI models&#x2019; performance across varied contexts (<xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Summary of qualitative health studies evaluated by artificial intelligence models.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Study title</td><td align="left" valign="bottom">Year</td><td align="left" valign="bottom">Focus area</td><td align="left" valign="bottom">Methodology</td></tr></thead><tbody><tr><td align="left" valign="bottom">Paper A: A qualitative study examining the validity and comprehensibility of physical activity items: developed and tested in children with juvenile idiopathic arthritis [<xref ref-type="bibr" rid="ref26">26</xref>]</td><td align="left" valign="bottom">2019</td><td align="left" valign="bottom">Physical activity in children with juvenile idiopathic arthritis</td><td align="left" valign="bottom">Qualitative interviews</td></tr><tr><td align="left" valign="top">Paper B: &#x201C;If only balls could talk...&#x201D;: barriers and opportunities to participation for students with blindness and visual impairment in specialized PE [<xref ref-type="bibr" rid="ref27">27</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Participation barriers for students with visual impairments in PE<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">Focus groups</td></tr><tr><td align="left" valign="top">Paper C: A qualitative study of exercise and physical activity in adolescents with pediatric-onset multiple sclerosis [<xref ref-type="bibr" rid="ref28">28</xref>]</td><td align="left" valign="top">2019</td><td align="left" valign="top">Exercise and physical activity in adolescents with MS<sup><xref ref-type="table-fn" 
rid="table2fn2">b</xref></sup></td><td align="left" valign="top">Semistructured interviews</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>PE: physical education.</p></fn><fn id="table2fn2"><p><sup>b</sup>MS: multiple sclerosis.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-2-3"><title>Application of AI Models to Each Assessment Tool</title><p>Each AI model will be tasked with applying all 3 quality assessment tools (CASP, ETQS, and JBI) to the selected studies. All papers are free text, and the AI models will be provided with the full text of each study as well as the assessment criteria for each tool. All AI-generated assessments will be collected and stored for analysis. AI-generated assessments will be formalized in a standardized format.</p></sec><sec id="s2-2-4"><title>Repeated Assessments for Consistency</title><p>The repeated assessments for consistency are as follows:</p><list list-type="order"><list-item><p>Input preparation: The full text of each study will be provided to the AI models along with the complete assessment criteria for each tool. To ensure consistency, the input format will be standardized across all models.</p></list-item><list-item><p>Assessment protocol: AI models will be instructed to conduct a comprehensive quality assessment of each study using all 3 tools independently. Clear instructions will be provided to ensure the models understand the task requirements.</p></list-item><list-item><p>Structured output: To facilitate comparative analysis, AI models will be required to provide their assessments in a standardized format for each tool. 
This may include numerical scores, categorical ratings, and textual explanations.</p></list-item><list-item><p>Reasoning transparency: The AI models will be prompted to explain their reasoning for each assessment criterion, providing insights into their decision-making process and allowing for evaluation of their understanding of qualitative research principles.</p></list-item><list-item><p>Consistency evaluation: Each AI model will perform the assessment task multiple times to evaluate the consistency of their outputs and identify any variability in their assessments.</p></list-item><list-item><p>Data collection and storage: All AI-generated assessments, including explanations and any variations in repeated assessments, will be systematically collected and stored in a secure database for subsequent analysis. This will ensure data integrity and facilitate comprehensive evaluation.</p></list-item><list-item><p>Bias mitigation: To minimize potential biases, the order of presenting studies and assessment tools to the AI models will be randomized for each evaluation session.</p></list-item></list></sec></sec><sec id="s2-3"><title>Ethical Considerations</title><p>All analyzed studies were previously published and had undergone their own ethical review processes. No new data were collected from individuals. The role of AI in the research process was disclosed, and all AI-assisted assessments were documented and stored securely. No personal or sensitive data were collected or processed. The research did not involve any intervention or interaction with human participants. The study posed no risk to individuals or groups, as it relied solely on secondary analysis of published material.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>All AI models showed high &#x201C;Yes&#x201D; rates (75.9%&#x2010;85.4%), with Claude 3.5 achieving the highest affirmation (164/192, 85.4%), as detailed in <xref ref-type="table" rid="table3">Table 3</xref>. 
GPT-4 diverged significantly, showing lower agreement (&#x201C;Yes&#x201D;: 115/192, 59.9%) and elevated uncertainty (&#x201C;Cannot tell&#x201D;: 69/192, 35.9%). GPT-3.5 and Claude 3.5 exhibited near-perfect alignment (Cramer <italic>V</italic>=0.891; <italic>P</italic>&#x003C;.001). Sonar Huge (&#x201C;Yes&#x201D;: 148/188, 78.7%) and Claude 3 Opus (145/191, 75.9%) demonstrated moderate consistency. GPT-4&#x2019;s exclusion boosted CASP agreement by 20% (&#x03B1;=0.784 vs 0.653 baseline), highlighting its role as a variability driver. Statistical associations weakened with open-source models (Cramer <italic>V</italic>=0.496&#x2010;0.545), suggesting that architectural differences influence assessment patterns.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Frequency of ratings by rater.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Yes, n/N (%)</td><td align="left" valign="bottom">Cannot tell, n/N (%)</td><td align="left" valign="bottom">No, n/N (%)</td></tr></thead><tbody><tr><td align="left" valign="bottom">GPT-3.5</td><td align="left" valign="bottom">158/192 (82.3)</td><td align="left" valign="bottom">34/192 (17.7)</td><td align="left" valign="bottom">0/192 (0)</td></tr><tr><td align="left" valign="top">Claude 3.5</td><td align="left" valign="top">164/192 (85.4)</td><td align="left" valign="top">28/192 (14.6)</td><td align="left" valign="top">0/192 (0)</td></tr><tr><td align="left" valign="top">Sonar Huge</td><td align="left" valign="top">148/188 (78.7)</td><td align="left" valign="top">33/188 (17.6)</td><td align="left" valign="top">7/188 (3.7)</td></tr><tr><td align="left" valign="top">GPT-4</td><td align="left" valign="top">115/192 (59.9)</td><td align="left" valign="top">69/192 (35.9)</td><td align="left" valign="top">8/192 (4.2)</td></tr><tr><td align="left" valign="top">Claude 3 Opus</td><td align="left" valign="top">145/191 
(75.9)</td><td align="left" valign="top">38/191 (19.9)</td><td align="left" valign="top">8/191 (4.2)</td></tr></tbody></table></table-wrap><p><xref ref-type="table" rid="table4">Table 4</xref> demonstrates GPT-3.5&#x2019;s significant agreement across all models (<italic>&#x03C7;</italic>&#x00B2;=47.0&#x2010;152.3; <italic>P</italic>&#x003C;.001), with effect sizes revealing distinct patterns such as perfect concordance with Claude 3.5 (Cramer <italic>V</italic>=0.891; <italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=152.3), moderate agreement with Sonar Huge (<italic>V</italic>=0.539; <italic>&#x03C7;</italic>&#x00B2;<sub>2</sub>=54.6) and Claude 3 Opus (<italic>V</italic>=0.496; <italic>&#x03C7;</italic>&#x00B2;<sub>2</sub>=47.0), and finally, a weaker association with GPT-4 (<italic>V</italic>=0.545; <italic>&#x03C7;</italic>&#x00B2;<sub>2</sub>=57.0) despite shared commercial development.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>GPT-3.5 associations.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Chi-square<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> (<italic>df</italic>)</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom">Cramer <italic>V</italic></td></tr></thead><tbody><tr><td align="left" valign="top">Versus Claude 3.5</td><td align="left" valign="top">152.3 (1)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.891</td></tr><tr><td align="left" valign="top">Versus Sonar Huge</td><td align="left" valign="top">54.6 (2)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.539</td></tr><tr><td align="left" valign="top">Versus GPT-4</td><td align="left" valign="top">57.0 (2)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.545</td></tr><tr><td align="left" valign="top">Versus Claude 
3 Opus</td><td align="left" valign="top">47.0 (2)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.496</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Chi-square test results showing associations between GPT-3.5 assessments and those of other artificial intelligence models.</p></fn></table-wrap-foot></table-wrap><p>The sensitivity analysis (summarized in <xref ref-type="table" rid="table5">Table 5</xref>) revealed tool-specific impacts of model exclusion on interrater agreement. For the CASP tool, excluding GPT-4 increased agreement by 20% (&#x03B1;=0.784), while Sonar Huge exclusion raised it by 18% (&#x03B1;=0.773), suggesting that these models introduce divergent interpretations of methodological rigor criteria. Conversely, JBI agreement improved most when excluding Sonar Huge (+18%; &#x03B1;=0.561) but dropped sharply without GPT-3.5 (&#x2212;17%; &#x03B1;=0.398), indicating its stabilizing role for JBI appraisals. ETQS maintained the lowest baseline agreement (&#x03B1;=0.376), with marginal improvements when excluding GPT-4 or Claude 3 Opus (+9%; &#x03B1;=0.409). This aligns with findings from <xref ref-type="table" rid="table6">Table 6</xref>, where ETQS criteria like policy implications (item 35) showed full-spectrum disagreements across models. Notably, proprietary models (GPT-3.5 and Claude 3.5) consistently supported consensus-building, as their exclusion reduced CASP or JBI agreement by 12%&#x2010;17%. 
This pattern mirrors architectural similarities observed in GPT-3.5 and Claude 3.5&#x2019;s coding behaviors.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Sensitivity analysis of interrater agreement (Krippendorff &#x03B1;)<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup> across model exclusion scenarios.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model exclusion scenario</td><td align="left" valign="bottom">JBI<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup> (&#x0394;%<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup>)</td><td align="left" valign="bottom">CASP<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup> (&#x0394;%)</td><td align="left" valign="bottom">ETQS<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup> (&#x0394;%)</td></tr></thead><tbody><tr><td align="left" valign="top">All 5 models</td><td align="left" valign="top">0.477</td><td align="left" valign="top">0.653</td><td align="left" valign="top">0.376</td></tr><tr><td align="left" valign="top">Exclude GPT-3.5</td><td align="left" valign="top">0.398 (&#x2212;17)</td><td align="left" valign="top">0.572 (&#x2212;12)</td><td align="left" valign="top">0.346 (&#x2212;8)</td></tr><tr><td align="left" valign="top">Exclude Claude 3.5</td><td align="left" valign="top">0.468 (&#x2212;2)</td><td align="left" valign="top">0.572 (&#x2212;12)</td><td align="left" valign="top">0.356 (&#x2212;5)</td></tr><tr><td align="left" valign="top">Exclude Sonar Huge</td><td align="left" valign="top"><italic>0.561 (+18)</italic><sup><xref ref-type="table-fn" rid="table5fn6">f</xref></sup></td><td align="left" valign="top"><italic>0.773 (+18)</italic></td><td align="left" valign="top">0.359 (&#x2212;5)</td></tr><tr><td align="left" valign="top">Exclude GPT-4</td><td align="left" valign="top">0.494 (+3)</td><td align="left" valign="top"><italic>0.784 (+20)</italic></td><td align="left" 
valign="top">0.409 (+9)</td></tr><tr><td align="left" valign="top">Exclude Claude 3 Opus</td><td align="left" valign="top">0.468 (&#x2212;2)</td><td align="left" valign="top">0.572 (&#x2212;12)</td><td align="left" valign="top">0.409 (+9)</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>&#x03B1; values represent Krippendorff interrater reliability coefficient.</p></fn><fn id="table5fn2"><p><sup>b</sup>JBI: Joanna Briggs Institute.</p></fn><fn id="table5fn3"><p><sup>c</sup>&#x0394;%=percentage change from full model agreement.</p></fn><fn id="table5fn4"><p><sup>d</sup>CASP: Critical Appraisal Skills Programme.</p></fn><fn id="table5fn5"><p><sup>e</sup>ETQS: Evaluative Tools for Qualitative Studies.</p></fn><fn id="table5fn6"><p><sup>f</sup>Values in italics format highlight agreement improvements &#x2265;10% across all tools.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>High-discrepancy Evaluative Tools for Qualitative Studies (ETQS) criteria across artificial intelligence models.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">ETQS item</td><td align="left" valign="bottom">Criteria description</td><td align="left" valign="bottom">GPT-3.5</td><td align="left" valign="bottom">Claude 3.5</td><td align="left" valign="bottom">Sonar Huge</td><td align="left" valign="bottom">GPT-4</td><td align="left" valign="bottom">Claude 3 Opus</td><td align="left" valign="bottom">Disagreement score<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">35</td><td align="left" valign="top">Generalizability to settings</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Cannot tell</td><td align="left" valign="top">No</td><td align="left" valign="top">No</td><td align="left" valign="top">3</td></tr><tr><td align="left" 
valign="top">36</td><td align="left" valign="top">Generalizability to populations</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Cannot tell</td><td align="left" valign="top">No</td><td align="left" valign="top">No</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">38</td><td align="left" valign="top">Policy implications</td><td align="left" valign="top">Cannot tell</td><td align="left" valign="top">Cannot tell</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Cannot tell</td><td align="left" valign="top">No</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">43</td><td align="left" valign="top">Reviewer identification</td><td align="left" valign="top">Cannot tell</td><td align="left" valign="top">Cannot tell</td><td align="left" valign="top">Cannot tell</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">44</td><td align="left" valign="top">Review date verification</td><td align="left" valign="top">Cannot tell</td><td align="left" valign="top">Cannot tell</td><td align="left" valign="top">Cannot tell</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">8</td><td align="left" valign="top">Methodological framework alignment</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Yes</td><td align="left" valign="top">2</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>Number of distinct response categories (yes or cannot tell or no) per criterion.</p></fn></table-wrap-foot></table-wrap><p><xref ref-type="fig" rid="figure1">Figure 1</xref> illustrates how excluding specific AI 
models affects interrater agreement across 3 qualitative research assessment tools: JBI, CASP, and ETQS. The CASP tool demonstrated the highest baseline agreement (&#x03B1;=0.653), with notable improvements observed when GPT-4 or Sonar Huge was excluded, increasing agreement to 0.784 and 0.773, respectively. These findings suggest that GPT-4 and Sonar Huge may introduce variability in CASP assessments. In contrast, the exclusion of GPT-3.5, Claude 3.5, or Claude 3 Opus reduced agreement to 0.572, highlighting their role in fostering consensus.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Radar-chart visualization upon model exclusion. AI: artificial intelligence; CASP: Critical Appraisal Skills Programme; ETQS: Evaluative Tools for Qualitative Studies; JBI: Joanna Briggs Institute.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e72815_fig01.png"/></fig><p>For the JBI tool, excluding Sonar Huge resulted in the largest improvement in agreement (&#x03B1;=0.561), while removing GPT-3.5 led to a significant drop to 0.398, indicating that GPT-3.5 is a key contributor to maintaining consistency in JBI assessments. The ETQS tool exhibited the lowest baseline agreement (&#x03B1;=0.376), with marginal gains observed when GPT-4 or Claude 3 Opus were excluded, both increasing agreement to 0.409. This suggests that ETQS assessments are generally consistent across models, with GPT-4 and Claude 3 Opus introducing slight variability.</p><p>These results underscore the importance of model selection in AI-assisted qualitative research assessment, as certain models contribute more significantly to consensus, while others may introduce variability depending on the assessment tool used.</p><p><xref ref-type="table" rid="table6">Table 6</xref> highlights the ETQS criteria where AI models demonstrated the most significant disagreements in their assessments. 
Items such as generalizability to settings and populations (items 35 and 36) exhibited the full spectrum of possible responses (&#x201C;Yes,&#x201D; &#x201C;Cannot tell,&#x201D; and &#x201C;No&#x201D;), indicating substantial variability in model interpretation. Other items, including policy implications (item 38) and methodological framework alignment (item 8), also showed notable disagreement, albeit with fewer distinct response categories. These findings underscore the challenges AI models face in achieving consensus on nuanced qualitative criteria, particularly those requiring contextual or interpretive judgment.</p></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study reveals critical insights about AI&#x2019;s role in qualitative research appraisal, particularly in health science contexts where methodological rigor directly impacts evidence-based practice [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. All AI models demonstrated systematic affirmation bias, with &#x201C;Yes&#x201D; rates ranging from 75.9% to 85.4%, suggesting an inherent tendency toward favorable assessments regardless of the assessment tool (<xref ref-type="table" rid="table3">Table 3</xref>). Model-specific variability emerged as a key factor, particularly with GPT-4 diverging significantly (&#x201C;Yes&#x201D;: 115/192, 59.9%) compared to proprietary models like GPT-3.5 and Claude 3.5, which showed near-perfect alignment (Cramer <italic>V</italic>=0.891; <italic>P</italic>&#x003C;.001) as detailed in <xref ref-type="table" rid="table4">Table 4</xref>.</p><p>Tool-dependent disagreements were evident, particularly with ETQS criteria like policy implications (item 35) and generalizability (item 36), which elicited the full spectrum of responses across models. This highlights current limitations of AI in contextual interpretation. 
In health research, such biases could distort evidence syntheses informing clinical guidelines or public health policies, especially for studies like Paper C (multiple sclerosis), where AI&#x2019;s inability to contextualize structural barriers (eg, health care access disparities) risks undermining person-centered care models [<xref ref-type="bibr" rid="ref31">31</xref>].</p></sec><sec id="s4-2"><title>Comparison to Prior Work</title><p>The findings of this study are consistent with emerging research on AI-augmented qualitative analysis. The consensus-building role of proprietary models mirrors previous findings regarding ChatGPT&#x2019;s utility in thematic analysis [<xref ref-type="bibr" rid="ref18">18</xref>]. AI&#x2019;s challenges with nuanced criteria such as policy implications corroborate [<xref ref-type="bibr" rid="ref13">13</xref>] known limitations in interpretive tasks critical for health policy design, such as balancing clinical efficacy with ethical or logistical constraints (eg, insurance coverage gaps in Paper A). The ongoing need for human validation supports the framework proposed by Hitch [<xref ref-type="bibr" rid="ref12">12</xref>], which positions AI as a &#x201C;team member&#x201D; rather than a standalone evaluator. 
This approach is reinforced by the importance of patient-centered transparency in health care AI [<xref ref-type="bibr" rid="ref32">32</xref>], where oversight mechanisms and impact on care experience directly influence trust.</p></sec><sec id="s4-3"><title>Strengths and Limitations</title><p>The strengths of the study are as follows:</p><list list-type="bullet"><list-item><p>Standardized protocols: The use of standardized protocols and independent verification of AI outputs helped mitigate potential bias, especially given the lead author&#x2019;s (AL) dual role as investigator and participant in Paper A.</p></list-item><list-item><p>Diverse model selection: The inclusion of multiple AI models with varied architectures and capabilities facilitated a comprehensive analysis of AI&#x2019;s potential and limitations in qualitative research appraisal.</p></list-item><list-item><p>Tool variety: The application of 3 widely recognized assessment tools (CASP, JBI, and ETQS) provided a robust framework for evaluating AI performance across different dimensions of qualitative research quality.</p></list-item></list><p>The limitations of the study are as follows:</p><list list-type="bullet"><list-item><p>Proprietary model opacity: The proprietary nature of commercial models (GPT-3.5 and Claude 3.5) obscures the architectural factors driving their consensus patterns, potentially masking biases that disproportionately affect vulnerable populations (eg, Paper B&#x2019;s findings on physical education participation barriers).</p></list-item><list-item><p>Dataset scope: The focused dataset of 3 health science papers limits generalizability, although the inclusion of pediatric and chronic disease contexts underscores current challenges for large language models in appraising life span&#x2013;specific health narratives (<xref ref-type="table" rid="table6">Table 6</xref>).</p></list-item><list-item><p>Author dual role: The lead author&#x2019;s (AL) involvement as both investigator and 
participant in Paper A introduced potential interpretation bias, mitigated but not eliminated by standardized protocols.</p></list-item><list-item><p>Absence of human expert ratings: The lack of human expert ratings prevents definitive conclusions about whether AI&#x2019;s &#x201C;favorable bias&#x201D; reflects accuracy or systemic overestimation.</p></list-item></list></sec><sec id="s4-4"><title>Conclusions</title><p>This study demonstrates that AI models exhibit both promise and limitations as evaluators of qualitative research quality. This comprehensive analysis revealed 3 critical insights: first, affirmation bias was evident, with &#x201C;Yes&#x201D; ratings ranging from 75.9% to 85.4% across models, highlighting AI&#x2019;s tendency to favor positive assessments, a pattern that could overstate the feasibility of interventions in health research. Second, model-specific variability emerged, as seen in GPT-4&#x2019;s divergent ratings, which lowered CASP agreement by 20% and underscored the influence of model architecture on appraisal consistency. Third, disagreements were often tool-dependent, particularly for ETQS criteria like policy implications and generalizability, exposing current limitations in AI&#x2019;s contextual interpretation.</p><p>The findings emphasize that AI cannot yet replace human judgment in nuanced qualitative appraisal but could enhance efficiency when strategically implemented. In health research, strong alignment of proprietary models (Cramer <italic>V</italic>=0.891) may expedite systematic reviews of patient experience studies, but their affirmation bias risks inflating confidence in underpowered qualitative evidence used for clinical guidelines. 
Open-source variability, while requiring oversight, could help counterbalance systemic optimism in AI-driven health syntheses.</p><p>Key limitations, including proprietary model opacity, which obscures biases affecting marginalized health populations, dataset scope constraints, and the author&#x2019;s dual role in Paper A warrant cautious interpretation. The absence of human expert ratings is particularly consequential for health research, where patient narratives and clinician insights require a nuanced ethical appraisal that AI&#x2019;s binary frameworks may oversimplify.</p><p>Future research should prioritize three areas: (1) health-specific AI training protocols emphasizing qualitative epistemology to better capture patient-centered care priorities, (2) benchmarking against expert panels to validate accuracy thresholds, and (3) establishing ethical frameworks for disclosing AI&#x2019;s role in health evidence synthesis, ensuring transparency in policy recommendations. As qualitative methodologies evolve alongside AI capabilities, the path forward lies not in human-machine competition but in hybrid workflows that leverage AI&#x2019;s scalability while preserving human expertise for contextual and interpretive tasks.</p></sec></sec></body><back><ack><p>The author declared that they had insufficient funding to support open access publication of this manuscript, including from affiliated organizations or institutions, funding agencies, or other organizations. JMIR Publications provided article processing fee (APF) support for the publication of this paper. Artificial intelligence tools are cited for transparency; their inclusion does not imply endorsement.</p></ack><fn-group><fn fn-type="con"><p>AL served both investigator and author in Paper A, a qualitative study included in the analysis. 
Standardized protocols and independent verification of artificial intelligence outputs mitigated potential bias.</p></fn><fn fn-type="conflict"><p>This study used ChatGPT-3.5, ChatGPT-4 (OpenAI), Claude 3.5, Claude 3 Opus (Anthropic), and Sonar Huge (Perplexity AI) for structured quality assessments. The author is not employed by, holds stock in, or has received financial compensation from these companies. AL holds no affiliation with JMIR journals, holds no patents related to this work, or has financial or personal relationships with individuals or organizations that could influence this research.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CASP</term><def><p>Critical Appraisal Skills Programme</p></def></def-item><def-item><term id="abb3">ETQS</term><def><p>Evaluative Tools for Qualitative Studies</p></def></def-item><def-item><term id="abb4">JBI</term><def><p>Joanna Briggs Institute</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McKenna</surname><given-names>L</given-names> </name></person-group><article-title>Translation of research interviews: do we have a problem with qualitative rigor?</article-title><source>Nurse Author Ed</source><year>2022</year><month>03</month><volume>32</volume><issue>1</issue><fpage>1</fpage><lpage>3</lpage><pub-id pub-id-type="doi">10.1111/nae2.31</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pilbeam</surname><given-names>C</given-names> </name><name name-style="western"><surname>Anthierens</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Vanderslott</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tonkin-Crine</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wanat</surname><given-names>M</given-names> </name></person-group><article-title>Methodological and ethical considerations when conducting qualitative interview research with healthcare professionals: reflections and recommendations as a result of a pandemic</article-title><source>Int J Qual Methods</source><year>2022</year><month>04</month><volume>21</volume><fpage>160940692210777</fpage><pub-id pub-id-type="doi">10.1177/16094069221077763</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pyo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>W</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>EY</given-names> </name><name name-style="western"><surname>Jang</surname><given-names>SG</given-names> </name><name name-style="western"><surname>Ock</surname><given-names>M</given-names> </name></person-group><article-title>Qualitative research in healthcare: necessity and characteristics</article-title><source>J Prev Med Public Health</source><year>2023</year><month>01</month><volume>56</volume><issue>1</issue><fpage>12</fpage><lpage>20</lpage><pub-id pub-id-type="doi">10.3961/jpmph.22.451</pub-id><pub-id pub-id-type="medline">36746418</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Duden</surname><given-names>GS</given-names> </name></person-group><article-title>Challenges to qualitative evidence synthesis&#x2014;aiming for diversity and abstracting without losing meaning</article-title><source>Methods 
Psychol</source><year>2021</year><month>12</month><volume>5</volume><fpage>100070</fpage><pub-id pub-id-type="doi">10.1016/j.metip.2021.100070</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sebele-Mpofu</surname><given-names>FY</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Serpa</surname><given-names>S</given-names> </name></person-group><article-title>Saturation controversy in qualitative research: complexities and underlying assumptions. A literature review</article-title><source>Cogent Soc Sci</source><year>2020</year><month>01</month><volume>6</volume><issue>1</issue><fpage>1838706</fpage><pub-id pub-id-type="doi">10.1080/23311886.2020.1838706</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Noyes</surname><given-names>J</given-names> </name><name name-style="western"><surname>Booth</surname><given-names>A</given-names> </name><name name-style="western"><surname>Moore</surname><given-names>G</given-names> </name><name name-style="western"><surname>Flemming</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tun&#x00E7;alp</surname><given-names>&#x00D6;</given-names> </name><name name-style="western"><surname>Shakibazadeh</surname><given-names>E</given-names> </name></person-group><article-title>Synthesising quantitative and qualitative evidence to inform guidelines on complex interventions: clarifying the purposes, designs and outlining some methods</article-title><source>BMJ Glob Health</source><year>2019</year><volume>4</volume><issue>Suppl 1</issue><fpage>e000893</fpage><pub-id pub-id-type="doi">10.1136/bmjgh-2018-000893</pub-id><pub-id pub-id-type="medline">30775016</pub-id></nlm-citation></ref><ref 
id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Santiago-Delefosse</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gavin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bruchez</surname><given-names>C</given-names> </name><name name-style="western"><surname>Roux</surname><given-names>P</given-names> </name><name name-style="western"><surname>Stephen</surname><given-names>SL</given-names> </name></person-group><article-title>Quality of qualitative research in the health sciences: analysis of the common criteria present in 58 assessment guidelines by expert users</article-title><source>Soc Sci Med</source><year>2016</year><month>01</month><volume>148</volume><fpage>142</fpage><lpage>151</lpage><pub-id pub-id-type="doi">10.1016/j.socscimed.2015.11.007</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sanjari</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bahramnezhad</surname><given-names>F</given-names> </name><name name-style="western"><surname>Fomani</surname><given-names>FK</given-names> </name><name name-style="western"><surname>Shoghi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cheraghi</surname><given-names>MA</given-names> </name></person-group><article-title>Ethical challenges of researchers in qualitative studies: the necessity to develop a specific guideline</article-title><source>J Med Ethics Hist Med</source><year>2014</year><volume>7</volume><fpage>14</fpage><pub-id pub-id-type="medline">25512833</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Buetow</surname><given-names>S</given-names> </name></person-group><article-title>Apophenia, unconscious bias and reflexivity in nursing qualitative research</article-title><source>Int J Nurs Stud</source><year>2019</year><month>01</month><volume>89</volume><fpage>8</fpage><lpage>13</lpage><pub-id pub-id-type="doi">10.1016/j.ijnurstu.2018.09.013</pub-id><pub-id pub-id-type="medline">30316055</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Christou</surname><given-names>P</given-names> </name></person-group><article-title>&#x0397;ow to use artificial intelligence (AI) as a resource, methodological and analysis tool in qualitative research?</article-title><source>TQR</source><year>2023</year><volume>28</volume><issue>7</issue><fpage>1968</fpage><lpage>1980</lpage><pub-id pub-id-type="doi">10.46743/2160-3715/2023.6406</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Al-Busaidi</surname><given-names>ZQ</given-names> </name></person-group><article-title>Qualitative research and its uses in health care</article-title><source>Sultan Qaboos Univ Med J</source><year>2008</year><month>03</month><volume>8</volume><issue>1</issue><fpage>11</fpage><lpage>19</lpage><pub-id pub-id-type="medline">21654952</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hitch</surname><given-names>D</given-names> </name></person-group><article-title>Artificial intelligence augmented qualitative analysis: the way of the future?</article-title><source>Qual Health Res</source><year>2024</year><month>06</month><volume>34</volume><issue>7</issue><fpage>595</fpage><lpage>606</lpage><pub-id 
pub-id-type="doi">10.1177/10497323231217392</pub-id><pub-id pub-id-type="medline">38064244</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tai</surname><given-names>RH</given-names> </name><name name-style="western"><surname>Bentley</surname><given-names>LR</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>X</given-names> </name><etal/></person-group><article-title>An examination of the use of large language models to aid analysis of textual data</article-title><source>Int J Qual Methods</source><year>2024</year><month>01</month><volume>23</volume><fpage>16094069241231168</fpage><pub-id pub-id-type="doi">10.1177/16094069241231168</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Datt</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>H</given-names> </name><name name-style="western"><surname>Aggarwal</surname><given-names>N</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>S</given-names> </name></person-group><article-title>Role of ChatGPT-4 for medical researchers</article-title><source>Ann Biomed Eng</source><year>2024</year><month>06</month><volume>52</volume><issue>6</issue><fpage>1534</fpage><lpage>1536</lpage><pub-id pub-id-type="doi">10.1007/s10439-023-03336-5</pub-id><pub-id pub-id-type="medline">37526801</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Dijk</surname><given-names>SHB</given-names> </name><name name-style="western"><surname>Brusse-Keizer</surname><given-names>MGJ</given-names> </name><name 
name-style="western"><surname>Bucs&#x00E1;n</surname><given-names>CC</given-names> </name><name name-style="western"><surname>van der Palen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Doggen</surname><given-names>CJM</given-names> </name><name name-style="western"><surname>Lenferink</surname><given-names>A</given-names> </name></person-group><article-title>Artificial intelligence in systematic reviews: promising when appropriately used</article-title><source>BMJ Open</source><year>2023</year><month>07</month><day>7</day><volume>13</volume><issue>7</issue><fpage>e072254</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2023-072254</pub-id><pub-id pub-id-type="medline">37419641</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alanazi</surname><given-names>A</given-names> </name></person-group><article-title>Clinicians&#x2019; views on using artificial intelligence in healthcare: opportunities, challenges, and beyond</article-title><source>Cureus</source><volume>15</volume><issue>9</issue><fpage>e45255</fpage><pub-id pub-id-type="doi">10.7759/cureus.45255</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>De Angelis</surname><given-names>L</given-names> </name><name name-style="western"><surname>Baglivo</surname><given-names>F</given-names> </name><name name-style="western"><surname>Arzilli</surname><given-names>G</given-names> </name><etal/></person-group><article-title>ChatGPT and the rise of large language models: the new AI-driven infodemic threat in public health</article-title><source>Front Public Health</source><year>2023</year><volume>11</volume><fpage>1166120</fpage><pub-id pub-id-type="doi">10.3389/fpubh.2023.1166120</pub-id><pub-id 
pub-id-type="medline">37181697</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>VV</given-names> </name><name name-style="western"><surname>van der Lubbe</surname><given-names>SCC</given-names> </name><name name-style="western"><surname>Goh</surname><given-names>LH</given-names> </name><name name-style="western"><surname>Valderas</surname><given-names>JM</given-names> </name></person-group><article-title>Harnessing ChatGPT for thematic analysis: are we ready?</article-title><source>J Med Internet Res</source><year>2024</year><volume>26</volume><fpage>e54974</fpage><pub-id pub-id-type="doi">10.2196/54974</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Senthil</surname><given-names>R</given-names> </name><name name-style="western"><surname>Anand</surname><given-names>T</given-names> </name><name name-style="western"><surname>Somala</surname><given-names>CS</given-names> </name><name name-style="western"><surname>Saravanan</surname><given-names>KM</given-names> </name></person-group><article-title>Bibliometric analysis of artificial intelligence in healthcare research: trends and future directions</article-title><source>Future Healthc J</source><year>2024</year><month>09</month><volume>11</volume><issue>3</issue><fpage>100182</fpage><pub-id pub-id-type="doi">10.1016/j.fhj.2024.100182</pub-id><pub-id pub-id-type="medline">39310219</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hannes</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lockwood</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Pearson</surname><given-names>A</given-names> </name></person-group><article-title>A comparative analysis of three online appraisal instruments&#x2019; ability to assess validity in qualitative research</article-title><source>Qual Health Res</source><year>2010</year><month>12</month><volume>20</volume><issue>12</issue><fpage>1736</fpage><lpage>1743</lpage><pub-id pub-id-type="doi">10.1177/1049732310378656</pub-id><pub-id pub-id-type="medline">20671302</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aydogdu</surname><given-names>ALF</given-names> </name></person-group><article-title>Exploring different aspects of nursing leadership: an integrative review of qualitative studies</article-title><source>Mod Care J</source><year>2023</year><volume>20</volume><issue>4</issue><pub-id pub-id-type="doi">10.5812/modernc-130402</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gottvall</surname><given-names>M</given-names> </name><name name-style="western"><surname>Brunell</surname><given-names>C</given-names> </name><name name-style="western"><surname>Eldebo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Johansson Metso</surname><given-names>F</given-names> </name><name name-style="western"><surname>Jirwe</surname><given-names>M</given-names> </name><name name-style="western"><surname>Carlsson</surname><given-names>T</given-names> </name></person-group><article-title>Post-migration psychosocial experiences and challenges amongst LGBTQ+ forced migrants: a meta-synthesis of qualitative reports</article-title><source>J Adv Nurs</source><year>2023</year><month>01</month><volume>79</volume><issue>1</issue><fpage>358</fpage><lpage>371</lpage><pub-id 
pub-id-type="doi">10.1111/jan.15480</pub-id><pub-id pub-id-type="medline">36320151</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jayachandran</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hill</surname><given-names>K</given-names> </name><name name-style="western"><surname>Walmsley</surname><given-names>AD</given-names> </name></person-group><article-title>A critical review of qualitative research publications in dental implants from 2006 to 2020</article-title><source>Clin Oral Implants Res</source><year>2021</year><month>06</month><volume>32</volume><issue>6</issue><fpage>659</fpage><lpage>671</lpage><pub-id pub-id-type="doi">10.1111/clr.13743</pub-id><pub-id pub-id-type="medline">33715249</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moreno G&#x00F3;mez</surname><given-names>A</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>P</given-names> </name><name name-style="western"><surname>de la Llave Rinc&#x00F3;n</surname><given-names>AI</given-names> </name><name name-style="western"><surname>Efstathiou</surname><given-names>N</given-names> </name></person-group><article-title>Women&#x2019;s experiences of primary dysmenorrhea symptoms: a systematic review of qualitative evidence and meta-aggregation</article-title><source>Women Health</source><year>2023</year><month>09</month><day>14</day><volume>63</volume><issue>8</issue><fpage>658</fpage><lpage>668</lpage><pub-id pub-id-type="doi">10.1080/03630242.2023.2255289</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Au</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Murad-Kassam</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mukanoheli</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Immigrant older adults&#x2019; experiences of aging in place and their neighborhoods: a qualitative systematic review</article-title><source>Int J Environ Res Public Health</source><year>2024</year><month>07</month><day>10</day><volume>21</volume><issue>7</issue><fpage>904</fpage><pub-id pub-id-type="doi">10.3390/ijerph21070904</pub-id><pub-id pub-id-type="medline">39063481</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Flod&#x00E9;n</surname><given-names>A</given-names> </name><name name-style="western"><surname>Brostr&#x00F6;m</surname><given-names>EW</given-names> </name><name name-style="western"><surname>von Heideken</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A qualitative study examining the validity and comprehensibility of physical activity items: developed and tested in children with juvenile idiopathic arthritis</article-title><source>Pediatr Rheumatol Online J</source><year>2019</year><month>04</month><day>25</day><volume>17</volume><issue>1</issue><fpage>16</fpage><pub-id pub-id-type="doi">10.1186/s12969-019-0317-6</pub-id><pub-id pub-id-type="medline">31023371</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meier</surname><given-names>S</given-names> </name><name name-style="western"><surname>H&#x00F6;ger</surname><given-names>B</given-names> </name><name name-style="western"><surname>Giese</surname><given-names>M</given-names> </name></person-group><article-title>&#x201C;If only balls could talk&#x2026;&#x201D;: barriers and opportunities to 
participation for students with blindness and visual impairment in specialized PE</article-title><source>Front Sports Act Living</source><year>2023</year><volume>5</volume><fpage>1286909</fpage><pub-id pub-id-type="doi">10.3389/fspor.2023.1286909</pub-id><pub-id pub-id-type="medline">38162696</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sikes</surname><given-names>EM</given-names> </name><name name-style="western"><surname>Richardson</surname><given-names>EV</given-names> </name><name name-style="western"><surname>Motl</surname><given-names>RW</given-names> </name></person-group><article-title>A qualitative study of exercise and physical activity in adolescents with pediatric-onset multiple sclerosis</article-title><source>Int J MS Care</source><year>2019</year><volume>21</volume><issue>2</issue><fpage>81</fpage><lpage>91</lpage><pub-id pub-id-type="doi">10.7224/1537-2073.2018-033</pub-id><pub-id pub-id-type="medline">31049039</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stickley</surname><given-names>T</given-names> </name><name name-style="western"><surname>O&#x2019;Cathain</surname><given-names>A</given-names> </name><name name-style="western"><surname>Homer</surname><given-names>C</given-names> </name></person-group><article-title>The value of qualitative methods to public health research, policy and practice</article-title><source>Perspect Public Health</source><year>2022</year><month>07</month><volume>142</volume><issue>4</issue><fpage>237</fpage><lpage>240</lpage><pub-id pub-id-type="doi">10.1177/17579139221083814</pub-id><pub-id pub-id-type="medline">35362352</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Bajwa</surname><given-names>J</given-names> </name><name name-style="western"><surname>Munir</surname><given-names>U</given-names> </name><name name-style="western"><surname>Nori</surname><given-names>A</given-names> </name><name name-style="western"><surname>Williams</surname><given-names>B</given-names> </name></person-group><article-title>Artificial intelligence in healthcare: transforming the practice of medicine</article-title><source>Future Healthc J</source><year>2021</year><month>07</month><volume>8</volume><issue>2</issue><fpage>e188</fpage><lpage>e194</lpage><pub-id pub-id-type="doi">10.7861/fhj.2021-0095</pub-id><pub-id pub-id-type="medline">34286183</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cresswell</surname><given-names>K</given-names> </name><name name-style="western"><surname>de Keizer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Magrabi</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Evaluating artificial intelligence in clinical settings&#x2014;let us not reinvent the wheel</article-title><source>J Med Internet Res</source><year>2024</year><month>08</month><day>7</day><volume>26</volume><issue>1</issue><fpage>e46407</fpage><pub-id pub-id-type="doi">10.2196/46407</pub-id><pub-id pub-id-type="medline">39110494</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stroud</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Minteer</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>X</given-names> </name><name 
name-style="western"><surname>Ridgeway</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Barry</surname><given-names>BA</given-names> </name></person-group><article-title>Patient information needs for transparent and trustworthy cardiovascular artificial intelligence: a qualitative study</article-title><source>PLOS Digit Health</source><year>2025</year><month>04</month><volume>4</volume><issue>4</issue><fpage>e0000826</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000826</pub-id><pub-id pub-id-type="medline">40258073</pub-id></nlm-citation></ref></ref-list></back></article>