<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="letter"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e58366</article-id><article-id pub-id-type="doi">10.2196/58366</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Letter</subject></subj-group></article-categories><title-group><article-title>The AI Reviewer: Evaluating AI&#x2019;s Role in Citation Screening for Streamlined Systematic Reviews</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Ghossein</surname><given-names>Jamie</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Hryciw</surname><given-names>Brett N</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ramsay</surname><given-names>Tim</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Kyeremanteng</surname><given-names>Kwadwo</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib></contrib-group><aff id="aff1"><institution>Interdepartmental Division of Critical Care Medicine, University of Toronto</institution><addr-line>Toronto</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff2"><institution>Division of Critical Care, Department of Medicine, University of Ottawa</institution><addr-line>501 Smyth Road</addr-line><addr-line>Ottawa</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff3"><institution>Faculty of Medicine, University of Ottawa</institution><addr-line>Ottawa</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff4"><institution>Clinical Epidemiology, Ottawa Hospital Research Institute</institution><addr-line>Ottawa</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff5"><institution>Institut du Savoir Montfort, Montfort Hospital</institution><addr-line>Ottawa</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Deng</surname><given-names>Jiawen</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Mistry</surname><given-names>Jinal</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Khraisha</surname><given-names>Qusai</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Brett N Hryciw, MD, Division of Critical Care, Department of Medicine, University of Ottawa, 501 Smyth 
Road, Ottawa, ON, Canada, 1 (613) 798-5555 ext 16045; <email>bhryciw@toh.ca</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>28</day><month>3</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e58366</elocation-id><history><date date-type="received"><day>13</day><month>03</month><year>2024</year></date><date date-type="rev-recd"><day>26</day><month>01</month><year>2025</year></date><date date-type="accepted"><day>29</day><month>01</month><year>2025</year></date></history><copyright-statement>&#x00A9;Jamie Ghossein, Brett N Hryciw, Tim Ramsay, Kwadwo Kyeremanteng. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 28.3.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e58366"/><kwd-group><kwd>article screening</kwd><kwd>artificial intelligence</kwd><kwd>systematic review</kwd><kwd>AI</kwd><kwd>large language model</kwd><kwd>LLM</kwd><kwd>screening</kwd><kwd>analysis</kwd><kwd>reviewer</kwd><kwd>app</kwd><kwd>ChatGPT 3.5</kwd><kwd>chatbot</kwd><kwd>dataset</kwd><kwd>data</kwd><kwd>adoption</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Systematic reviews are regarded as one of the highest forms of evidence in medical research and are vital for answering clinical questions [<xref ref-type="bibr" rid="ref1">1</xref>]. However, the conventional systematic review methodology is time-consuming, particularly the manual screening of articles for pertinence [<xref ref-type="bibr" rid="ref2">2</xref>]. The exponential increase in biomedical literature presents a challenge for researchers to remain updated. Artificial intelligence (AI) has shown promise in various fields [<xref ref-type="bibr" rid="ref3">3</xref>], with large language models (LLMs) specifically offering capabilities to interpret complex text, which can be leveraged in the systematic review process [<xref ref-type="bibr" rid="ref4">4</xref>]. We conducted a pilot feasibility study evaluating 5 distinct LLMs in an existing systematic review dataset.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>We compared 5 commonly used LLMs to screen citations from a previously published systematic review on trauma hemorrhage, originally screened by 2 human reviewers [<xref ref-type="bibr" rid="ref5">5</xref>]. 
Of the 1186 total citations, 21 (1.8%) were included for full-text review and 1165 (98.2%) were excluded. We randomly selected 100 excluded citations using Microsoft Excel. Hence, 121 citations (n=21, 17.4% included and n=100, 82.6% excluded) were tested against predefined eligibility criteria using ChatGPT 3.5 (version September 25, 2023), ChatGPT 4 (version September 25, 2023), Google Bard (version 1.15; released on September 2, 2023), Meta Llama 2 (70b parameters, version 2.1.1; released on October 10, 2023), and Claude AI 2 (version 1.3; released on July 11, 2023). We used descriptive statistics to evaluate sensitivity, specificity, and overall accuracy.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>All citations were taken from publicly available, previously published literature. No personal or patient-level data were used, and no identifiers were included. Formal research ethics board approval was therefore not required.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Among the 121 total citations, the LLMs&#x2019; sensitivity (correctly identifying included citations) ranged from 57% to 100%, and specificity (correctly excluding noneligible citations) ranged from 18% to 79%. ChatGPT 3.5 achieved the highest sensitivity (100%, tied with Google Bard) and the highest specificity (79%). 
Full results are shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Performance metrics of large language models in citation screening for systematic reviews, including sensitivity, specificity, and accuracy.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Large language model</td><td align="left" valign="bottom">Sensitivity, %</td><td align="left" valign="bottom">Specificity, %</td><td align="left" valign="bottom">Accuracy, %</td></tr></thead><tbody><tr><td align="left" valign="top">ChatGPT 3.5</td><td align="left" valign="top">100</td><td align="left" valign="top">79</td><td align="left" valign="top">83</td></tr><tr><td align="left" valign="top">ChatGPT 4</td><td align="left" valign="top">95</td><td align="left" valign="top">66</td><td align="left" valign="top">72</td></tr><tr><td align="left" valign="top">Google Bard</td><td align="left" valign="top">100</td><td align="left" valign="top">71</td><td align="left" valign="top">77</td></tr><tr><td align="left" valign="top">Meta Llama 2 (70b parameters)</td><td align="left" valign="top">95</td><td align="left" valign="top">18</td><td align="left" valign="top">34</td></tr><tr><td align="left" valign="top">Claude AI 2</td><td align="left" valign="top">57</td><td align="left" valign="top">77</td><td align="left" valign="top">73</td></tr></tbody></table></table-wrap></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>In this pilot assessment, selected LLMs demonstrated high sensitivity for identifying relevant studies, with ChatGPT 3.5 and Google Bard reaching 100%. Notably, the specificity varied widely, ranging from as low as 18% for Meta Llama 2 to 79% for ChatGPT 3.5. While some LLMs can be remarkably sensitive for screening articles within our sample, excluding irrelevant citations remains a challenge for certain LLMs. 
These findings suggest that AI-driven LLMs could be poised to support the screening phase, potentially replacing the second human reviewer and streamlining the often labor-intensive study screening process.</p><p>The sample size of 121 citations is a limitation, and findings may not be generalizable to other systematic reviews or inclusion and exclusion criteria. Larger studies, ideally with multiple runs of the same citations, are necessary to capture the probabilistic variability inherent to LLMs. As we only ran each citation through a given LLM once, multiple runs or &#x201C;prompt engineering&#x201D; strategies could yield more consistent or refined outcomes when evaluating LLMs. Nonetheless, our study offers a novel approach by directly comparing the performance of multiple LLMs, thus providing insight into how different architectures perform on the same dataset. Future research should explore repeated runs to assess LLM consistency, implement advanced prompt engineering, and investigate the explainability of LLM results.</p><p>LLMs have previously been demonstrated to effectively generate Boolean queries for a systematic review literature search [<xref ref-type="bibr" rid="ref1">1</xref>]. As LLMs evolve further, it is conceivable that they could entirely manage the title and abstract screening. This progress can eventually lead to a fully automated review process, where AI might oversee the search strategy, title and abstract screening, full-text review, data analysis and synthesis, and even drafting and publication. Such automation would epitomize a living systematic review, ensuring evidence is continuously updated as soon as new research is published. 
As transparency and accountability concerns may arise, a robust ethical framework will be paramount as we navigate the advancements of this technology [<xref ref-type="bibr" rid="ref6">6</xref>].</p></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Scells</surname><given-names>H</given-names> </name><name name-style="western"><surname>Koopman</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zuccon</surname><given-names>G</given-names> </name></person-group><article-title>Can ChatGPT write a good boolean query for systematic review literature search?</article-title><conf-name>SIGIR &#x2019;23: The 46th International ACM SIGIR Conference on Research and Development in Information Retrieval</conf-name><conf-date>Jul 23-27, 2023</conf-date><conf-loc>Taipei, Taiwan</conf-loc><fpage>1426</fpage><lpage>1436</lpage><pub-id pub-id-type="doi">10.1145/3539618.3591703</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tsafnat</surname><given-names>G</given-names> </name><name name-style="western"><surname>Glasziou</surname><given-names>P</given-names> </name><name name-style="western"><surname>Karystianis</surname><given-names>G</given-names> </name><name name-style="western"><surname>Coiera</surname><given-names>E</given-names> </name></person-group><article-title>Automated screening 
of research studies for systematic reviews using study characteristics</article-title><source>Syst Rev</source><year>2018</year><month>04</month><day>25</day><volume>7</volume><issue>1</issue><fpage>64</fpage><pub-id pub-id-type="doi">10.1186/s13643-018-0724-7</pub-id><pub-id pub-id-type="medline">29695296</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hryciw</surname><given-names>BN</given-names> </name><name name-style="western"><surname>Fortin</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ghossein</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kyeremanteng</surname><given-names>K</given-names> </name></person-group><article-title>Doctor-patient interactions in the age of AI: navigating innovation and expertise</article-title><source>Front Med (Lausanne)</source><year>2023</year><month>08</month><day>30</day><volume>10</volume><fpage>1241508</fpage><pub-id pub-id-type="doi">10.3389/fmed.2023.1241508</pub-id><pub-id pub-id-type="medline">37711734</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kanjee</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Crowe</surname><given-names>B</given-names> </name><name name-style="western"><surname>Rodman</surname><given-names>A</given-names> </name></person-group><article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title><source>JAMA</source><year>2023</year><month>07</month><day>3</day><volume>330</volume><issue>1</issue><fpage>78</fpage><lpage>80</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.8288</pub-id><pub-id pub-id-type="medline">37318797</pub-id></nlm-citation></ref><ref 
id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ghossein</surname><given-names>J</given-names> </name><name name-style="western"><surname>Fernando</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Rochwerg</surname><given-names>B</given-names> </name><name name-style="western"><surname>Inaba</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lampron</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tran</surname><given-names>A</given-names> </name></person-group><article-title>A systematic review and meta-analysis of sample size methodology for traumatic hemorrhage trials</article-title><source>J Trauma Acute Care Surg</source><year>2023</year><month>06</month><day>1</day><volume>94</volume><issue>6</issue><fpage>870</fpage><lpage>876</lpage><pub-id pub-id-type="doi">10.1097/TA.0000000000003944</pub-id><pub-id pub-id-type="medline">36879398</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Jha</surname><given-names>D</given-names> </name><name name-style="western"><surname>Durak</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>V</given-names> </name><name name-style="western"><surname>Keles</surname><given-names>E</given-names> </name><name name-style="western"><surname>Cicek</surname><given-names>V</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>A conceptual algorithm for applying ethical principles of AI to medical practice</article-title><source>arXiv</source><comment>Preprint posted online on Jan 3, 2025</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2304.11530</pub-id></nlm-citation></ref></ref-list></back></article>