<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e55127</article-id><article-id pub-id-type="doi">10.2196/55127</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>RoBuster&#x2014;Corpus Annotated With Risk of Bias Text Spans in Randomized Controlled Trials in Physiotherapy and Rehabilitation: Corpus Development and Annotation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Dhrangadhariya</surname><given-names>Anjani</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hilfiker</surname><given-names>Roger</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sattelmayer</surname><given-names>Karl Martin</given-names></name><degrees>DRehabSci</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Naderi</surname><given-names>Nona</given-names></name><degrees>Dr rer nat</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Giacomino</surname><given-names>Katia</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Caliesch</surname><given-names>Rahel</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Higgins</surname><given-names>Julian</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Marchand-Maillet</surname><given-names>St&#x00E9;phane</given-names></name><degrees>Dr rer nat</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>M&#x00FC;ller</surname><given-names>Henning</given-names></name><degrees>Dr rer nat</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="aff" rid="aff8">8</xref></contrib></contrib-group><aff id="aff1"><institution>Informatics Institute, HES-SO Valais-Wallis</institution><addr-line>Rue du Technopole 3</addr-line><addr-line>Sierre</addr-line><country>Switzerland</country></aff><aff id="aff2"><institution>Institute of Health Sciences, HES-SO Valais-Wallis</institution><addr-line>Thermenstrasse 41</addr-line><addr-line>Leukerbad</addr-line><addr-line>Valais</addr-line><country>Switzerland</country></aff><aff id="aff3"><institution>Physiotherapy Tschopp &#x0026; 
Hilfiker</institution><addr-line>Brig</addr-line><addr-line>Valais</addr-line><country>Switzerland</country></aff><aff id="aff4"><institution>Centre national de la recherche scientifique, Laboratoire Interdisciplinaire des Sciences du Num&#x00E9;rique, Universit&#x00E9; Paris-Saclay</institution><addr-line>Orsay</addr-line><country>France</country></aff><aff id="aff5"><institution>Population Health Sciences, Bristol Medical School, University of Bristol</institution><addr-line>Bristol</addr-line><country>United Kingdom</country></aff><aff id="aff6"><institution>Centre d'Informatique Universitaire, University of Geneva</institution><addr-line>Geneva</addr-line><country>Switzerland</country></aff><aff id="aff7"><institution>Medical Faculty, University of Geneva</institution><addr-line>Geneva</addr-line><country>Switzerland</country></aff><aff id="aff8"><institution>The Sense Innovation and Research Center</institution><addr-line>Sion</addr-line><country>Switzerland</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Zheng</surname><given-names>Jiaping</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Zhu</surname><given-names>Lingxuan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Kommireddy</surname><given-names>Shreeven</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Anjani Dhrangadhariya, PhD, Informatics Institute, HES-SO Valais-Wallis, Rue du Technopole 3, Sierre, 3960, Switzerland, 41 787084007; <email>anjani.k.dhrangadhariya@gmail.com</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date 
pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>27</day><month>4</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e55127</elocation-id><history><date date-type="received"><day>05</day><month>12</month><year>2023</year></date><date date-type="rev-recd"><day>15</day><month>01</month><year>2026</year></date><date date-type="accepted"><day>16</day><month>01</month><year>2026</year></date></history><copyright-statement>&#x00A9; Anjani Dhrangadhariya, Roger Hilfiker, Karl Martin Sattelmayer, Nona Naderi, Katia Giacomino, Rahel Caliesch, Julian Higgins, St&#x00E9;phane Marchand-Maillet, Henning M&#x00FC;ller. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 27.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e55127"/><abstract><sec><title>Background</title><p>Risk of bias (RoB) assessment of randomized clinical trials (RCTs) is vital to answering systematic review questions accurately. 
Manual RoB assessment for hundreds of RCTs is a cognitively demanding and lengthy process. Automation has the potential to assist reviewers in rapidly identifying text descriptions in RCTs that indicate potential risks of bias. However, no corpus annotated with RoB text spans exists that could be used to fine-tune or evaluate large language models (LLMs), and there are no established guidelines for annotating the RoB spans in RCTs.</p></sec><sec><title>Objective</title><p>The revised Cochrane RoB 2 tool (RoB 2) provides comprehensive guidelines for RoB assessment; however, due to the inherent subjectivity of this tool, it cannot be directly used as RoB annotation guidelines. The study aimed to develop precise RoB text span annotation instructions that could address this subjectivity and thus aid the corpus annotation.</p></sec><sec sec-type="methods"><title>Methods</title><p>We leveraged RoB 2 guidelines to develop visual instructional placards that serve as annotation guidelines for RoB spans and risk judgments. Expert annotators used these visual placards to annotate a dataset named RoBuster, consisting of 41 full-text RCTs from the domains of physiotherapy and rehabilitation. We report interannotator agreement (IAA) between 2 annotators for text span annotations before and after applying visual instructions on a subset (n=9) of RoBuster. We also provide IAA on bias risk judgments using Cohen &#x03BA;. Moreover, we used a portion of RoBuster (n=10) to evaluate an LLM using a straightforward evaluation framework. This evaluation aimed to gauge the performance of an LLM (here GPT-3.5) in the challenging task of RoB span extraction and to demonstrate the utility of this corpus.</p></sec><sec sec-type="results"><title>Results</title><p>We present a corpus of 41 RCTs with fine-grained text span annotations comprising more than 28,427 tokens belonging to 22 RoB classes. 
The IAA at the text span level calculated using the F1 measure varies from 0% to 90%, while Cohen &#x03BA; for risk judgments ranges between &#x2013;0.235 and 1.0. Using visual instructions for annotation increases the IAA by more than 17 percentage points. LLM (GPT-3.5) shows promising but varied observed agreements with the expert annotation across the different bias questions.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Despite having comprehensive bias assessment guidelines and visual instructional placards, RoB annotation remains a complex task. Using visual placards for bias assessment and annotation enhances IAA compared to cases where visual placards are absent; however, text annotation remains challenging for the subjective questions and the questions for which annotation data are unavailable in RCTs. Similarly, while GPT-3.5 demonstrates effectiveness, its accuracy diminishes with more subjective RoB questions and low information availability.</p></sec></abstract><kwd-group><kwd>risk of bias</kwd><kwd>corpus annotation</kwd><kwd>natural language processing</kwd><kwd>large language models</kwd><kwd>LLM</kwd><kwd>information extraction</kwd><kwd>RoBuster</kwd><kwd>corpus</kwd><kwd>randomized controlled trials</kwd><kwd>RCT</kwd><kwd>reviewer</kwd><kwd>tools</kwd><kwd>physiotherapy</kwd><kwd>rehabilitation</kwd><kwd>effectiveness</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Systematic reviews (SRs) synthesized using randomized controlled trials (RCTs) are the highest quality of evidence in the evidence pyramid. SRs help medical professionals make informed health decisions and guide health policies [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. 
An RCT tests an intervention&#x2019;s effectiveness by randomly assigning patients to intervention groups; for example, the impact of the intervention under investigation is compared to other interventions in a controlled setting [<xref ref-type="bibr" rid="ref3">3</xref>]. Theoretically, RCTs are low on biases given the randomized design, but biases can still infiltrate the design, execution, or reporting phases. Such biases may cause medical professionals to misjudge intervention effects, impacting health outcomes [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Therefore, bias assessment, known as risk-of-bias (RoB) assessment, is vital for RCTs used for writing SRs.</p><p>There are several tools to assess RoB, including the Cochrane Collaboration&#x2019;s RoB Tool, PEDro RoB scale, revised Cochrane RoB 2 tool, AMSTAR (A Measurement Tool to Assess Systematic Reviews) or AMSTAR 2, EPOC (Effective Practice and Organization of Care) RoB Tool, and other checklists [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. These tools provide structured questions to elicit bias-relevant information from RCTs. Manual RoB assessment, a time-consuming task requiring substantial expertise, can take hours per RCT and months for a full SR, emphasizing the need for automation [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. Moreover, writing SRs is highly resource-heavy, taking about 6 months to several years to complete [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. Machine learning (ML) could expedite this process by pinpointing bias-relevant RCT text, aiding quicker quality assessments [<xref ref-type="bibr" rid="ref16">16</xref>]. 
Marshall et al [<xref ref-type="bibr" rid="ref17">17</xref>] attempted automation of RoB assessment using a distant supervision approach supported by proprietary data from the Cochrane Database of Systematic Reviews (CDSR). The study relied on manually entered data from the CDSR, which is behind a paywall, and it automated the assessment based on Cochrane RoB 1 guidelines rather than the latest RoB 2 [<xref ref-type="bibr" rid="ref7">7</xref>]. Although RoB 1 is most frequently used for assessment, the recently revised Cochrane RoB 2 offers significant differences [<xref ref-type="bibr" rid="ref18">18</xref>]. Compared to RoB 1, RoB 2 provides a reliable and concrete structure to the RoB evaluation by developing comprehensive guidelines that aim to increase consistency [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. The use of RoB 2 increased from 0% in 2019 to 24.1% in 2022, indicating the need to switch to this updated tool for bias assessment [<xref ref-type="bibr" rid="ref19">19</xref>].</p><p>Millard et al [<xref ref-type="bibr" rid="ref20">20</xref>] also explored ML-based RoB assessment using proprietary data. Their work using these paywalled data was used to develop RobotReviewer, which has been evaluated by several studies for its human-level performance [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref24">24</xref>]. A lack of public RoB-annotated data still limits community advancements. Wang et al [<xref ref-type="bibr" rid="ref25">25</xref>] recently released RoB datasets for animal studies, but human clinical trials still lack a comprehensive RoB corpus. Manual RoB assessment is a complex, expert-led task laden with subjective judgments [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. Systematically translating this manual process for developing a RoB annotated corpus requires a carefully designed annotation scheme and annotation guidelines. 
We previously worked on a pilot study to test whether RoB 2 guidelines could be effectively used as annotation guidelines to annotate a corpus of RCTs with RoB. We concluded that RoB 2 cannot be used as text annotation guidelines but did not provide any annotation guidelines [<xref ref-type="bibr" rid="ref28">28</xref>]. Here, we aim to establish clear annotation guidelines to annotate RCTs with RoB spans corresponding to the RoB 2 tool [<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>In addition, recent large language models (LLMs) have shown potential in handling complex tasks with minimal instructions [<xref ref-type="bibr" rid="ref29">29</xref>]. However, their capability to identify RoB spans in RCTs has yet to be assessed. Our contributions with this paper are 5-fold as follows:</p><list list-type="order"><list-item><p>Development of detailed annotation guidelines for RCT RoB spans.</p></list-item><list-item><p>Development of visual placards to simplify annotation and assist trainee RoB assessors.</p></list-item><list-item><p>Compilation of &#x201C;RoBuster,&#x201D; a corpus of 41 annotated RCTs with 22 RoB span types, for ML and LLM training or benchmarking.</p></list-item><list-item><p>Evaluation of an LLM in identifying answers to signaling questions using prompts.</p></list-item><list-item><p>Open sharing of annotation guidelines, dataset, and LLM prompts with the community.</p></list-item></list></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>This section details the annotation scheme, software, and visual placard development. With no existing RoB span annotation guidelines, we created them from scratch based on the revised Cochrane RoB 2 tool [<xref ref-type="bibr" rid="ref6">6</xref>]. We created a draft of the visual guidelines, doubly annotated a fraction of the RCTs using it, and refined the guidelines based on the conflicts identified. 
<xref ref-type="fig" rid="figure1">Figure 1</xref> illustrates our methodology starting from data collection.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>The flowchart (right) illustrates our methodology starting from data collection until interannotator agreement (IAA) calculation. The agreement is calculated between 10 RCTs annotated (left) in Dhrangadhariya et al [<xref ref-type="bibr" rid="ref28">28</xref>] and 9 RCTs from the current work. LLM: large language model; RCT: randomized controlled trial</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e55127_fig01.png"/></fig></sec><sec id="s2-2"><title>Expert Team</title><p>RoB annotation requires specialized expertise due to the need to thoroughly review the entire RCT text to identify 22 different bias categories. Our annotation team comprised 2 experts in RoB assessment in the physiotherapy and rehabilitation domains: an epidemiology researcher (RH) and an associate professor in physiotherapy (KMS), both with extensive experience in physiotherapy, statistical methods, and SRs. Two senior PhD students (KG and RC) contributed to the development of the visual annotation guidelines. Two researchers in natural language processing, an associate professor in computational linguistics (NN) and a PhD student in computer science (AKD), assisted in creating the visual guidelines, which serve as a benchmark for RoB text span extraction. 
Finally, JPTH provided feedback to shape the visual annotation guidelines (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s2-3"><title>Ethical Considerations</title><p>In accordance with the Swiss Federal Act on Research involving Human Beings (Human Research Act) of September 30, 2011 (SR 810.30), formal ethical approval by a Cantonal Ethics Committee was not required as the research did not concern human diseases or the structure and function of the human body. The experts who undertook the visual placards development and the annotation process for this corpus were informed about the purpose of the annotation project and agreed to voluntarily participate in the study. After agreeing to participate, they remained free to withdraw their participation at any time without consequences of any kind. They were informed of the purpose and nature of the study via a presentation and had an opportunity to ask questions. They were also fully informed about the eventual publication of the findings. Each expert provided consent for the publication of the annotated corpus, along with the understanding that any identifying information would be appropriately anonymized to protect their privacy. The expert annotators volunteered their time and received no financial compensation.</p></sec><sec id="s2-4"><title>Annotation Scheme</title><p>Creating a new annotated corpus requires defining or adopting an annotation scheme. To our knowledge, the only existing RoB span annotation scheme is from our previous work [<xref ref-type="bibr" rid="ref28">28</xref>]. Rather than developing a new one, we adapted and enhanced this scheme, addressing prior limitations. The scheme aligns with the RoB 2 assessment, which organizes bias into 5 domains reflecting different trial design aspects. Each risk domain decomposes into several signaling questions totaling 22 (<xref ref-type="table" rid="table1">Table 1</xref>). 
Each question prompts the assessor to look for relevant text evidence in the trial and judge risk response for that signaling question (SQ; <xref ref-type="table" rid="table2">Table 2</xref>). For detailed explanations of the SQs, see Cochrane RoB 2.0 guidelines as the original document and <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref6">6</xref>].</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>The table lists signaling questions from each bias domain from the revised Cochrane Risk of Bias (RoB) tool.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Question number</td><td align="left" valign="bottom">Signaling question</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">RoB 1</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 1.1</td><td align="left" valign="top">Was the allocation sequence random?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 1.2</td><td align="left" valign="top">Was the allocation sequence concealed until participants were enrolled and assigned to interventions?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 1.3</td><td align="left" valign="top">Did baseline differences between intervention groups suggest a problem with the randomization process?</td></tr><tr><td align="left" valign="top" colspan="2">RoB 2</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.1</td><td align="left" valign="top">Were participants aware of their assigned intervention during the trial?</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.2</td><td align="left" valign="top">Were carers and people delivering the interventions aware of participants&#x2019; assigned intervention during the trial?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.3</td><td align="left" valign="top">Were there deviations from the intended intervention that arose because of the trial context?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.4</td><td align="left" valign="top">Were these deviations likely to have affected the outcome?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.5</td><td align="left" valign="top">Were these deviations from the intended intervention balanced between groups?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.6</td><td align="left" valign="top">Was an appropriate analysis used to estimate the effect of assignment to intervention?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.7</td><td align="left" valign="top">Was there potential for a substantial impact (on the result) of the failure to analyze participants in the group to which they were randomized?</td></tr><tr><td align="left" valign="top" colspan="2">RoB 3</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 3.1</td><td align="left" valign="top">Were data for this outcome available for all, or nearly all, participants randomized?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 
3.2</td><td align="left" valign="top">Is there evidence that the result was not biased by missing outcome data?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 3.3</td><td align="left" valign="top">Could missingness in the outcome depend on its true value?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 3.4</td><td align="left" valign="top">Is it likely that missingness in the outcome depended on its true value?</td></tr><tr><td align="left" valign="top" colspan="2">RoB 4</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.1</td><td align="left" valign="top">Was the method of measurement of the outcome inappropriate?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.2</td><td align="left" valign="top">Could measurement or ascertainment of the outcome have differed between intervention groups?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.3</td><td align="left" valign="top">Were outcome assessors aware of the intervention received by study participants?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.4</td><td align="left" valign="top">Could the assessment of the outcome have been influenced by knowledge of the intervention received?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.5</td><td align="left" valign="top">Is it likely that the assessment of the outcome was influenced by knowledge of the intervention received?</td></tr><tr><td align="left" valign="top" colspan="2">RoB 
5</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 5.1</td><td align="left" valign="top">Were the data that produced this result analyzed in accordance with a pre-specified analysis plan that was finalized before unblinded outcome data were available for analysis?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 5.2</td><td align="left" valign="top">Is the numerical result being assessed likely to have been selected, on the basis of the results, from multiple eligible outcome measurements (eg, scales, definitions, time points) within the outcome domain?</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 5.3</td><td align="left" valign="top">Is the numerical result being assessed likely to have been selected, on the basis of the results, from multiple eligible analyses of the data?</td></tr></tbody></table></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>The table lists bias domains from the revised Cochrane Risk of Bias (RoB) tool and the number of signaling questions (SQs) per domain.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Class</td><td align="left" valign="bottom">Domain</td><td align="left" valign="bottom">SQ</td></tr></thead><tbody><tr><td align="left" valign="top">RoB 1</td><td align="left" valign="top">Biases arising from the <italic>randomization process</italic></td><td align="char" char="." valign="top">3</td></tr><tr><td align="left" valign="top">RoB 2</td><td align="left" valign="top">Biases due to <italic>deviations from intended interventions</italic></td><td align="char" char="." 
valign="top">7</td></tr><tr><td align="left" valign="top">RoB 3</td><td align="left" valign="top">Bias due to <italic>missing outcome data</italic></td><td align="char" char="." valign="top">4</td></tr><tr><td align="left" valign="top">RoB 4</td><td align="left" valign="top">Bias in the <italic>measurement of the outcome</italic></td><td align="char" char="." valign="top">5</td></tr><tr><td align="left" valign="top">RoB 5</td><td align="left" valign="top">Bias in the <italic>selection of the reported result</italic></td><td align="char" char="." valign="top">3</td></tr></tbody></table></table-wrap><p>The response options for the RoB judgment include &#x201C;Yes,&#x201D; &#x201C;Probably yes,&#x201D; &#x201C;No,&#x201D; &#x201C;Probably no,&#x201D; or &#x201C;No information.&#x201D; Reviewers assess each SQ by examining the factual evidence in the RCT. For example, the SQ &#x201C;Was the allocation sequence random?&#x201D; is assessed by checking the randomization method. A well-executed method results in a &#x201C;Yes&#x201D; response (low risk), while a poorly executed method leads to a &#x201C;No&#x201D; (high risk) [<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>For RoB span annotation, we mimic the assessment process by considering evidence text spans in the RCT as the main units of annotation. Each span corresponds to an answer for a specific SQ and is annotated with a label. Multiple spans (sentence and paragraph) across the RCT can be annotated to answer a single SQ if needed. The annotation label incorporates information about the SQ number and its domain (for the above example, &#x201C;1.1&#x201D; for the first domain and first SQ of the domain). The response option for risk judgment is incorporated in the label, such as &#x201C;1.1 Yes Good&#x201D; for a well-executed randomization procedure and &#x201C;1.1 No Bad&#x201D; otherwise (<xref ref-type="fig" rid="figure2">Figure 2</xref>). 
To improve interannotator agreement (IAA), we collapse &#x201C;Yes&#x201D; and &#x201C;Probably Yes&#x201D; to a single &#x201C;Yes,&#x201D; and similarly for &#x201C;No&#x201D; and &#x201C;Probably No&#x201D; [<xref ref-type="bibr" rid="ref28">28</xref>]. As shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>, these collapsed responses do not affect the final risk judgment for a domain (low, high, or some concerns). Therefore, except for some special case SQs, we collapse these response options in this work. Our scheme includes 22 entities for the 22 SQs, each with 2 response options (&#x201C;Yes&#x201D; or &#x201C;No&#x201D;) and 2 risk judgments (&#x201C;Good&#x201D; or &#x201C;Bad&#x201D;); &#x201C;Good&#x201D; implies low risk and &#x201C;Bad&#x201D; implies high risk. The &#x201C;No Information&#x201D; option is removed unless there is truly no text evidence, though it remains for specific SQs such as SQ 2.1. For instance, if a trial describes a &#x201C;random number generator and sealed envelopes&#x201D; but lacks details on envelope opacity, &#x201C;No Information&#x201D; is considered an appropriate label.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Algorithm for suggested judgment of risk of bias (RoB) arising from the randomization process. The figure is recreated from the revised Cochrane RoB 2 tool (RoB 2) [<xref ref-type="bibr" rid="ref6">6</xref>].</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e55127_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Algorithm for suggested judgment of risk of bias arising from the randomization process. The figure is recreated from the revised Cochrane Risk of Bias 2 tool. 
N: no; NI: no information; PN: probably no; PY: probably yes; Y: yes [<xref ref-type="bibr" rid="ref6">6</xref>].</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e55127_fig03.png"/></fig></sec><sec id="s2-5"><title>Data Collection</title><p>A dataset of 41 RCTs from the physiotherapy and rehabilitation domains was compiled by RH. The RCTs included in the corpus were carefully curated from a selection of high-quality journals in physiotherapy and rehabilitation, as recommended by our institute&#x2019;s librarian (eg, PLoS). To ensure consistency with modern reporting practices, we included only RCTs published after 2010. To facilitate open sharing and publication of the annotated corpus, we included only articles available under the CC-BY-0 license. Additional details are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>To create this corpus, PDFs of full-text RCTs were extracted, and each article was paired with its trial registry whenever available. Each PDF was renamed with the primary outcome to be assessed using the RoB 2 tool before being uploaded to the annotation software. To ensure that various primary outcome types were represented in the corpus, we included 17, 17, and 7 RCTs addressing objective, subjective, and mortality primary outcomes, respectively, following the rationale that RoB assessment results are related to the outcomes [<xref ref-type="bibr" rid="ref30">30</xref>-<xref ref-type="bibr" rid="ref33">33</xref>]. The rationale behind this is described in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-6"><title>Visual Placards Development</title><p>Although RoB 2 guidelines are widely used for bias assessment, there has been some research on their reliability. 
Minozzi et al [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref34">34</xref>] addressed this issue by creating an ID that reduces subjectivity in the RoB 2 guidelines, providing clearer instructions for assessment. Before implementing the ID, the agreement among 4 expert RoB assessors in the study by Minozzi et al [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref34">34</xref>] was zero, but it improved after adopting the ID. Several other papers explored the subjectivity and reliability of Cochrane RoB 1 and 2 tools [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. To enhance consistency and reliability, we developed precise annotation instructions using the RoB 2 tool and collaborated with experts to format these into visual placards. Each placard, structured as a flowchart, guides annotators in answering SQs and labeling text with risk judgments. While RoB 2 SQs are mainly factual, they allow for subjective judgments, which the placards help standardize.</p></sec><sec id="s2-7"><title>Annotation</title><p>The annotation process for the 41 RCTs (see &#x201C;Data Collection&#x201D; section) began after developing visual placards. Annotators used the complete RoB 2 guidance alongside these placards, following instructions closely for each SQ. For each SQ, the placards guided annotators to relevant sections within the RCTs, to identify and highlight pertinent text to answer each question, selecting labels as defined in the annotation scheme. Domain 2 of RoB 2 was assessed with respect to the effect of assignment to the intervention (intention-to-treat estimand). Signaling questions related to the &#x201C;effect of adhering to the intervention&#x201D; were not annotated.</p><p>The annotation was done in Tagtog (tagtog Sp. z o.o.), a commercial PDF annotation tool [<xref ref-type="bibr" rid="ref37">37</xref>].
Of the 41 RCTs, 9 were doubly annotated by RH and KMS to calculate IAA, with the remaining (n=32) singly annotated by RH. Conflict resolution on the doubly annotated RCTs helped refine the visual placards before annotating the rest. After annotating the 9 RCTs, we transitioned to the PAWLS (PDF Annotation With Labels and Structure) annotation tool (Allen Institute for Artificial Intelligence; <xref ref-type="fig" rid="figure4">Figure 4</xref>), a free PDF annotation platform [<xref ref-type="bibr" rid="ref38">38</xref>]. Annotating PDFs preserves the structure of sections, tables, and figures, improving annotation speed and quality and ease of annotation for our experts who volunteered for annotation. Feedback from them is detailed in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>A screenshot of PAWLS (PDF Annotation With Labels and Structure; Allen Institute for Artificial Intelligence) interface with an example PDF and risk of bias (RoB) annotations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e55127_fig04.png"/></fig></sec><sec id="s2-8"><title>IAA Measure</title><p>We report IAA on the doubly annotated RCTs. The IAA was calculated at 2 levels, assessing annotator agreement on text spans for SQs using pairwise F1, which excludes unannotated tokens and is well-suited for token-level annotation tasks. Pairwise F1 is measured as shown below for each pair of annotators by treating one annotator&#x2019;s labels as &#x201C;true&#x201D; and the other&#x2019;s as &#x201C;predicted&#x201D; [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. In this study, IAA was measured after incorporating visual placards into the annotation process. 
To evaluate the impact of these placards on annotation quality, we also compare the F1 IAA with results from our previous work, where n=10 RCTs were annotated without the use of placards (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><disp-formula id="equWL1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>F</mml:mi><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>m</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mi>T</mml:mi><mml:mi>r</mml:mi><mml:mi>u</mml:mi><mml:mi>e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>P</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mi>T</mml:mi><mml:mi>r</mml:mi><mml:mi>u</mml:mi><mml:mi>e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>P</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>P</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>N</mml:mi><mml:mi>e</mml:mi><mml:mi>g</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>We also check annotator agreement on risk judgments for each SQ using prevalence and bias-adjusted &#x03BA; (PABAK) and 
observed percent agreement. PABAK (&#x03BA;<sub>PABAK</sub>), an extension of Cohen &#x03BA; that accounts for prevalence and bias, is commonly used for classification tasks and is ideal for evaluating reliability at the risk judgment level. Interpretation guidelines for both IAA measures are shown in <xref ref-type="table" rid="table3">Table 3</xref> [<xref ref-type="bibr" rid="ref41">41</xref>-<xref ref-type="bibr" rid="ref44">44</xref>].</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Interpretation of pairwise F1-measure, &#x03BA;<sub>PABAK</sub>, and observed agreement.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Metric</td><td align="left" valign="bottom">Value</td></tr><tr><td align="left" valign="bottom" colspan="2">Pairwise F1</td></tr></thead><tbody><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Poor</td><td align="left" valign="top">0&#x2010;0.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Slight</td><td align="left" valign="top">1&#x2010;20.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Fair</td><td align="left" valign="top">21&#x2010;40.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Good</td><td align="left" valign="top">41&#x2010;60.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Substantial</td><td align="left" valign="top">61&#x2010;80.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Almost perfect</td><td align="left" valign="top">81&#x2010;99.99</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Perfect</td><td align="left" valign="top">100</td></tr><tr><td align="left" valign="top" colspan="2">&#x03BA;<sub>PABAK</sub></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No agreement</td><td align="left" valign="top">&#x2264;0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>None to slight</td><td align="left" valign="top">0.0&#x2010;0.20</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Minimal</td><td align="left" valign="top">0.21&#x2010;0.39</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Weak</td><td align="left" valign="top">0.40&#x2010;0.59</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Moderate</td><td align="left" valign="top">0.60&#x2010;0.79</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Strong</td><td align="left" valign="top">0.80&#x2010;0.90</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Almost perfect</td><td align="left" valign="top">&#x003E;0.90</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Perfect</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top" colspan="2">Observed agreement</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>None</td><td align="left" valign="top">0%</td></tr><tr><td align="left"
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Very low</td><td align="left" valign="top">1%&#x2010;10%</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Low</td><td align="left" valign="top">11%&#x2010;30%</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Moderate</td><td align="left" valign="top">31%&#x2010;50%</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>High</td><td align="left" valign="top">51%&#x2010;70%</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Very high</td><td align="left" valign="top">71%&#x2010;90%</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Perfect</td><td align="left" valign="top">&#x003E;90%</td></tr></tbody></table></table-wrap></sec><sec id="s2-9"><title>LLM Evaluation</title><p>Our annotation guidelines were initially adapted for benchmarking traditional ML approaches rather than LLMs. This meant we restricted certain annotations, assuming the PDFs would be converted into text via optical character recognition, thus losing table and figure structures that classical ML models cannot interpret without significant adjustments [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]. Recent advancements with LLMs have offered a better alternative and have made us rethink the evaluation. The bar for clinical applications is high, and it is necessary to evaluate LLMs for the challenging clinical tasks like RoB span extraction [<xref ref-type="bibr" rid="ref47">47</xref>]. 
ChatPDF allows direct interaction between LLMs and PDFs, negating the clumsy PDF-to-text conversion [<xref ref-type="bibr" rid="ref48">48</xref>]. Therefore, we consider it essential to evaluate LLMs instead of forcefully adapting the evaluation to a classical ML problem.</p><p>We formulated the task as a zero-shot RoB text span extraction task with an aim to gauge whether an LLM encodes knowledge related to assessing trial biases. We used simple prompt constructs of the structure &#x201C;Answer the {<italic>SQ</italic>} + Action item to extract sentence supporting the answer&#x201D; (<xref ref-type="other" rid="box1">Textbox 1</xref>). ChatPDF used these prompts to identify relevant paragraphs and generate answers to the SQs using GPT-3.5, mirroring the annotators&#x2019; task.</p><boxed-text id="box1"><title>Example prompt used for large language model evaluation.</title><p>Question 4.3: Were outcome assessors aware of the intervention received by study participants? Provide an answer and extract the supporting sentences that you write your answer based on. Extract the sentences in JSON format.</p></boxed-text><p>LLM performance was measured by agreement on response options and extracted text span or evidence. If the LLM&#x2019;s response matched the expert annotator&#x2019;s selection, it was considered correct. Similarly, if the text extracted by the LLM as evidence for answering the SQ fuzzy-matched the text selected by the expert annotator, the extraction was considered correct.
Both skills were evaluated using observed agreement metrics, <inline-formula><mml:math id="ieqn1"><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>O</mml:mi></mml:mrow></mml:msub><mml:mfenced separators="|"><mml:mrow><mml:mi>E</mml:mi><mml:mi>x</mml:mi><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:mfenced></mml:math></inline-formula> for measuring agreement over extraction and <inline-formula><mml:math id="ieqn2"><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>O</mml:mi></mml:mrow></mml:msub><mml:mfenced separators="|"><mml:mrow><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:mfenced></mml:math></inline-formula> for measuring agreement over response judgments, and were interpreted as per <xref ref-type="table" rid="table3">Table 3</xref>. Observed agreement is essentially the number of documents for an RoB SQ where LLM responses align with those of the human expert, divided by the total number of documents assessed [<xref ref-type="bibr" rid="ref49">49</xref>]. For cases where annotators found no information in the RCT, ChatPDF&#x2019;s ability to recognize this absence was also evaluated. We set the temperature to 0 to ensure a deterministic setting for span extraction and response generation. This ensures exact text spans are extracted from the input RCT. This evaluation was done manually for 10 out of the 41 annotated RCTs. Details of the RCTs used for LLM evaluation are in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Visual Placards</title><p>A total of 27 placards were developed to address the 22 SQs.
Details of the annotation guidelines and visual placards are available in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendices 1</xref> and <xref ref-type="supplementary-material" rid="app3">3</xref>. <xref ref-type="fig" rid="figure5">Figure 5</xref> shows an example placard for annotating SQ 3.1 (&#x201C;Were data for this outcome available for all, or nearly all participants randomized?&#x201D;), which assesses the completeness of outcome data in an RCT. Missing outcome data can compromise statistical power and treatment effect estimates. The first diamond on the placard instructs annotators to check the &#x201C;Results&#x201D; section (priority indicated by green arrow) and the flowchart and table within the &#x201C;Results&#x201D; section (second priority) to identify outcome data at the specified time point. If outcome data were available for at least 95% of participants, annotators mark relevant text descriptions as &#x201C;3.1 Yes Good,&#x201D; indicating a low bias. If data were available for fewer than 95%, they mark it as &#x201C;3.1 No Bad,&#x201D; indicating a high bias. The placard includes visual cues to guide the annotation process efficiently.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Sample annotation instruction placard for the signaling question (SQ) 3.1 designed and adapted using the Cochrane Risk of Bias (RoB) 2 tool [<xref ref-type="bibr" rid="ref6">6</xref>].</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e55127_fig05.png"/></fig></sec><sec id="s3-2"><title>The Corpus: RoBuster</title><p><xref ref-type="fig" rid="figure6">Figure 6</xref> shows that SQ 1.3 had a much higher number of annotated tokens (n=16,446), compared to fewer than 2600 tokens for other SQs. This is because SQ 1.3 required annotating the entire baseline patient characteristics table, as instructed by the visual placards.
For other questions, the number of annotated tokens depended on the amount of detail provided on study design, methods, and results, which affected both annotation count and assessment subjectivity. Most other SQs had fewer than 2000 annotated tokens, with SQ 3.1 slightly exceeding this threshold. SQ 2.4 had the fewest tokens, with only 25 tokens.</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Total number of token annotations in RoBuster for each risk of bias (RoB) signaling question (SQ).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e55127_fig06.png"/></fig><p><xref ref-type="fig" rid="figure7">Figure 7</xref>, which shows the distribution of risk judgments across RoBuster, highlights that, for most SQs, no information was available (yellow bars) for answering the SQs. Exceptions included SQs 1.1, 1.2, 1.3, 2.2, 2.6, 3.1, 4.3, 4.4, and 5.2, where over 50% of documents had relevant information. In cases where even some information was available, bias tended to be low (green bar) with an exception for the SQs 2.1, 2.2, 3.1, 4.3, and 4.4, where bias was high (red bar). Studies with comprehensive information made evaluation easier, whereas those lacking key details made it challenging. Annotator feedback indicated that questions with fewer than 100 annotated tokens were consistently rated as &#x201C;(very) low&#x201D; information availability, whereas the top 5 SQs shown in <xref ref-type="fig" rid="figure6">Figure 6</xref> were rated as &#x201C;high&#x201D; or &#x201C;normal&#x201D; availability. 
All study references are listed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>Distribution of bias judgment across risk of bias (RoB) signaling questions (SQs) in RoBuster.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e55127_fig07.png"/></fig></sec><sec id="s3-3"><title>IAA Measure</title><p><xref ref-type="table" rid="table4">Table 4</xref> presents the F1-measure (IAA) between 2 annotators, both before and after the development of the visual placards. The F1-measures before and after the guideline improvement were calculated on different sets of documents. To reiterate, the F1 IAA results before the visual placard development were drawn from our previous work, where 10 RCTs were annotated in the absence of the placards [<xref ref-type="bibr" rid="ref28">28</xref>]. Initially, the average agreement across the corpus was 10.87%, which rose by 17.14 percentage points to reach 28.01% with placard use. For the &#x201C;randomization&#x201D; domain, agreement improved from a low 31.72% to 63.30%. In the &#x201C;deviations from intended interventions&#x201D; domain, it increased from 12.76% to 27.02%. Agreement in the &#x201C;missing outcome data&#x201D; domain rose from 5.89% to 9.92%, while &#x201C;measurement of the outcome&#x201D; increased from 4.07% to 17.29%.
For &#x201C;selection of reported results,&#x201D; agreement increased from 0% to 16.49%.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>The table displays the F1-measure at the text span annotation level before and after visual placard development, with changes in absolute interannotator agreement (IAA) points.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Domain and SQ<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="bottom" colspan="2">F1-measure IAA</td><td align="left" valign="bottom">Change</td></tr><tr><td align="left" valign="top" colspan="2"/><td align="left" valign="top">Before guideline improvement</td><td align="left" valign="top">After guidelines improvement</td><td align="left" valign="top"/></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">Domain 1</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup> 1.1</td><td align="left" valign="top">24.44</td><td align="left" valign="top">55.02</td><td align="left" valign="top">+30.58</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 1.2</td><td align="left" valign="top">50.28</td><td align="left" valign="top">44</td><td align="left" valign="top">&#x2013;6.28</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 1.3</td><td align="left" valign="top">20.44</td><td align="left" valign="top">90.9</td><td align="left" valign="top">+70.46</td></tr><tr><td align="left" valign="top" colspan="5">Domain 2</td></tr><tr><td align="left" valign="top" colspan="2"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.1</td><td align="left" valign="top">1.34</td><td align="left" valign="top">67.26</td><td align="left" valign="top">+65.92</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.2</td><td align="left" valign="top">7.23</td><td align="left" valign="top">38.66</td><td align="left" valign="top">+31.43</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.3</td><td align="left" valign="top">5.42</td><td align="left" valign="top">0</td><td align="left" valign="top">&#x2013;5.42</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.4</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.5</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.6</td><td align="left" valign="top">68.85</td><td align="left" valign="top">83.25</td><td align="left" valign="top">+14.4</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.7</td><td align="left" valign="top">6.52</td><td align="left" valign="top">0</td><td align="left" valign="top">&#x2013;6.52</td></tr><tr><td align="left" valign="top" colspan="5">Domain 3</td></tr><tr><td align="left" valign="top" colspan="2"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 3.1</td><td align="left" valign="top">23.57</td><td align="left" valign="top">39.68</td><td align="left" valign="top">+16.11</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 3.2</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 3.3</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 3.4</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top" colspan="5">Domain 4</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.1</td><td align="left" valign="top">6.51</td><td align="left" valign="top">61.71</td><td align="left" valign="top">+55.2</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.2</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.3</td><td align="left" valign="top">13.85</td><td align="left" valign="top">30.21</td><td align="left" valign="top">+16.36</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.4</td><td align="left" 
valign="top">0</td><td align="left" valign="top">56.25</td><td align="left" valign="top">+56.25</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.5</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top" colspan="5">Domain 5</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 5.1</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 5.2</td><td align="left" valign="top">0</td><td align="left" valign="top">49.49</td><td align="left" valign="top">+49.49</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 5.3</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>SQ: signaling question.</p></fn><fn id="table4fn2"><p><sup>b</sup>RoB: risk of bias.</p></fn><fn id="table4fn3"><p><sup>c</sup>Dash (&#x2014;) indicates that one of the annotators did not annotate any text for a particular SQ.</p></fn></table-wrap-foot></table-wrap><p>Substantial gains of over 50 percentage points were seen in SQs 1.3, 2.1, 4.1, and 4.4. However, for 11 out of 22 questions, agreement remained at zero, and for 9 of these, the lack of agreement persisted after the guideline update. In contrast, agreement improved for SQs 4.4 and 5.2 from 0% to 56.25% and 49.49%, respectively, while SQs 2.3, 2.7, and 1.2 saw slight agreement declines.
Across the 22 SQs, 11 had poor agreement, 3 fair, 4 good, 2 substantial, and 2 almost perfect (&#x2265;81%); none of the SQs achieved a perfect F1-measure.</p><p><xref ref-type="table" rid="table5">Table 5</xref> presents the &#x03BA;<sub>PABAK</sub> agreement, observed agreement, and the percentage of &#x03BA;<sub>PABAK</sub> agreements stemming from &#x201C;No Information&#x201D; judgments. &#x03BA;<sub>PABAK</sub> measures agreement at the SQ risk judgment level. Overall &#x03BA;<sub>PABAK</sub> between annotators shows weak IAA at 0.412. Agreement for &#x201C;randomization&#x201D; (domain 1) averaged a moderate 0.629, &#x201C;deviations from intended interventions&#x201D; (domain 2) was 0.64, and &#x201C;missing outcome data&#x201D; (domain 3) was minimal at 0.388. Domains 4 (&#x201C;outcome measurement&#x201D;) and 5 (&#x201C;selection of reported result&#x201D;) had slight or no agreement at 0.166 and 0.092, respectively.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>&#x03BA;<sub>PABAK</sub> and observed agreement between annotator pairs at the risk judgment level for each signaling question (SQ).</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Domain and SQ</td><td align="left" valign="bottom">&#x03BA;<sub>PABAK</sub> agreement</td><td align="left" valign="bottom">Observed agreement (%)</td><td align="left" valign="bottom">Contribution from &#x201C;No information&#x201D; (%)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">Domain 1</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup> 1.1</td><td align="left" valign="top">0.8333</td><td align="left" valign="top">88.90</td><td align="left" valign="top">22.22</td></tr><tr><td align="left" valign="top" colspan="2"><named-content
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 1.2</td><td align="left" valign="top">0.5</td><td align="left" valign="top">66.70</td><td align="left" valign="top">33.33</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 1.3</td><td align="left" valign="top">0.5556</td><td align="left" valign="top">77.80</td><td align="left" valign="top">11.11</td></tr><tr><td align="left" valign="top" colspan="5">Domain 2</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.1</td><td align="left" valign="top">0.7037</td><td align="left" valign="top">77.80</td><td align="left" valign="top">5.56</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.2</td><td align="left" valign="top">0.5</td><td align="left" valign="top">66.70</td><td align="left" valign="top">38.89</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.3</td><td align="left" valign="top">0.5</td><td align="left" valign="top">66.70</td><td align="left" valign="top">77.78</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.4</td><td align="left" valign="top">0.5556</td><td align="left" valign="top">77.80</td><td align="left" valign="top">88.89</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.5</td><td align="left" valign="top">0.6667</td><td align="left" valign="top">77.80</td><td align="left" valign="top">83.33</td></tr><tr><td align="left" valign="top" colspan="2"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.6</td><td align="left" valign="top">1</td><td align="left" valign="top">100</td><td align="left" valign="top">55.56</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.7</td><td align="left" valign="top">0.5556</td><td align="left" valign="top">77.80</td><td align="left" valign="top">77.78</td></tr><tr><td align="left" valign="top" colspan="5">Domain 3</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 3.1</td><td align="left" valign="top">0.8333</td><td align="left" valign="top">88.90</td><td align="left" valign="top">11.11</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 3.2</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="left" valign="top">100</td><td align="left" valign="top">100</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 3.3</td><td align="left" valign="top">0</td><td align="left" valign="top">33.3</td><td align="left" valign="top">55.56</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 3.4</td><td align="left" valign="top">0.3333</td><td align="left" valign="top">55.60</td><td align="left" valign="top">33.33</td></tr><tr><td align="left" valign="top" colspan="5">Domain 4</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.1</td><td align="left" valign="top">0.5556</td><td align="left" valign="top">77.80</td><td align="left" 
valign="top">11.11</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.2</td><td align="left" valign="top">&#x2013;0.5556</td><td align="left" valign="top">22.20</td><td align="left" valign="top">38.89</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.3</td><td align="left" valign="top">0.6667</td><td align="left" valign="top">77.80</td><td align="left" valign="top">27.78</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.4</td><td align="left" valign="top">0.1667</td><td align="left" valign="top">44.40</td><td align="left" valign="top">22.22</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.5</td><td align="left" valign="top">0</td><td align="left" valign="top">33.30</td><td align="left" valign="top">66.67</td></tr><tr><td align="left" valign="top" colspan="5">Domain 5</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 5.1</td><td align="left" valign="top">&#x2013;0.5556</td><td align="left" valign="top">22.20</td><td align="left" valign="top">61.11</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 5.2</td><td align="left" valign="top">0.5</td><td align="left" valign="top">66.70</td><td align="left" valign="top">22.22</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 5.3</td><td align="left" valign="top">0.3333</td><td align="left" valign="top">55.60</td><td align="left" 
valign="top">72.22</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>RoB: risk of bias.</p></fn><fn id="table5fn2"><p><sup>b</sup>Dash (&#x2014;) indicates that one of the annotators did not annotate any text for a particular SQ.</p></fn></table-wrap-foot></table-wrap><p>The highest agreement, 1.0, occurred for SQ 2.6, largely due to &#x201C;No Information&#x201D; judgments. No SQs showed almost perfect or strong agreement; moderate agreements (0.60&#x2010;0.80) were observed for SQs 1.1, 2.1, 2.5, 3.1, and 4.3, with &#x201C;No Information&#x201D; judgments impacting only SQs 1.1, 2.1, 3.1, and 4.3. Two SQs (4.2 and 5.1) had agreements worse than chance, with one annotator marking no text in SQ 5.1, resulting in negative IAA, and both annotators marking different parts of the text in SQ 4.2. SQs 3.3 and 4.5 had zero &#x03BA;<sub>PABAK</sub> due to mutually exclusive document annotations, causing no consensus.</p></sec><sec id="s3-4"><title>LLM Evaluation</title><p><xref ref-type="table" rid="table6">Table 6</xref> shows the observed agreement between the LLM and expert assessments for extracting and responding to SQs on a subset (n=10) of RoBuster. In domain 1, GPT-3.5 had high agreement with experts (66.6% for extraction and 53.3% for response judgment), with no reliance on &#x201C;No Information&#x201D; responses, indicating good information availability. For domain 2, observed agreements were 64.3% (extraction) and 60.0% (response), but 40% of these were &#x201C;No Information&#x201D; responses, showing lower reporting quality. Domain 3 had moderate agreement at 47.5% for both extraction and response, while domain 4 had lower agreement (28% for extraction and 26% for response). 
To enhance transparency, we have included both the LLM-generated responses (via ChatPDF) and the corresponding annotator responses as <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Large language model (LLM) evaluation&#x2014;observed agreements between the LLM and experts over a subset of RoBuster.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Domain and SQ<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup></td><td align="left" valign="bottom">P<sub>O</sub> (extraction; %)</td><td align="left" valign="bottom">P<sub>O</sub> (response; %)</td><td align="left" valign="bottom">Contribution from &#x201C;No information&#x201D; (%)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">Domain 1</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> 1.1</td><td align="left" valign="top">90</td><td align="left" valign="top">70</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 1.2</td><td align="left" valign="top">70</td><td align="left" valign="top">60</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 1.3</td><td align="left" valign="top">40</td><td align="left" valign="top">30</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top" colspan="5">Domain 2</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.1</td><td align="left" valign="top">50</td><td 
align="left" valign="top">40</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.2</td><td align="left" valign="top">30</td><td align="left" valign="top">30</td><td align="left" valign="top">10</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.3</td><td align="left" valign="top">60</td><td align="left" valign="top">60</td><td align="left" valign="top">50</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.4</td><td align="left" valign="top">90</td><td align="left" valign="top">90</td><td align="left" valign="top">100</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.5</td><td align="left" valign="top">90</td><td align="left" valign="top">90</td><td align="left" valign="top">100</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.6</td><td align="left" valign="top">80</td><td align="left" valign="top">50</td><td align="left" valign="top">10</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 2.7</td><td align="left" valign="top">50</td><td align="left" valign="top">60</td><td align="left" valign="top">40</td></tr><tr><td align="left" valign="top" colspan="5">Domain 3</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 3.1</td><td align="left" valign="top">30</td><td align="left" valign="top">40</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top" 
colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 3.2</td><td align="left" valign="top">60</td><td align="left" valign="top">40</td><td align="left" valign="top">40</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 3.3</td><td align="left" valign="top">30</td><td align="left" valign="top">30</td><td align="left" valign="top">30</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 3.4</td><td align="left" valign="top">70</td><td align="left" valign="top">80</td><td align="left" valign="top">70</td></tr><tr><td align="left" valign="top" colspan="5">Domain 4</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.1</td><td align="left" valign="top">40</td><td align="left" valign="top">30</td><td align="left" valign="top">10</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.2</td><td align="left" valign="top">40</td><td align="left" valign="top">30</td><td align="left" valign="top">20</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.3</td><td align="left" valign="top">10</td><td align="left" valign="top">10</td><td align="left" valign="top">10</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.4</td><td align="left" valign="top">10</td><td align="left" valign="top">20</td><td align="left" valign="top">10</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 4.5</td><td 
align="left" valign="top">40</td><td align="left" valign="top">40</td><td align="left" valign="top">40</td></tr><tr><td align="left" valign="top" colspan="5">Domain 5<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 5.1</td><td align="left" valign="top">22.22</td><td align="left" valign="top">77.77</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 5.2</td><td align="left" valign="top">33.33</td><td align="left" valign="top">44.44</td><td align="left" valign="top">33.33</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RoB 5.3</td><td align="left" valign="top">55.55</td><td align="left" valign="top">55.55</td><td align="left" valign="top">44.44</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>SQ: signaling question.</p></fn><fn id="table6fn2"><p><sup>b</sup>RoB: risk of bias.</p></fn><fn id="table6fn3"><p><sup>c</sup>The LLM evaluation for domain 5 was conducted on 9 randomized controlled trials (RCTs), as one lacked a trial registry.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Visual Placards</title><p>According to a study by Dhrangadhariya et al [<xref ref-type="bibr" rid="ref28">28</xref>], 2 factors contributed to low F1 IAA in annotating text spans for SQs: a lack of guidance on annotation granularity and inconsistent annotation locations. Some annotators marked entire paragraphs, while others selected only the most informative text. Our placards address this by specifying whether to annotate a phrase, sentence, or sentences. 
Additionally, annotators often drew evidence from different parts of the text for the same SQ, lowering agreement. Our placards now restrict annotations for certain SQs to specific sections, such as &#x201C;Methods,&#x201D; &#x201C;Results,&#x201D; or &#x201C;Flowcharts.&#x201D; Flowcharts and tables are designated as the lowest priority for all SQs except 1.3 due to ML models&#x2019; difficulty in interpreting them. For example, although SQ 3.1 information appears in the flowchart, annotators are directed to the &#x201C;Results&#x201D; section, which better supports ML training.</p><p>The visual placards aimed to reduce RoB 2 subjectivity, particularly for SQs with insufficient information for risk judgment annotation. For example, in one trial, both annotators selected &#x201C;71 allocated routine services, 67 allocated intervention service, 69 assessed at 8 weeks, 64 assessed at 8 weeks&#x201D; from the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) flowchart to answer SQ 3.1 [<xref ref-type="bibr" rid="ref50">50</xref>], but one responded &#x201C;Yes&#x201D; while the other chose &#x201C;No.&#x201D; This question asks if outcome data were available for nearly all randomized participants but lacks a clear threshold. To standardize responses, we introduced a 95% threshold in the placard (<xref ref-type="fig" rid="figure5">Figure 5</xref>).</p></sec><sec id="s4-2"><title>IAA Measure</title><sec id="s4-2-1"><title>Text-Span Agreement</title><p>F1 agreement at the text span level improved for 10 of the 22 SQs after implementing visual placards. The agreement increased as placards clarified whether to annotate a phrase, sentence, paragraph, or entire table. SQ 1.3 saw the largest increase (70.46 percentage points) due to instructions to mark the entire patient characteristics table. Previously, annotators only marked portions, leading to variability based on what they noticed first. 
For SQ 1.1, F1 improved by 30 points with instructions to prioritize marking comprehensive information on the randomization method in the Methods section, which offers a detailed description. Previously, annotators marked evidence variably, only including &#x201C;randomized controlled trial&#x201D; phrases without checking the randomization method. Placards clarified that &#x201C;Yes Good&#x201D; required detailed evidence of randomization. Similarly, SQs 2.1 and 2.2 saw increased IAA due to clear guidance on marking both intervention and placebo descriptions, resolving prior inconsistencies where only one of the two was marked.</p><p>While the agreement drastically increased for certain SQs, it remained at 0 for some SQs (2.3, 2.4, 2.5, 2.7, 3.2, 3.3, 3.4, 4.2, 4.5, 5.1, and 5.3), aligning with feedback on these SQs, which indicated high subjectivity, difficulty, and limited information. This suggests that subjectivity and data scarcity in RCTs challenge consistent annotation.</p></sec><sec id="s4-2-2"><title>Response Option Agreement</title><p>Disagreements in response judgments stemmed from 2 main issues. Most disagreements (82.85%) occurred when one annotator provided a response while the other marked &#x201C;No Information&#x201D; due to no annotation. 
Agreements were also split between cases where both annotators labeled a text span similarly and cases where neither annotated, defaulting to &#x201C;No Information.&#x201D; Therefore, a considerable chunk of both agreements and disagreements came from &#x201C;No Information.&#x201D; This highlights the impact of &#x201C;No Information&#x201D; judgments, emphasizing the potential benefit of using high-quality journals to reduce these instances and improve annotation comprehensiveness.</p><p>SQs (1.1, 2.1, 2.5, 2.6, 3.1, and 4.3) with adequate agreement (&#x003E;0.60 &#x03BA;<sub>PABAK</sub>) showed fewer &#x201C;No Information&#x201D; judgments, aligning with feedback that these questions had &#x201C;normal&#x201D; or &#x201C;very high&#x201D; information availability and low subjectivity. In contrast, SQs with low or negative agreement had more disagreements, leading to lower &#x03BA;<sub>PABAK</sub> scores.</p><p>Negative &#x03BA; values are unlikely to occur in practice, but two SQs (4.2 and 5.1) had negative &#x03BA;<sub>PABAK</sub> values due to frequent disagreements (in 7 of 9 documents), indicating strong disagreement among raters. The reason for the lower &#x03BA;<sub>PABAK</sub> for these questions was disagreements over 7 of the 9 annotated documents, which was the highest number of disagreements in the subset of RCTs used for &#x03BA; calculation (<xref ref-type="fig" rid="figure8">Figure 8</xref>). Note that the agreements on these SQs came from &#x201C;No Information&#x201D; judgments. Moreover, the &#x03BA;<sub>PABAK</sub> values for these SQs are considerably smaller than &#x2013;0.10. 
Such values (&#x003C;&#x2013;0.10) suggest that the collected data may not be meaningful for these questions [<xref ref-type="bibr" rid="ref41">41</xref>].</p><fig position="float" id="figure8"><label>Figure 8.</label><caption><p>The histogram of total number of conflicts and total number of agreements in the subset of randomized controlled trials (RCTs) used to calculate prevalence and bias adjusted &#x03BA;.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e55127_fig08.png"/></fig><p>Domain 5 had nearly zero agreement, likely due to the complexity of assessing it, which requires annotators to reference both the RCT and its trial registry and then summarize the information. Zero IAA also stems from theoretical questions like SQ 3.4, which evaluates whether missing outcome data is linked to true outcome values, and SQ 4.5, which assesses whether outcome assessment was influenced by knowledge of assigned intervention. These questions require hypothetical judgments rather than direct evidence, increasing subjectivity. Since these aspects are often not explicitly covered in trial documentation, annotators were instructed to mark outcome and outcome measurement descriptions to provide a foundation for judgments and ensure relevant annotations for evaluating LLM.</p><p>There were certain aspects where question subjectivity led to low agreement. For SQ 2.4, which asks if deviations from intended intervention might affect outcomes, responses are inherently subjective, as assessors may differ on what they consider impactful. Limited information on deviations further complicates accurate judgment. Only 2 of the 9 RCTs had SQ 2.4 annotations, and only 1 annotator marked them, resulting in no text span agreement. SQ 3.3 requires assessors to judge whether missing data are related to the true outcome, which can be challenging to determine objectively. 
Annotating evidence is further complicated by the need to link missing data reasons to outcome values, such as &#x201C;fatigue&#x201D; in physiotherapy studies. If, for example, patients missed follow-up due to fatigue, this connection should be annotated. However, RCTs rarely include this level of detail. Placards instead advised annotators to mark outcome descriptions to support judgment, but 1 annotator did not follow this guidance, leading to zero text span agreement. Such cases indicate a need for extended training, conflict resolution rounds, and placard improvement.</p></sec></sec><sec id="s4-3"><title>LLM Evaluation</title><sec id="s4-3-1"><title>Direct Versus Indirect Responses</title><p>LLM gave direct responses (&#x201C;Yes,&#x201D; &#x201C;No,&#x201D; or &#x201C;No Information&#x201D;) for some questions and responded indirectly for others. For instance, in an RCT by Gleason et al [<xref ref-type="bibr" rid="ref51">51</xref>], it directly answered SQ 1.1, stating, &#x201C;Yes, the allocation sequence was random,&#x201D; and extracted relevant supporting text. 
However, for SQ 5.2, instead of stating &#x201C;No information was present to make a judgment,&#x201D; the LLM extracted a candidate paragraph along with the following text: &#x201C;However, the study authors did not provide information on whether the numerical result being assessed was selected based on the results from multiple eligible outcome measurements within the outcome domain.&#x201D; This response implied that it found &#x201C;No Information&#x201D; for SQ 5.2.</p></sec><sec id="s4-3-2"><title>Domain-Specific Findings</title><p>The LLM performed well in extracting information on participant randomization and allocation concealment for SQs 1.1 and 1.2, resulting in good <inline-formula><mml:math id="ieqn3"><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>O</mml:mi></mml:mrow></mml:msub><mml:mfenced separators="|"><mml:mrow><mml:mi>E</mml:mi><mml:mi>x</mml:mi><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:mfenced></mml:math></inline-formula> agreement. However, it sometimes reached the correct response judgment, leading to a higher <inline-formula><mml:math id="ieqn4"><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>O</mml:mi></mml:mrow></mml:msub><mml:mfenced separators="|"><mml:mrow><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:mfenced></mml:math></inline-formula>, without fully extracting the required evidence. In an RCT by Stuck et al [<xref ref-type="bibr" rid="ref52">52</xref>], the LLM extracted an irrelevant passage to answer SQ 1.1 with &#x201C;Yes Good&#x201D; (low risk of randomization bias) even though the extracted sentence did not contain information about the randomization method. 
For SQ 1.3, lower agreement was observed between the expert and LLM, as the LLM often relied on text rather than tables, where critical data were found. Text in tables is read by ChatPDF, but it might have problems correlating the correct rows and columns, leading to a distorted understanding of the tables. For Stuck et al&#x2019;s [<xref ref-type="bibr" rid="ref52">52</xref>] RCT, both the expert and LLM answered SQ 1.3 with &#x201C;No Good,&#x201D; but the expert used a table as the evidence to answer the question, while the LLM used the text evidence not found in the table.</p><p>In domain 2, approximately 40% of <inline-formula><mml:math id="ieqn5"><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>O</mml:mi></mml:mrow></mml:msub><mml:mfenced separators="|"><mml:mrow><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:mfenced></mml:math></inline-formula> agreement was due to &#x201C;No Information,&#x201D; indicating a lack of information in RCTs for SQs 2.4 and 2.5. In an instance, LLM evaluation led to the identification of an incorrect label from the expert. To elaborate, for SQ 2.6 in Hassett et al&#x2019;s [<xref ref-type="bibr" rid="ref53">53</xref>] study, the expert had annotated an incorrect part of the text to answer the question with &#x201C;Yes Good.&#x201D; LLM correctly extracted information about the intention-to-treat analysis, which led to correcting the final annotation in RoBuster.</p><p>In domain 3, bias assessment subjectivity emerged. A lenient assessor will judge a risk of bias as low in comparison to a stringent assessor judging a bias risk as high for any SQ, but it is more pronounced in subjective SQs. In Thorndike et al&#x2019;s [<xref ref-type="bibr" rid="ref54">54</xref>] RCT, both the LLM and the expert used the same evidence, &#x201C;...104 (82%) were randomized in January 2011. 
Five residents withdrew during Phase 1, and 99 continued participation in Phase 2...,&#x201D; to answer SQ 3.1. However, the LLM rated it as &#x201C;Yes Good,&#x201D; whereas the expert rated it as &#x201C;No Bad,&#x201D; following a strict rule that any outcome data missing for over 5% of participants indicates high bias risk, regardless of other study details. For SQ 3.2 in RCTs [<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref56">56</xref>], the LLM was more lenient than the experts, extracting information about the sensitivity analysis carried out by the study authors to account for missing outcome data. However, the RCT authors were not explicit that there was no bias due to missing outcomes data. For SQ 3.3, LLM judged &#x201C;No Information&#x201D; 9 out of 10 times. It questions whether the missingness in outcomes depended on its true value and is quite subjective because the assessor needs to contemplate the reasons for missingness for a particular outcome and whether the true value could have affected the missingness. For example, they need to assess whether the fact that data on falls is missing can be attributed to the actual occurrence of falls in the study.</p><p>The agreement was lower in domain 4, despite outcome measurement data being present, as it often extracted nontargeted outcomes due to simple prompts. The annotators assess bias pertaining to the predecided target outcome and not all the outcomes reported in the trial. However, the information about the target outcome being assessed was not available in the simple prompts used to test LLMs. Consequently, the LLMs extracted information that did not necessarily pertain to the target outcome but rather to other outcomes reported in the trial. 
Enhancing prompt specificity, potentially with chain-of-thought or tree-of-thought styles, could improve the LLM&#x2019;s performance in RoB assessment [<xref ref-type="bibr" rid="ref57">57</xref>].</p><p>In sum, while the LLM shows potential for automating RoB span extraction, further optimization is needed for handling subjective SQs, lack of information, and tailoring prompts to specific bias contexts.</p></sec></sec><sec id="s4-4"><title>Limitations</title><p>The scale of our annotations is limited to 41 RCTs due to financial constraints, which restricted our ability to hire expert annotators. Furthermore, we acknowledge that performing double-coding on a subset of 9 out of 41 RCTs (about 22% of the corpus) restricts the statistical precision of our reliability estimation. While this subset was key in identifying and resolving discrepancies during the annotation process, the resulting IAA metrics should be interpreted as indicative rather than exhaustive.</p><p>Our focus is limited to physiotherapy and rehabilitation as we relied on voluntary experts only from these domains. This limitation was necessary to have annotators proficient in the domain, as expanding to other domains could compromise the validity of expert judgments [<xref ref-type="bibr" rid="ref6">6</xref>]. Though our approach restricts the broader applicability of the corpus, the annotations provided are robust, reflecting thorough assessment by experts within available resources. The sampling was restricted to articles published after 2010 to ensure consistency with modern CONSORT (Consolidated Standards of Reporting Trials) reporting practices. To facilitate open sharing and publication of the annotated corpus, we included only articles available under the CC0 license. 
While these criteria were necessary for methodological consistency and licensing requirements, they may limit generalizability to RCTs published prior to 2010, to other clinical domains, or to studies published in non&#x2013;open-access venues.</p><p>The low IAA observed stems from the subjective nature of RoB 2, which requires evaluators to interpret nuanced details across various domains. While we have implemented visual placards to aid standardization, these improvements cannot fully resolve the underlying subjectivity in the tool&#x2019;s design. RoB 2, despite offering more structured guidance than its predecessor, still relies on subjective judgment in answering SQs, particularly in complex domains like deviations from intended interventions. This subjectivity is not unique to our study, but is a fundamental challenge in RoB assessments and is recognized in previous research. Given the interpretative nature of RoB assessments, it is not feasible to entirely eliminate these differences. While we continue to explore methods for improving standardization, it is crucial to recognize that the inherent subjectivity in RoB 2 will always impact IAA to some degree.</p><p>Our LLM evaluation at that time was restricted to using PDFs, limiting platform compatibility and model selection. Specifically, Google Bard is a freely available tool that interacts with PDFs, but we observed that the Bard results were less deterministic than ChatPDF. Another drawback was associated with the prompts used. Our prompts were also relatively simple and lacked specificity regarding target outcomes, resulting in general responses rather than targeted bias assessments. In retrospect, we should have attached the detailed trial protocol and the statistical analysis plan to each clinical trial before annotation and LLM evaluation. 
This limitation will be addressed in future expansions of the corpus by adding both the protocol and analysis plan to the RCTs for annotation.</p><p>While we have provided a detailed account of our LLM evaluation methodology, we acknowledge the importance of adopting structured reporting frameworks, such as the recently published STAGER (Standards for Transparent And Generative Evaluation and Reporting) checklist, for studies involving generative artificial intelligence. Future studies could benefit from adhering to such guidelines to ensure greater methodological rigor and transparency in reporting [<xref ref-type="bibr" rid="ref58">58</xref>].</p></sec><sec id="s4-5"><title>Conclusions</title><p>We present RoBuster, a new publicly available corpus of 41 full-text annotated RCTs with detailed RoB spans across 22 bias questions. This corpus and our rigorously developed annotation guidelines address a gap in resources for evaluating RoB text span extraction using ML. RoBuster includes fine-grained bias spans and annotator decisions, providing a benchmark for assessing LLM performance against human bias assessment. Created through collaboration between bias assessors and natural language processing experts, RoBuster can enhance automated bias assessment and support systematic literature review systems.</p><p>Our work reaffirms the complexity of bias assessment and the need for robust guidelines to improve IAA in both assessment and annotation. Challenges stemmed from variation in RCT reporting standards, leading to low information availability and the subjectivity inherent in RoB 2. Future work will focus on refining our visual placards, expanding RoBuster with more annotated texts, and using the placards to train new bias assessors.</p></sec></sec></body><back><ack><p>Generative artificial intelligence was used solely for linguistic editing and proofreading. 
The authors take full responsibility for the final content.</p></ack><notes><sec><title>Funding</title><p>The work was funded by the Informatics Institute, HES-SO Valais-Wallis, Sierre, Switzerland, and the School of Health Sciences, HES-SO Valais-Wallis, Leukerbad, Switzerland.</p></sec><sec><title>Data Availability</title><p>The 22 signaling question prompts used for large language model evaluation and the RoBuster dataset are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>, respectively, with this manuscript. Upon publication, the dataset will be permanently hosted in a public repository (eg, GitHub or Zenodo) to ensure persistent access and version control.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AMSTAR</term><def><p>A Measurement Tool to Assess Systematic Reviews</p></def></def-item><def-item><term id="abb2">CONSORT</term><def><p>Consolidated Standards of Reporting Trials</p></def></def-item><def-item><term id="abb3">EPOC</term><def><p>Effective Practice and Organization of Care</p></def></def-item><def-item><term id="abb4">IAA</term><def><p>interannotator agreement</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb7">PABAK</term><def><p>prevalence and bias adjusted &#x03BA;</p></def></def-item><def-item><term id="abb8">PAWLS</term><def><p>PDF Annotation With Labels and Structure</p></def></def-item><def-item><term id="abb9">PRISMA</term><def><p>Preferred Reporting Items for Systematic Reviews and Meta-Analyses</p></def></def-item><def-item><term id="abb10">RCT</term><def><p>randomized controlled trial</p></def></def-item><def-item><term 
id="abb11">RoB</term><def><p>risk of bias</p></def></def-item><def-item><term id="abb12">SQ</term><def><p>signaling question</p></def></def-item><def-item><term id="abb13">SR</term><def><p>systematic review</p></def></def-item><def-item><term id="abb14">STAGER</term><def><p>Standards for Transparent And Generative Evaluation and Reporting</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mogo</surname><given-names>ERI</given-names> </name><name name-style="western"><surname>Lawanson</surname><given-names>T</given-names> </name><name name-style="western"><surname>Foley</surname><given-names>L</given-names> </name><etal/></person-group><article-title>A systematic review protocol of opportunities for noncommunicable disease prevention via public space initiatives in African cities</article-title><source>Int J Environ Res Public Health</source><year>2022</year><month>02</month><day>17</day><volume>19</volume><issue>4</issue><fpage>2285</fpage><pub-id pub-id-type="doi">10.3390/ijerph19042285</pub-id><pub-id pub-id-type="medline">35206471</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McTigue</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Hess</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ziouras</surname><given-names>J</given-names> </name></person-group><article-title>Obesity in older adults: a systematic review of the evidence for diagnosis and treatment</article-title><source>Obesity</source><year>2006</year><month>09</month><volume>14</volume><issue>9</issue><fpage>1485</fpage><lpage>1497</lpage><pub-id pub-id-type="doi">10.1038/oby.2006.171</pub-id><pub-id 
pub-id-type="medline">17030958</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sibbald</surname><given-names>B</given-names> </name><name name-style="western"><surname>Roland</surname><given-names>M</given-names> </name></person-group><article-title>Understanding controlled trials. Why are randomised controlled trials important?</article-title><source>BMJ</source><year>1998</year><month>01</month><day>17</day><volume>316</volume><issue>7126</issue><fpage>201</fpage><pub-id pub-id-type="doi">10.1136/bmj.316.7126.201</pub-id><pub-id pub-id-type="medline">9468688</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kjaergard</surname><given-names>LL</given-names> </name><name name-style="western"><surname>Nikolova</surname><given-names>D</given-names> </name><name name-style="western"><surname>Gluud</surname><given-names>C</given-names> </name></person-group><article-title>Randomized clinical trials in hepatology: predictors of quality</article-title><source>Hepatology</source><year>1999</year><month>11</month><volume>30</volume><issue>5</issue><fpage>1134</fpage><lpage>1138</lpage><pub-id pub-id-type="doi">10.1002/hep.510300510</pub-id><pub-id pub-id-type="medline">10534332</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Naci</surname><given-names>H</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>C</given-names> </name><name name-style="western"><surname>Savovi&#x0107;</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Design characteristics, risk of bias, and reporting of randomised controlled trials supporting approvals of cancer 
drugs by European Medicines Agency, 2014-16: cross sectional analysis</article-title><source>BMJ</source><year>2019</year><month>09</month><day>18</day><volume>366</volume><fpage>l5221</fpage><pub-id pub-id-type="doi">10.1136/bmj.l5221</pub-id><pub-id pub-id-type="medline">31533922</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Higgins</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Savovi&#x0107;</surname><given-names>J</given-names> </name><name name-style="western"><surname>Page</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Sterne</surname><given-names>JA</given-names> </name><collab>Cochrane</collab></person-group><article-title>Revised Cochrane Risk-Of-Bias tool for randomized trials (RoB 2)</article-title><source>Google Sites</source><year>2019</year><month>08</month><day>22</day><access-date>2026-02-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://sites.google.com/site/riskofbiastool/welcome/rob-2-0-tool/current-version-of-rob-2">https://sites.google.com/site/riskofbiastool/welcome/rob-2-0-tool/current-version-of-rob-2</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Higgins</surname><given-names>JPT</given-names> </name><name name-style="western"><surname>Altman</surname><given-names>DG</given-names> </name><name name-style="western"><surname>G&#x00F8;tzsche</surname><given-names>PC</given-names> </name><etal/></person-group><article-title>The Cochrane Collaboration&#x2019;s tool for assessing risk of bias in randomised trials</article-title><source>BMJ</source><year>2011</year><month>10</month><day>18</day><volume>343</volume><fpage>d5928</fpage><pub-id pub-id-type="doi">10.1136/bmj.d5928</pub-id><pub-id 
pub-id-type="medline">22008217</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Elkins</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Moseley</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Sherrington</surname><given-names>C</given-names> </name><name name-style="western"><surname>Herbert</surname><given-names>RD</given-names> </name><name name-style="western"><surname>Maher</surname><given-names>CG</given-names> </name></person-group><article-title>Growth in the Physiotherapy Evidence Database (PEDro) and use of the PEDro scale</article-title><source>Br J Sports Med</source><year>2013</year><month>03</month><volume>47</volume><issue>4</issue><fpage>188</fpage><lpage>189</lpage><pub-id pub-id-type="doi">10.1136/bjsports-2012-091804</pub-id><pub-id pub-id-type="medline">23134761</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shea</surname><given-names>BJ</given-names> </name><name name-style="western"><surname>Reeves</surname><given-names>BC</given-names> </name><name name-style="western"><surname>Wells</surname><given-names>G</given-names> </name><etal/></person-group><article-title>AMSTAR 2: a critical appraisal tool for systematic reviews that include randomised or non-randomised studies of healthcare interventions, or both</article-title><source>BMJ</source><year>2017</year><month>09</month><day>21</day><volume>358</volume><fpage>j4008</fpage><pub-id pub-id-type="doi">10.1136/bmj.j4008</pub-id><pub-id pub-id-type="medline">28935701</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Farrah</surname><given-names>K</given-names> </name><name name-style="western"><surname>Young</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tunis</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>L</given-names> </name></person-group><article-title>Risk of bias tools in systematic reviews of health interventions: an analysis of PROSPERO-registered protocols</article-title><source>Syst Rev</source><year>2019</year><month>11</month><day>15</day><volume>8</volume><issue>1</issue><fpage>280</fpage><pub-id pub-id-type="doi">10.1186/s13643-019-1172-8</pub-id><pub-id pub-id-type="medline">31730014</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sterne</surname><given-names>JAC</given-names> </name><name name-style="western"><surname>Savovi&#x0107;</surname><given-names>J</given-names> </name><name name-style="western"><surname>Page</surname><given-names>MJ</given-names> </name><etal/></person-group><article-title>RoB 2: a revised tool for assessing risk of bias in randomised trials</article-title><source>BMJ</source><year>2019</year><month>08</month><day>28</day><volume>366</volume><fpage>l4898</fpage><pub-id pub-id-type="doi">10.1136/bmj.l4898</pub-id><pub-id pub-id-type="medline">31462531</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hartling</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bond</surname><given-names>K</given-names> </name><name name-style="western"><surname>Vandermeer</surname><given-names>B</given-names> </name><name name-style="western"><surname>Seida</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Dryden</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Rowe</surname><given-names>BH</given-names> </name></person-group><article-title>Applying the risk of bias tool in a systematic review of combination long-acting beta-agonists and inhaled corticosteroids for persistent asthma</article-title><source>PLoS ONE</source><year>2011</year><month>02</month><day>24</day><volume>6</volume><issue>2</issue><fpage>e17242</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0017242</pub-id><pub-id pub-id-type="medline">21390219</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Crocker</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Lam</surname><given-names>N</given-names> </name><name name-style="western"><surname>Jord&#x00E3;o</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Risk-of-bias assessment using Cochrane&#x2019;s revised tool for randomized trials (RoB 2) was useful but challenging and resource-intensive: observations from a systematic review</article-title><source>J Clin Epidemiol</source><year>2023</year><month>09</month><volume>161</volume><fpage>39</fpage><lpage>45</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2023.06.015</pub-id><pub-id pub-id-type="medline">37364620</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tsertsvadze</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>YF</given-names> </name><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sutcliffe</surname><given-names>P</given-names> </name><name 
name-style="western"><surname>McCarthy</surname><given-names>N</given-names> </name></person-group><article-title>How to conduct systematic reviews more expeditiously?</article-title><source>Syst Rev</source><year>2015</year><month>11</month><day>12</day><volume>4</volume><issue>1</issue><fpage>160</fpage><pub-id pub-id-type="doi">10.1186/s13643-015-0147-7</pub-id><pub-id pub-id-type="medline">26563648</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khangura</surname><given-names>S</given-names> </name><name name-style="western"><surname>Konnyu</surname><given-names>K</given-names> </name><name name-style="western"><surname>Cushman</surname><given-names>R</given-names> </name><name name-style="western"><surname>Grimshaw</surname><given-names>J</given-names> </name><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name></person-group><article-title>Evidence summaries: the evolution of a rapid review approach</article-title><source>Syst Rev</source><year>2012</year><month>02</month><day>10</day><volume>1</volume><issue>1</issue><fpage>10</fpage><pub-id pub-id-type="doi">10.1186/2046-4053-1-10</pub-id><pub-id pub-id-type="medline">22587960</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Marshall</surname><given-names>IJ</given-names> </name><name name-style="western"><surname>Kuiper</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>BC</given-names> </name></person-group><article-title>Automating risk of bias assessment for clinical trials</article-title><source>IEEE J Biomed Health Inform</source><year>2015</year><month>07</month><volume>19</volume><issue>4</issue><fpage>1406</fpage><lpage>1412</lpage><pub-id 
pub-id-type="doi">10.1109/JBHI.2015.2431314</pub-id><pub-id pub-id-type="medline">25966488</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Marshall</surname><given-names>IJ</given-names> </name><name name-style="western"><surname>Kuiper</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>BC</given-names> </name></person-group><article-title>RobotReviewer: evaluation of a system for automatically assessing bias in clinical trials</article-title><source>J Am Med Inform Assoc</source><year>2016</year><month>01</month><volume>23</volume><issue>1</issue><fpage>193</fpage><lpage>201</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocv044</pub-id><pub-id pub-id-type="medline">26104742</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ma</surname><given-names>LL</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>YY</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>ZH</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zeng</surname><given-names>XT</given-names> </name></person-group><article-title>Methodological quality (risk of bias) assessment tools for primary and secondary medical studies: What are they and which is better?</article-title><source>Mil Med Res</source><year>2020</year><month>02</month><day>29</day><volume>7</volume><issue>1</issue><fpage>7</fpage><pub-id pub-id-type="doi">10.1186/s40779-020-00238-8</pub-id><pub-id pub-id-type="medline">32111253</pub-id></nlm-citation></ref><ref 
id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Martimbianco</surname><given-names>ALC</given-names> </name><name name-style="western"><surname>S&#x00E1;</surname><given-names>KMM</given-names> </name><name name-style="western"><surname>Santos</surname><given-names>GM</given-names> </name><name name-style="western"><surname>Santos</surname><given-names>EM</given-names> </name><name name-style="western"><surname>Pacheco</surname><given-names>RL</given-names> </name><name name-style="western"><surname>Riera</surname><given-names>R</given-names> </name></person-group><article-title>Most Cochrane systematic reviews and protocols did not adhere to the Cochrane&#x2019;s risk of bias 2.0 tool</article-title><source>Rev Assoc Med Bras</source><year>2023</year><volume>69</volume><issue>3</issue><fpage>469</fpage><lpage>472</lpage><pub-id pub-id-type="doi">10.1590/1806-9282.20221593</pub-id><pub-id pub-id-type="medline">36820779</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Millard</surname><given-names>LAC</given-names> </name><name name-style="western"><surname>Flach</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Higgins</surname><given-names>JPT</given-names> </name></person-group><article-title>Machine learning to assist risk-of-bias assessments in systematic reviews</article-title><source>Int J Epidemiol</source><year>2016</year><month>02</month><volume>45</volume><issue>1</issue><fpage>266</fpage><lpage>277</lpage><pub-id pub-id-type="doi">10.1093/ije/dyv306</pub-id><pub-id pub-id-type="medline">26659355</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Soboczenski</surname><given-names>F</given-names> </name><name name-style="western"><surname>Trikalinos</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Kuiper</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bias</surname><given-names>RG</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>BC</given-names> </name><name name-style="western"><surname>Marshall</surname><given-names>IJ</given-names> </name></person-group><article-title>Machine learning to help researchers evaluate biases in clinical trials: a prospective, randomized user study</article-title><source>BMC Med Inform Decis Mak</source><year>2019</year><month>05</month><day>8</day><volume>19</volume><issue>1</issue><fpage>96</fpage><pub-id pub-id-type="doi">10.1186/s12911-019-0814-z</pub-id><pub-id pub-id-type="medline">31068178</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vinkers</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Lamberink</surname><given-names>HJ</given-names> </name><name name-style="western"><surname>Tijdink</surname><given-names>JK</given-names> </name><etal/></person-group><article-title>The methodological quality of 176,620 randomized controlled trials published between 1966 and 2018 reveals a positive trend but also an urgent need for improvement</article-title><source>PLoS Biol</source><year>2021</year><month>04</month><volume>19</volume><issue>4</issue><fpage>e3001162</fpage><pub-id pub-id-type="doi">10.1371/journal.pbio.3001162</pub-id><pub-id pub-id-type="medline">33872298</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Jardim</surname><given-names>PSJ</given-names> </name><name name-style="western"><surname>Rose</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Ames</surname><given-names>HM</given-names> </name><name name-style="western"><surname>Echavez</surname><given-names>JFM</given-names> </name><name name-style="western"><surname>Van de Velde</surname><given-names>S</given-names> </name><name name-style="western"><surname>Muller</surname><given-names>AE</given-names> </name></person-group><article-title>Automating risk of bias assessment in systematic reviews: a real-time mixed methods comparison of human researchers to a machine learning system</article-title><source>BMC Med Res Methodol</source><year>2022</year><month>06</month><day>8</day><volume>22</volume><issue>1</issue><fpage>167</fpage><pub-id pub-id-type="doi">10.1186/s12874-022-01649-y</pub-id><pub-id pub-id-type="medline">35676632</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirt</surname><given-names>J</given-names> </name><name name-style="western"><surname>Meichlinger</surname><given-names>J</given-names> </name><name name-style="western"><surname>Schumacher</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mueller</surname><given-names>G</given-names> </name></person-group><article-title>Agreement in risk of bias assessment between RobotReviewer and human reviewers: an evaluation study on randomised controlled trials in nursing&#x2010;related Cochrane reviews</article-title><source>J Nurs Scholarsh</source><year>2021</year><month>03</month><volume>53</volume><issue>2</issue><fpage>246</fpage><lpage>254</lpage><pub-id pub-id-type="doi">10.1111/jnu.12628</pub-id><pub-id pub-id-type="medline">33555110</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Liao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lapata</surname><given-names>M</given-names> </name><name name-style="western"><surname>Macleod</surname><given-names>M</given-names> </name></person-group><article-title>Risk of bias assessment in preclinical literature using natural language processing</article-title><source>Res Synth Methods</source><year>2022</year><month>05</month><volume>13</volume><issue>3</issue><fpage>368</fpage><lpage>380</lpage><pub-id pub-id-type="doi">10.1002/jrsm.1533</pub-id><pub-id pub-id-type="medline">34709718</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Minozzi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dwan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Borrelli</surname><given-names>F</given-names> </name><name name-style="western"><surname>Filippini</surname><given-names>G</given-names> </name></person-group><article-title>Reliability of the revised Cochrane risk-of-bias tool for randomised trials (RoB2) improved with the use of implementation instruction</article-title><source>J Clin Epidemiol</source><year>2022</year><month>01</month><volume>141</volume><fpage>99</fpage><lpage>105</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2021.09.021</pub-id><pub-id pub-id-type="medline">34537386</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hartling</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ospina</surname><given-names>M</given-names> 
</name><name name-style="western"><surname>Liang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Risk of bias versus quality assessment of randomised controlled trials: cross sectional study</article-title><source>BMJ</source><year>2009</year><month>10</month><day>19</day><volume>339</volume><fpage>b4012</fpage><pub-id pub-id-type="doi">10.1136/bmj.b4012</pub-id><pub-id pub-id-type="medline">19841007</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Dhrangadhariya</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Hilfiker</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sattelmayer</surname><given-names>M</given-names> </name><etal/></person-group><article-title>First steps towards a risk of bias corpus of randomized controlled trials</article-title><source>Caring Is Sharing-Exploiting the Value in Data for Health and Innovation</source><year>2023</year><publisher-name>IOS Press</publisher-name><fpage>586</fpage><lpage>590</lpage><pub-id pub-id-type="doi">10.3233/shti230210</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A survey on evaluation of large language models</article-title><source>ACM Trans Intell Syst Technol</source><year>2024</year><month>06</month><day>30</day><volume>15</volume><issue>3</issue><fpage>1</fpage><lpage>45</lpage><pub-id pub-id-type="doi">10.1145/3641289</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Page</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Higgins</surname><given-names>JPT</given-names> </name><name name-style="western"><surname>Clayton</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sterne</surname><given-names>JAC</given-names> </name><name name-style="western"><surname>Hr&#x00F3;bjartsson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Savovi&#x0107;</surname><given-names>J</given-names> </name></person-group><article-title>Empirical evidence of study design biases in randomized trials: systematic review of meta-epidemiological studies</article-title><source>PLoS ONE</source><year>2016</year><volume>11</volume><issue>7</issue><fpage>e0159267</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0159267</pub-id><pub-id pub-id-type="medline">27398997</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Savovi&#x0107;</surname><given-names>J</given-names> </name><name name-style="western"><surname>Turner</surname><given-names>RM</given-names> </name><name name-style="western"><surname>Mawdsley</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Association between risk-of-bias assessments and results of randomized trials in Cochrane reviews: the ROBES meta-epidemiologic study</article-title><source>Am J Epidemiol</source><year>2018</year><month>05</month><day>1</day><volume>187</volume><issue>5</issue><fpage>1113</fpage><lpage>1122</lpage><pub-id pub-id-type="doi">10.1093/aje/kwx344</pub-id><pub-id pub-id-type="medline">29126260</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Vollert</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>NR</given-names> </name><name name-style="western"><surname>Kaptchuk</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Sehra</surname><given-names>ST</given-names> </name><name name-style="western"><surname>Tobias</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Hall</surname><given-names>KT</given-names> </name></person-group><article-title>Assessment of placebo response in objective and subjective outcome measures in rheumatoid arthritis clinical trials</article-title><source>JAMA Netw Open</source><year>2020</year><month>09</month><day>1</day><volume>3</volume><issue>9</issue><fpage>e2013196</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2020.13196</pub-id><pub-id pub-id-type="medline">32936297</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Grimaldi</surname><given-names>V</given-names> </name><name name-style="western"><surname>Schiano</surname><given-names>C</given-names> </name><name name-style="western"><surname>Casamassimi</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Imaging techniques to evaluate cell therapy in peripheral artery disease: state of the art and clinical trials</article-title><source>Clin Physiol Funct Imaging</source><year>2016</year><month>05</month><volume>36</volume><issue>3</issue><fpage>165</fpage><lpage>178</lpage><pub-id pub-id-type="doi">10.1111/cpf.12210</pub-id><pub-id pub-id-type="medline">25385089</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Minozzi</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Cinquini</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gianola</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gonzalez-Lorenzo</surname><given-names>M</given-names> </name><name name-style="western"><surname>Banzi</surname><given-names>R</given-names> </name></person-group><article-title>The revised Cochrane risk of bias tool for randomized trials (RoB 2) showed low interrater reliability and challenges in its application</article-title><source>J Clin Epidemiol</source><year>2020</year><month>10</month><volume>126</volume><fpage>37</fpage><lpage>44</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2020.06.015</pub-id><pub-id pub-id-type="medline">32562833</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>da Costa</surname><given-names>BR</given-names> </name><name name-style="western"><surname>Beckett</surname><given-names>B</given-names> </name><name name-style="western"><surname>Diaz</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Effect of standardized training on the reliability of the Cochrane risk of bias assessment tool: a prospective study</article-title><source>Syst Rev</source><year>2017</year><month>03</month><day>3</day><volume>6</volume><issue>1</issue><fpage>44</fpage><pub-id pub-id-type="doi">10.1186/s13643-017-0441-7</pub-id><pub-id pub-id-type="medline">28253938</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Loef</surname><given-names>M</given-names> </name><name name-style="western"><surname>Walach</surname><given-names>H</given-names> </name><name name-style="western"><surname>Schmidt</surname><given-names>S</given-names> 
</name></person-group><article-title>Interrater reliability of ROB2&#x2014;an alternative measure and way of categorization</article-title><source>J Clin Epidemiol</source><year>2022</year><month>02</month><volume>142</volume><fpage>326</fpage><lpage>327</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2021.09.003</pub-id><pub-id pub-id-type="medline">34509629</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cejuela</surname><given-names>JM</given-names> </name><name name-style="western"><surname>McQuilton</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ponting</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Tagtog: Interactive and text-mining-assisted annotation of gene mentions in PLOS full-text articles</article-title><source>Database (Oxford)</source><year>2014</year><volume>2014</volume><fpage>bau033</fpage><pub-id pub-id-type="doi">10.1093/database/bau033</pub-id><pub-id pub-id-type="medline">24715220</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Neumann</surname><given-names>M</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Skjonsberg</surname><given-names>S</given-names> </name></person-group><article-title>PAWLS: PDF Annotation With Labels and Structure</article-title><conf-name>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing</conf-name><conf-date>Aug 1-6, 2021</conf-date><pub-id pub-id-type="doi">10.18653/v1/2021.acl-demo.31</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Deleger</surname><given-names>L</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Lingren</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Building gold standard corpora for medical natural language processing tasks</article-title><source>AMIA Annu Symp Proc</source><year>2012</year><volume>2012</volume><fpage>144</fpage><lpage>153</lpage><pub-id pub-id-type="medline">23304283</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Brandsen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Verberne</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wansleeben</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lambers</surname><given-names>K</given-names> </name></person-group><article-title>Creating a dataset for named entity recognition in the archaeology domain</article-title><access-date>2026-02-06</access-date><conf-name>Proceedings of the Twelfth Language Resources and Evaluation Conference (LREC 2020)</conf-name><conf-date>May 11-16, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2020.lrec-1.562/">https://aclanthology.org/2020.lrec-1.562/</ext-link></comment></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McHugh</surname><given-names>ML</given-names> </name></person-group><article-title>Interrater reliability: the kappa statistic</article-title><source>Biochem Med 
(Zagreb)</source><year>2012</year><volume>22</volume><issue>3</issue><fpage>276</fpage><lpage>282</lpage><pub-id pub-id-type="medline">23092060</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>J</given-names> </name></person-group><article-title>A coefficient of agreement for nominal scales</article-title><source>Educ Psychol Meas</source><year>1960</year><month>04</month><volume>20</volume><issue>1</issue><fpage>37</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.1177/001316446002000104</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Byrt</surname><given-names>T</given-names> </name><name name-style="western"><surname>Bishop</surname><given-names>J</given-names> </name><name name-style="western"><surname>Carlin</surname><given-names>JB</given-names> </name></person-group><article-title>Bias, prevalence and kappa</article-title><source>J Clin Epidemiol</source><year>1993</year><month>05</month><volume>46</volume><issue>5</issue><fpage>423</fpage><lpage>429</lpage><pub-id pub-id-type="doi">10.1016/0895-4356(93)90018-v</pub-id><pub-id pub-id-type="medline">8501467</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landis</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Koch</surname><given-names>GG</given-names> </name></person-group><article-title>The measurement of observer agreement for categorical data</article-title><source>Biometrics</source><year>1977</year><month>03</month><volume>33</volume><issue>1</issue><fpage>159</fpage><lpage>174</lpage><pub-id pub-id-type="doi">10.2307/2529310</pub-id><pub-id 
pub-id-type="medline">843571</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>P</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Shatkay</surname><given-names>H</given-names> </name></person-group><article-title>Figure and caption extraction from biomedical documents</article-title><source>Bioinformatics</source><year>2019</year><month>11</month><day>1</day><volume>35</volume><issue>21</issue><fpage>4381</fpage><lpage>4388</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btz228</pub-id><pub-id pub-id-type="medline">30949681</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>M</given-names> </name><name name-style="western"><surname>Han</surname><given-names>D</given-names> </name></person-group><article-title>UTTSR: A novel non-structured text table recognition model powered by deep learning technology</article-title><source>Appl Sci (Basel)</source><year>2023</year><volume>13</volume><issue>13</issue><fpage>7556</fpage><pub-id pub-id-type="doi">10.3390/app13137556</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> 
</name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="web"><source>ChatPDF GmbH</source><year>2023</year><access-date>2023-09-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.chatpdf.com">https://www.chatpdf.com</ext-link></comment></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Artstein</surname><given-names>R</given-names> </name></person-group><article-title>Inter-annotator agreement</article-title><source>Handbook of Linguistic Annotation</source><year>2017</year><publisher-name>Springer</publisher-name><fpage>297</fpage><lpage>313</lpage><pub-id pub-id-type="doi">10.1007/978-94-024-0881-2_11</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilbertson</surname><given-names>L</given-names> </name><name name-style="western"><surname>Langhorne</surname><given-names>P</given-names> </name><name name-style="western"><surname>Walker</surname><given-names>A</given-names> </name><name name-style="western"><surname>Allen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Murray</surname><given-names>GD</given-names> </name></person-group><article-title>Domiciliary occupational therapy for patients with stroke discharged from hospital: randomised controlled 
trial</article-title><source>BMJ</source><year>2000</year><month>03</month><day>4</day><volume>320</volume><issue>7235</issue><fpage>603</fpage><lpage>606</lpage><pub-id pub-id-type="doi">10.1136/bmj.320.7235.603</pub-id><pub-id pub-id-type="medline">10698876</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gleason</surname><given-names>CE</given-names> </name><name name-style="western"><surname>Dowling</surname><given-names>NM</given-names> </name><name name-style="western"><surname>Wharton</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Effects of hormone therapy on cognition and mood in recently postmenopausal women: findings from the randomized, controlled KEEPS&#x2013;cognitive and affective study</article-title><source>PLoS Med</source><year>2015</year><month>06</month><volume>12</volume><issue>6</issue><fpage>e1001833</fpage><pub-id pub-id-type="doi">10.1371/journal.pmed.1001833</pub-id><pub-id pub-id-type="medline">26035291</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stuck</surname><given-names>AE</given-names> </name><name name-style="western"><surname>Moser</surname><given-names>A</given-names> </name><name name-style="western"><surname>Morf</surname><given-names>U</given-names> </name><etal/></person-group><article-title>Effect of health risk assessment and counselling on health behaviour and survival in older people: a pragmatic randomised trial</article-title><source>PLoS Med</source><year>2015</year><month>10</month><volume>12</volume><issue>10</issue><fpage>e1001889</fpage><pub-id pub-id-type="doi">10.1371/journal.pmed.1001889</pub-id><pub-id pub-id-type="medline">26479077</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hassett</surname><given-names>L</given-names> </name><name name-style="western"><surname>van den Berg</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lindley</surname><given-names>RI</given-names> </name><etal/></person-group><article-title>Digitally enabled aged care and neurological rehabilitation to enhance outcomes with Activity and MObility UsiNg Technology (AMOUNT) in Australia: a randomised controlled trial</article-title><source>PLoS Med</source><year>2020</year><month>02</month><volume>17</volume><issue>2</issue><fpage>e1003029</fpage><pub-id pub-id-type="doi">10.1371/journal.pmed.1003029</pub-id><pub-id pub-id-type="medline">32069288</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thorndike</surname><given-names>AN</given-names> </name><name name-style="western"><surname>Mills</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sonnenberg</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Activity monitor intervention to promote physical activity of physicians-in-training: randomized controlled trial</article-title><source>PLoS ONE</source><year>2014</year><volume>9</volume><issue>6</issue><fpage>e100251</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0100251</pub-id><pub-id pub-id-type="medline">24950218</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Myer</surname><given-names>L</given-names> </name><name name-style="western"><surname>Phillips</surname><given-names>TK</given-names> </name><name name-style="western"><surname>Zerbe</surname><given-names>A</given-names> 
</name><etal/></person-group><article-title>Integration of postpartum healthcare services for HIV-infected women and their infants in South Africa: a randomised controlled trial</article-title><source>PLoS Med</source><year>2018</year><month>03</month><volume>15</volume><issue>3</issue><fpage>e1002547</fpage><pub-id pub-id-type="doi">10.1371/journal.pmed.1002547</pub-id><pub-id pub-id-type="medline">29601570</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Taylor</surname><given-names>SJC</given-names> </name><name name-style="western"><surname>Carnes</surname><given-names>D</given-names> </name><name name-style="western"><surname>Homer</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Novel three-day, community-based, nonpharmacological group intervention for chronic musculoskeletal pain (COPERS): a randomised clinical trial</article-title><source>PLoS Med</source><year>2016</year><month>06</month><volume>13</volume><issue>6</issue><fpage>e1002040</fpage><pub-id pub-id-type="doi">10.1371/journal.pmed.1002040</pub-id><pub-id pub-id-type="medline">27299859</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Schuurmans</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title><access-date>2026-03-31</access-date><conf-name>NIPS&#x2019;22: 36th International Conference on Neural Information Processing Systems</conf-name><conf-date>Nov 28 to Dec 9, 2022</conf-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://dl.acm.org/doi/10.5555/3600270.3602070">https://dl.acm.org/doi/10.5555/3600270.3602070</ext-link></comment></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mou</surname><given-names>W</given-names> </name><etal/></person-group><article-title>STAGER checklist: Standardized testing and assessment guidelines for evaluating generative artificial intelligence reliability</article-title><source>iMetaOmics</source><year>2024</year><month>09</month><volume>1</volume><issue>1</issue><fpage>e7</fpage><pub-id pub-id-type="doi">10.1002/imo2.7</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Documentation of signaling questions and corresponding annotation instructions used in this study.</p><media xlink:href="formative_v10i1e55127_app1.docx" xlink:title="DOCX File, 117 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Annotators&#x2019; feedback on the risk of bias assessment and annotation for the randomized controlled trials in the RoBuster corpus.</p><media xlink:href="formative_v10i1e55127_app2.pdf" xlink:title="PDF File, 238 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>The visual instruction guidelines in PPTX format. 
These placards are accompanied by Multimedia Appendix 1.</p><media xlink:href="formative_v10i1e55127_app3.pptx" xlink:title="PPTX File, 188 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>An .xlsx file containing the large language model (LLM) evaluation, detailing the text extraction and response of the LLM and the corresponding text span and response judgment labeled by the expert annotator for all the signaling questions.</p><media xlink:href="formative_v10i1e55127_app4.xlsx" xlink:title="XLSX File, 814 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>The RoBuster dataset in .tsv format.</p><media xlink:href="formative_v10i1e55127_app5.zip" xlink:title="ZIP File, 11757 KB"/></supplementary-material></app-group></back></article>