<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v8i1e58666</article-id><article-id pub-id-type="doi">10.2196/58666</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Facilitating Trust Calibration in Artificial Intelligence&#x2013;Driven Diagnostic Decision Support Systems for Determining Physicians&#x2019; Diagnostic Accuracy: Quasi-Experimental Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Sakamoto</surname><given-names>Tetsu</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Harada</surname><given-names>Yukinori</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1"/><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Shimizu</surname><given-names>Taro</given-names></name><degrees>MSc, 
MPH, MD, PhD, MBA, FACP</degrees><xref ref-type="aff" rid="aff1"/><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Diagnostic and Generalist Medicine, Dokkyo Medical University</institution>, <addr-line>880 Kitakobayashi, Mibu-cho, Shimotsuga-gun</addr-line><addr-line>Tochigi</addr-line>, <country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Turbe</surname><given-names>Hugues</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Hautz</surname><given-names>Stefanie C</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Lu</surname><given-names>Yong</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Tetsu Sakamoto, MD, Department of Diagnostic and Generalist Medicine, Dokkyo Medical University, 880 Kitakobayashi, Mibu-cho, Shimotsuga-gun, Tochigi, 321-0293, Japan, 81 282-86-1111, 81 282-86-4775; <email>stetsu@dokkyomed.ac.jp</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>all authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>27</day><month>11</month><year>2024</year></pub-date><volume>8</volume><elocation-id>e58666</elocation-id><history><date date-type="received"><day>22</day><month>03</month><year>2024</year></date><date date-type="rev-recd"><day>06</day><month>10</month><year>2024</year></date><date date-type="accepted"><day>10</day><month>10</month><year>2024</year></date></history><copyright-statement>&#x00A9; Tetsu Sakamoto, Yukinori Harada, Taro Shimizu. 
Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 27.11.2024. </copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2024/1/e58666"/><abstract><sec><title>Background</title><p>Diagnostic errors are significant problems in medical care. Despite the usefulness of artificial intelligence (AI)&#x2013;based diagnostic decision support systems, the overreliance of physicians on AI-generated diagnoses may lead to diagnostic errors.</p></sec><sec><title>Objective</title><p>We investigated the safe use of AI-based diagnostic decision support systems with trust calibration by adjusting trust levels to match the actual reliability of AI.</p></sec><sec sec-type="methods"><title>Methods</title><p>A quasi-experimental study was conducted at Dokkyo Medical University, Japan, with physicians allocated (1:1) to the intervention and control groups. 
A total of 20 clinical cases were created based on the medical histories recorded by an AI-driven automated medical history&#x2013;taking system from actual patients who visited a community-based hospital in Japan. The participants reviewed the medical histories of 20 clinical cases generated by an AI-driven automated medical history&#x2013;taking system with an AI-generated list of 10 differential diagnoses and provided 1 to 3 possible diagnoses. Physicians were asked whether the final diagnosis was in the AI-generated list of 10 differential diagnoses in the intervention group, which served as the trust calibration. We analyzed the diagnostic accuracy of physicians and the correctness of the trust calibration in the intervention group. We also investigated the relationship between the accuracy of the trust calibration and the diagnostic accuracy of physicians, and the physicians&#x2019; confidence level regarding the use of AI.</p></sec><sec sec-type="results"><title>Results</title><p>Among the 20 physicians assigned to the intervention (n=10) and control (n=10) groups, the mean age was 30.9 (SD 3.9) years and 31.7 (SD 4.2) years, the proportion of men was 80% and 60%, and the mean postgraduate year was 5.8 (SD 2.9) and 7.2 (SD 4.6), respectively, with no significant differences. The physicians&#x2019; diagnostic accuracy was 41.5% in the intervention group and 46% in the control group, with no significant difference (95% CI &#x2212;0.75 to 2.55; <italic>P</italic>=.27). The overall accuracy of the trust calibration was only 61.5%, and despite correct calibration, the diagnostic accuracy was 54.5%. In the multivariate logistic regression model, the accuracy of the trust calibration was a significant contributor to the diagnostic accuracy of physicians (adjusted odds ratio 5.90, 95% CI 2.93&#x2010;12.46; <italic>P</italic>&#x003C;.001). 
The mean confidence level for AI was 72.5% in the intervention group and 45% in the control group, with no significant difference.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Trust calibration did not significantly improve physicians&#x2019; diagnostic accuracy when considering the differential diagnoses generated by reading medical histories and the possible differential diagnosis lists of an AI-driven automated medical history&#x2013;taking system. As this was a formative study, the small sample size and suboptimal trust calibration methods may have contributed to the lack of significant differences. This study highlights the need for a larger sample size and the implementation of supportive measures of trust calibration.</p></sec></abstract><kwd-group><kwd>trust calibration</kwd><kwd>artificial intelligence</kwd><kwd>diagnostic accuracy</kwd><kwd>diagnostic decision support</kwd><kwd>decision support</kwd><kwd>diagnosis</kwd><kwd>diagnostic</kwd><kwd>chart</kwd><kwd>history</kwd><kwd>reliable</kwd><kwd>reliability</kwd><kwd>accurate</kwd><kwd>accuracy</kwd><kwd>AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Diagnostic errors pose a significant problem for the maintenance of high-quality medical care [<xref ref-type="bibr" rid="ref1">1</xref>], especially in the outpatient setting. In the United States, approximately 5% of outpatients were likely to encounter diagnostic errors [<xref ref-type="bibr" rid="ref2">2</xref>]. Recent data from Japan indicate that 3.9% of patients in primary care outpatient clinics have experienced diagnostic errors during the last decade [<xref ref-type="bibr" rid="ref3">3</xref>]. 
Thus, innovative approaches to improve diagnostic accuracy and minimize errors should be explored and adopted.</p><p>The implementation of artificial intelligence (AI)&#x2013;driven automated medical history&#x2013;taking systems with differential diagnosis generators is a promising solution, as these systems provide a list of potential differential diagnoses before the information is collected by physicians, thereby aiding more accurate diagnoses [<xref ref-type="bibr" rid="ref4">4</xref>]. However, AI-related diagnostic errors have become a problem [<xref ref-type="bibr" rid="ref5">5</xref>]. Among the multiple factors contributing to the diagnostic errors arising from AI implementation, the insufficient accuracy of AI systems is a prominent issue. For example, the diagnostic accuracy of AI-based differential diagnoses for trauma and musculoskeletal disorders was 73% [<xref ref-type="bibr" rid="ref6">6</xref>]. Another study revealed that the diagnostic accuracy of AI was 53% in patients who visited an outpatient internal medicine department and required hospitalization within 14 days [<xref ref-type="bibr" rid="ref7">7</xref>]. These reports indicate that AI alone is insufficient for a definitive diagnosis.</p><p>Nonetheless, AI-based diagnostic decision support systems can enhance diagnostic accuracy among physicians and medical students [<xref ref-type="bibr" rid="ref8">8</xref>], which has implications for clinical practice and medical education. However, concerns exist that inexperienced doctors may overly rely on AI diagnosis, even when the AI-provided diagnosis is incorrect [<xref ref-type="bibr" rid="ref9">9</xref>]. A recent study indicated that biased AI decreased the diagnostic precision of physicians and that providing explanations for AI reasoning did not improve the diagnostic precision [<xref ref-type="bibr" rid="ref10">10</xref>]. 
For the effective and safe implementation of AI-based diagnostic decision support systems in clinical settings, it is imperative to focus on two critical aspects: enhancing the diagnostic accuracy of AI systems and facilitating the development of physicians&#x2019; skills to evaluate the certainty levels of AI-generated diagnoses.</p><p>Prior research outside the health care domain has demonstrated the usefulness of &#x201C;trust calibration,&#x201D; which appropriately adjusts trust levels according to the reliability of an AI system [<xref ref-type="bibr" rid="ref11">11</xref>]. In a drone simulation study, trust calibration prevented people from excessively trusting AI, leading to performance improvements [<xref ref-type="bibr" rid="ref11">11</xref>]. However, in the medical field, previous studies examining the effectiveness of diagnostic decision support systems have not investigated the relationship between physicians&#x2019; final decisions using AI and their trust in the accuracy of AI-based diagnoses [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. Similarly, it is unclear whether assessing the accuracy of AI judgments will improve diagnostic safety when using AI-based diagnostic decision support systems.</p><p>Therefore, in this study, we aimed to examine whether physicians&#x2019; trust calibration for AI-based diagnostic decision support systems improves their diagnostic accuracy.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This study was conducted in accordance with the Declaration of Helsinki and was approved by the Research Ethics Committees of Dokkyo Medical University (R7112J) and Nagano Chuo Hospital (NCR202209). This study involved human subjects and adhered strictly to ethical research standards. All participants provided written informed consent prior to their involvement in the study. 
They were fully informed about the study&#x2019;s procedures, its purpose, the voluntary nature of their participation, and their right to withdraw at any time without consequence. To protect participants&#x2019; privacy and confidentiality, all personal data were anonymized, and access to this data was restricted to researchers directly involved in the study. There was no financial compensation for participation. No identifiable images of individual participants appear in the manuscript or supplementary material in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-2"><title>Study Design</title><p>This quasi-experimental study was conducted at the Dokkyo Medical University, Japan between August 9 and September 25, 2023.</p></sec><sec id="s2-3"><title>AI-Driven Automated Medical History&#x2013;Taking System</title><p>In this study, we used medical history data recorded by AI Monshin, an AI-driven automated medical history&#x2013;taking system widely used in more than 1,400 medical facilities in Japan. AI Monshin is a software that converts data entered by the patient on a tablet device into medical terms and summarizes them as medical history to provide the top 10 differential diagnoses. In the waiting room, patients entered their age, sex, and free-form description of their symptoms on a tablet. The AI software then selects approximately 20 questions tailored to the patient, which are presented sequentially on a tablet, and patients respond by choosing their answers from the displayed options. Questions were optimized according to past answers and a list of the most relevant candidate differential diagnoses was generated. 
Additional details of the AI Monshin have been described previously [<xref ref-type="bibr" rid="ref4">4</xref>].</p></sec><sec id="s2-4"><title>Case Creation</title><p>Twenty written clinical cases were created based on the medical histories recorded by an AI-driven automated medical history&#x2013;taking system from actual patients who visited Nagano Chuo Hospital, a community-based hospital in Japan. The following cases were selected. First, we included patients aged 18 years or older who used the AI-driven automated medical history&#x2013;taking system at the outpatient department of Nagano Chuo Hospital between May 1, 2019, and April 30, 2022, followed by hospitalization within 30 days. Patients without a confirmed diagnosis, those for whom the AI-driven automated medical history&#x2013;taking system did not list the differential diagnosis, and those who refused to use their data in this study were excluded. The requirement for informed consent from the patients was waived by the research ethics committee. Based on these criteria, 381 cases were stored for case creation. Second, we extracted data and the final diagnosis from the medical history recorded by the AI-driven automated medical history&#x2013;taking system. Third, the final diagnosis was coded using the <italic>International Classification of Diseases 11th Revision</italic> [<xref ref-type="bibr" rid="ref12">12</xref>]. The five most frequent disease categories were digestive, circulatory, and respiratory system diseases; neoplasms; and certain infectious or parasitic diseases. Fourth, two researchers (T Sakamoto and YH) independently determined whether the final diagnosis was included within the AI-generated list of differential diagnoses; any inconsistencies were resolved by discussion. The accuracy of the AI differential diagnosis list was 172/381 (45.1%). 
Fifth, two researchers (T Sakamoto and YH) independently classified the commonality of the final diagnosis (common or uncommon disease) and the typicality of the clinical presentation (typical or atypical presentation). Any inconsistencies were resolved by discussion: an uncommon disease was defined as a disease affecting less than 1 per 2000 people [<xref ref-type="bibr" rid="ref13">13</xref>], and judged based on the epidemiological data described in UpToDate [<xref ref-type="bibr" rid="ref14">14</xref>], DynaMed [<xref ref-type="bibr" rid="ref15">15</xref>], or other scientific literature. Moreover, a typical or atypical presentation was ascertained by referencing descriptions regarding each disease in UpToDate. We included this variable because atypical presentations have been identified as a risk factor for diagnostic errors [<xref ref-type="bibr" rid="ref16">16</xref>] and could also be a confounding factor in the results of our study. A total of 381 cases were classified into 4 categories: typical presentation of common disease (n=205, 53.8%), atypical presentation of common disease (n=52, 13.7%), typical presentation of uncommon disease (n=93, 24.4%), and atypical presentation of uncommon disease (n=31, 8.1%). Finally, based on the distribution of the disease category, commonality, and typicality in the patient population, we selected 20 cases. Each AI-generated list of differential diagnoses does not necessarily include the correct final diagnosis. We set an even distribution between cases in which the final diagnosis was included in the AI-generated list and cases in which it was not. This was done to prevent automation bias. 
<xref ref-type="table" rid="table1">Table 1</xref> provides detailed information on the distribution of the 20 cases.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Selected cases based on the distribution of disease category, commonality, and typicality in the patient population.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Case</td><td align="left" valign="bottom">Typicality</td><td align="left" valign="bottom">Commonality</td><td align="left" valign="bottom">AI&#x2019;s<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> answer</td></tr></thead><tbody><tr><td align="left" valign="top">Typhus fever due to <italic>Orientia tsutsugamushi</italic></td><td align="left" valign="top">Typical</td><td align="left" valign="top">Uncommon</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">Acute myocardial infarction</td><td align="left" valign="top">Typical</td><td align="left" valign="top">Common</td><td align="left" valign="top">True</td></tr><tr><td align="left" valign="top">Hepatocellular carcinoma of the liver</td><td align="left" valign="top">Atypical</td><td align="left" valign="top">Uncommon</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">Acute appendicitis</td><td align="left" valign="top">Typical</td><td align="left" valign="top">Common</td><td align="left" valign="top">True</td></tr><tr><td align="left" valign="top">Acute pancreatitis</td><td align="left" valign="top">Typical</td><td align="left" valign="top">Uncommon</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">Pneumonitis due to inhalation of food or vomit</td><td align="left" valign="top">Typical</td><td align="left" valign="top">Common</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">Gastroenteritis due to <italic>Campylobacter</italic></td><td align="left" 
valign="top">Typical</td><td align="left" valign="top">Common</td><td align="left" valign="top">True</td></tr><tr><td align="left" valign="top">Herpes zoster</td><td align="left" valign="top">Typical</td><td align="left" valign="top">Common</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">Congestive heart failure</td><td align="left" valign="top">Typical</td><td align="left" valign="top">Common</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">Acute pyelonephritis</td><td align="left" valign="top">Typical</td><td align="left" valign="top">Common</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">Polymyalgia rheumatica</td><td align="left" valign="top">Typical</td><td align="left" valign="top">Uncommon</td><td align="left" valign="top">True</td></tr><tr><td align="left" valign="top">Type 2 diabetes mellitus</td><td align="left" valign="top">Atypical</td><td align="left" valign="top">Common</td><td align="left" valign="top">True</td></tr><tr><td align="left" valign="top">Bacterial pneumonia</td><td align="left" valign="top">Atypical</td><td align="left" valign="top">Common</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">Pneumothorax</td><td align="left" valign="top">Typical</td><td align="left" valign="top">Uncommon</td><td align="left" valign="top">True</td></tr><tr><td align="left" valign="top">Pulmonary hypertension</td><td align="left" valign="top">Typical</td><td align="left" valign="top">Uncommon</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">Malignant neoplasm of the pancreas</td><td align="left" valign="top">Typical</td><td align="left" valign="top">Uncommon</td><td align="left" valign="top">True</td></tr><tr><td align="left" valign="top">Cerebral ischemic stroke</td><td align="left" valign="top">Typical</td><td align="left" valign="top">Common</td><td align="left" 
valign="top">True</td></tr><tr><td align="left" valign="top">Ischemic colitis</td><td align="left" valign="top">Atypical</td><td align="left" valign="top">Uncommon</td><td align="left" valign="top">True</td></tr><tr><td align="left" valign="top">Malignant neoplasms of the stomach</td><td align="left" valign="top">Typical</td><td align="left" valign="top">Common</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">Fracture of the spine</td><td align="left" valign="top">Typical</td><td align="left" valign="top">Common</td><td align="left" valign="top">True</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>AI: artificial intelligence.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-5"><title>Participants and Procedure</title><p>We recruited current and former physicians affiliated with the Department of Diagnostic and Generalist Medicine at Dokkyo Medical University Hospital, Japan. It is the referral department for consultations from within and outside the hospital for difficult-to-diagnose cases, one of its specialized tasks. Physicians who refused to participate were excluded. In this study, physicians were assigned by T Sakamoto to either the intervention or control group (1:1) using a computerized randomization process stratified by postgraduate year (PGY). The participants were not informed about whether they were assigned to the intervention group or the control group. Regardless of the group, each physician was requested to read 20 written clinical cases, arranged randomly, with the AI&#x2019;s list of 10 differential diagnoses, and then give 1 to 3 possible diagnoses (free text) for each case within 3 minutes. The 3-minute time allocated per case was derived from the assumption that physicians usually take less than 3 minutes to consider differential diagnoses from the AI-driven notes with a list of differential diagnoses in daily clinical practice. 
It was assumed that if the AI-based system could be used in a hospital outpatient setting and took only 3 minutes, it would be useful in practice. In the intervention group, physicians were presented with the statement &#x201C;Please consider whether the correct diagnosis is included in AI&#x2019;s list of differential diagnoses,&#x201D; in addition to receiving the information and the list of the AI&#x2019;s 10 differential diagnoses. They were then asked whether they believed that the final diagnosis was included in the AI-generated list of 10 differential diagnoses (Yes or No). This intervention served as the trust calibration in this study. To ensure successful collaboration between users and AI, the users need to adjust their trust level according to the actual reliability of the AI, a process called trust calibration [<xref ref-type="bibr" rid="ref11">11</xref>]. Trust calibration was designated as &#x201C;correct&#x201D; when the physicians&#x2019; judgment was correct on whether the final diagnosis was included in the AI&#x2019;s list of 10 differential diagnoses in the intervention group. Meanwhile, in the control group, there was no mention of &#x201C;Please consider whether the correct diagnosis is included in AI&#x2019;s list of differential diagnoses.&#x201D; They were also not asked whether they believed that the final diagnosis was included in the AI-generated list of 10 differential diagnoses.</p><p>After responding to all the cases, physicians in both groups were queried, &#x201C;What level of diagnostic accuracy would you anticipate for this AI Monshin&#x2019;s list of differential diagnoses?&#x201D; They were also instructed to rate their trust level in the AI on a scale of 0% to 100%, defined as the level of confidence in the AI. 
Using this confidence level, we examined the accuracy of the physician trust calibration.</p></sec><sec id="s2-6"><title>Data Collection and Outcomes</title><p>We collected data on the physicians&#x2019; age, sex, PGY, answers to 20 clinical cases, and confidence level in AI. The primary outcome was the physicians&#x2019; diagnostic accuracy. Each physician&#x2019;s score was evaluated based on 20 questions, with each question worth 1 point. The physicians&#x2019; diagnostic accuracy was determined by whether the final diagnosis matched any diagnosis in the physicians&#x2019; list of differential diagnoses. The average scores of the two groups were compared. The secondary outcome measure was the physicians&#x2019; correctness of trust calibration. The extent to which physicians trusted AI Monshin&#x2019;s list of differential diagnoses was assessed after adjusting for confounding factors. Two researchers (T Sakamoto and YH) independently evaluated the primary and secondary outcomes, and inconsistencies were resolved through discussion.</p></sec><sec id="s2-7"><title>Sample Size Calculation</title><p>As the physicians&#x2019; diagnostic accuracy was 57.4% in an experimental study using the same AI-based system [<xref ref-type="bibr" rid="ref9">9</xref>], we assumed a 55% diagnostic accuracy for physicians in the control group. No previous study has investigated the effect size of trust calibration on the physicians&#x2019; diagnostic accuracy. Therefore, we assumed that a 15% increase in physicians&#x2019; diagnostic accuracy through trust calibration was clinically significant. With this assumption, we calculated a sample size based on a 2-tailed Student <italic>t</italic> test, <italic>&#x03B1;</italic>=.05, power 0.8, allocation ratio 1:1, and SD 0.1, which resulted in the required sample size (number of participating physicians) of 9 per group (total n=18). 
Considering dropouts during the study, we determined that 20 physicians were required to participate.</p></sec><sec id="s2-8"><title>Statistical Analysis</title><p>Continuous variables were presented as medians with interquartile ranges and compared between the two groups using the Student <italic>t</italic> test. Categorical variables were presented as numbers and percentages and were compared using the <italic>&#x03C7;</italic><sup>2</sup> test. The primary outcome, physicians&#x2019; diagnostic accuracy, was compared between the two groups using the Student <italic>t</italic> test. The secondary outcome, the correctness of trust calibration in the intervention group, was calculated from the number of cases out of 200 where the physician could correctly distinguish whether the final diagnosis was included in the AI list of the 10 differential diagnoses. Furthermore, we used a multivariate logistic regression model to evaluate the accuracy of trust calibration on the diagnostic accuracy of physicians in the intervention group, adjusted for other factors such as disease commonality, disease typicality, sex, and PGY. The confidence level for AI was compared between the two groups using the Student <italic>t</italic> test. All <italic>P</italic> values in the statistical tests were 2-tailed, and <italic>P</italic> values &#x003C;.05 were considered statistically significant. All statistical analyses were performed using R version 4.3.2 (The R Foundation for Statistical Computing).</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Twenty physicians were included and assigned to the intervention (n=10) and control (n=10) groups, and there were no dropouts (<xref ref-type="fig" rid="figure1">Figure 1</xref>). 
The characteristics of the intervention and control groups were as follows: the mean age was 30.9 (SD 3.9) years and 31.7 (SD 4.2) years, the proportion of men was 80% (8/10) and 60% (6/10), and the mean PGY was 5.8 (SD 2.9) and 7.2 (SD 4.6), respectively. There was no significant intergroup difference in the baseline characteristics of the physicians.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>A quasi-experimental study was conducted at Dokkyo Medical University, Japan, with general physicians allocated (1:1) to the intervention and control groups to examine whether physicians&#x2019; trust calibration for AI-based diagnostic decision support systems improves their diagnostic accuracy.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v8i1e58666_fig01.png"/></fig><sec id="s3-1"><title>Evaluation Outcomes</title><p>The primary outcome, physicians&#x2019; diagnostic accuracy, was detected in 8.3 of 20 answers (41.5%) in the intervention group and in 9.2 of 20 answers (46%) in the control group. There was no significant intergroup difference in the physicians&#x2019; diagnostic accuracy (95% CI &#x2212;0.75 to 2.55; <italic>P</italic>=.27).</p><p>The secondary outcome, the correctness of trust calibration in the intervention group, was 61.5% (123/200; <xref ref-type="table" rid="table2">Table 2</xref>). The physicians&#x2019; diagnostic accuracy was 54.5% (67/123) in cases where trust calibration was correct, and 20.8% (16/77) in cases where trust calibration was incorrect. 
The accuracy of trust calibration was a significant contributor to the diagnostic accuracy of physicians (adjusted odds ratio 5.90, 95% CI 2.93&#x2010;12.46; <italic>P</italic>&#x003C;.001) in the multivariate logistic regression model (<xref ref-type="table" rid="table3">Table 3</xref>).</p><p>The confidence level for AI was 72.5% (10%&#x2010;80%) in the intervention group and 45% (30%&#x2010;80%) in the control group. There was no statistically significant intergroup difference (<italic>P</italic>=.12). The results are shown in <xref ref-type="table" rid="table4">Table 4</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Association between artificial intelligence (AI)&#x2019;s answer and trust calibration.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Trust calibration</td><td align="left" valign="bottom" colspan="2">AI&#x2019;s answer</td><td align="left" valign="bottom">Sum</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct</td><td align="left" valign="top">Incorrect</td><td align="left" valign="top"/></tr></thead><tbody><tr><td align="left" valign="top">Correct</td><td align="left" valign="top">87</td><td align="left" valign="top">36</td><td align="left" valign="top">123</td></tr><tr><td align="left" valign="top">Incorrect</td><td align="left" valign="top">13</td><td align="left" valign="top">64</td><td align="left" valign="top">77</td></tr></tbody></table></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Multivariate logistic regression analysis regarding the diagnostic accuracy of physicians.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category</td><td align="left" valign="bottom">Odds ratio</td><td align="left" valign="bottom">95% CI</td><td align="left" valign="bottom"><italic>P</italic> value<sup><xref ref-type="table-fn" 
rid="table3fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Physician variables</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Sex (male)</td><td align="left" valign="top">1.08</td><td align="left" valign="top">0.03&#x2010;0.46</td><td align="left" valign="top">.87</td></tr><tr><td align="left" valign="top">&#x2003;Years of postgraduation</td><td align="left" valign="top">1.06</td><td align="left" valign="top">0.93&#x2010;1.20</td><td align="left" valign="top">.38</td></tr><tr><td align="left" valign="top">Case variables</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Disease commonality</td><td align="left" valign="top">2.72</td><td align="left" valign="top">1.36&#x2010;5.57</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;Disease typicality</td><td align="left" valign="top">21.67</td><td align="left" valign="top">6.06&#x2010;139.25</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;Trust calibration</td><td align="left" valign="top">5.90</td><td align="left" valign="top">2.93&#x2010;12.46</td><td align="left" valign="top">&#x003C;.001</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup><italic>P</italic> values from multivariable logistic regression.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Trust levels for AI in the intervention and control groups.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Trust level (%)</td><td align="left" valign="bottom">Intervention group, n</td><td align="left" valign="bottom">Control group, n</td></tr></thead><tbody><tr><td align="char" char="hyphen" 
valign="top">0-10</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">0</td></tr><tr><td align="char" char="hyphen" valign="top">20-30</td><td align="char" char="." valign="top">0</td><td align="char" char="." valign="top">2</td></tr><tr><td align="char" char="hyphen" valign="top">40-50</td><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">6</td></tr><tr><td align="char" char="hyphen" valign="top">60-70</td><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">1</td></tr><tr><td align="char" char="hyphen" valign="top">80-90</td><td align="char" char="." valign="top">4</td><td align="char" char="." valign="top">1</td></tr></tbody></table></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>This study showed that physicians&#x2019; diagnostic accuracy did not differ between groups with or without trust calibration when considering the differential diagnoses by reading the medical history and lists of possible differential diagnoses of an AI-driven automated medical history&#x2013;taking system.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>In this study, the intervention with trust calibration for AI was not associated with an increase in the diagnostic accuracy. There are several possible reasons for this observation. First, there is a possibility that the trust calibration method is incorrect. In a previous study using drone simulators, the system issued warnings when people exhibited excessive confidence in the AI as its accuracy decreased [<xref ref-type="bibr" rid="ref11">11</xref>]. In this study, there was no material or warning to help physicians ascertain whether they were overly trusting, which did not improve the physicians&#x2019; diagnostic accuracy. Second, an automation bias may have influenced the results. 
Recent studies have suggested that excessive reliance on AI-based diagnostic-support tools may have adverse outcomes [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. In this study, physicians in the intervention group estimated the accuracy of the AI to be approximately 30% higher than the actual accuracy. As shown in <xref ref-type="table" rid="table4">Table 4</xref>, other than one outlier with a 10% confidence level in AI, the intervention group showed a tendency for excessive confidence in AI compared to the control group. The outlier was involved in research on the diagnostic accuracy of generative AI, which may have influenced the group&#x2019;s level of confidence in AI. This result indicates that physicians&#x2019; trust calibration of AI without objective indicators may lead to excessive confidence in the AI, resulting in incorrect diagnostic decisions. There are two possible solutions for overcoming excessive confidence in AI systems that aid diagnostic decisions. One is to show physicians the reasoning process of AI-driven diagnostic decision support systems in advance [<xref ref-type="bibr" rid="ref17">17</xref>], and the other is to utilize another AI that can indicate reliance on AI-driven diagnostic decision support systems [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. Conversely, trust calibration could make it easier for physicians to trust AI, potentially increasing physician satisfaction in clinical decision-making. More accurate trust calibration could improve diagnostic accuracy, leading to better clinical outcomes for patients and creating overall positive effects.</p><p>Clinical diagnostic decision support systems have demonstrated effectiveness in medical education as well. Diagnostic decision support systems based on patient history allow for the generation of more accurate differential diagnoses [<xref ref-type="bibr" rid="ref20">20</xref>]. 
This finding suggests that the use of clinical diagnostic decision support systems in medical education may increase further in the future. However, excessive reliance on AI-based diagnostic decision support systems could lead to diagnostic errors. Therefore, in addition to improving the accuracy of AI, as suggested by this study, there is a need to develop the ability to evaluate the reliability of AI. It is essential to implement solutions to overcome the excessive trust in AI mentioned earlier.</p><p>On multivariate logistic regression analysis, a correlation was observed between the accuracy of trust calibration and physicians&#x2019; diagnostic accuracy. This finding suggests the possibility of improving physicians&#x2019; diagnostic accuracy if they are provided with a more precise trust calibration. Additionally, the study results showed that a higher diagnostic accuracy was observed in the common and typical presentations of the cases.</p></sec><sec id="s4-3"><title>Limitations</title><p>This study had several limitations. First, it is unclear whether trust calibration was absent in the control group. Physicians often employ a dual-process medical decision-making model, which incorporates systems 1 and 2 to arrive at a diagnosis [<xref ref-type="bibr" rid="ref21">21</xref>]. Therefore, the control group may unconsciously engage in trust calibration as part of this dual-process model; however, further investigation is needed to confirm this hypothesis. Second, the patient data used in this study were collected from only one community hospital in Japan, and the disease frequency, commonality, and typicality may differ in other facilities. Third, the participants were young generalist physicians. Therefore, it is unclear whether these results are applicable to physicians in other specialties or to PGY groups. Additionally, the results may vary because of different cultural backgrounds, varying levels of medical training, and different health care systems. 
Fourth, the overall diagnostic accuracy of physicians was lower than that observed in previous studies, suggesting that clinical cases are difficult to solve. Fifth, physicians&#x2019; trust in AI may vary depending on the type of AI used, which may affect trust calibration. Sixth, because both groups reviewed the differential diagnosis list and supported cognitive reinforcement, there is a possibility of effect modification. Seventh, the sample size was limited. This study was exploratory, involving 20 physicians resolving 20 cases. The small sample size may have contributed to the lack of observed significance. Increasing the sample size could enhance the reliability of the results. To compensate for the lack of power, based on this study, it is estimated that approximately 158 participants would be ideal for the next trial. Eighth, our study is similar to previous trust calibration research in terms of evaluating the accuracy of AI. However, it differs in that participants did not have prior information to determine whether the AI was providing correct answers. In this study, it might have been beneficial to inform participants of the AI&#x2019;s accuracy beforehand. Therefore, this may not represent accurate trust calibration. In the next trial, the intervention group will be informed of the AI&#x2019;s diagnostic accuracy before starting the test. The test will be conducted on a computer-based platform, and we will incorporate the trust calibration-specific AI used in the past study [<xref ref-type="bibr" rid="ref11">11</xref>]. This system will alert participants in the intervention group when they are excessively or insufficiently trusting the AI, allowing for appropriate trust calibration. Ninth, this study suggests that a comprehensive evaluation of whether the AI&#x2019;s differential diagnosis is correct may be as ineffective as verifying one&#x2019;s own diagnosis [<xref ref-type="bibr" rid="ref22">22</xref>]. 
As previous research found encouraging more specific reflection, such as identifying &#x201C;where the inconsistencies are,&#x201D; to be effective [<xref ref-type="bibr" rid="ref23">23</xref>], applying such methods to the AI&#x2019;s differential diagnosis list could be a viable approach. Tenth, it is uncertain whether the 3-minute time limit was appropriate. Eleventh, automation bias related to AI may have influenced the results in this study; however, there is currently no clear method to prevent this bias. Twelfth, the potential impacts of trust calibration on other aspects of clinical decision-making, such as patient outcomes and physician satisfaction, were not evaluated. Thirteenth, there are currently no accurate and objective measures to evaluate trust calibration, making this a challenge for future research.</p></sec><sec id="s4-4"><title>Conclusions</title><p>Trust calibration did not significantly influence the physicians&#x2019; diagnostic accuracy in collaboration with the differential diagnosis list generated through AI-assisted medical history, which may, therefore, lack practical application in the real-world clinical setting. Nonetheless, based on past evidence, the introduction of a system that alerts physicians when they place excessive confidence in AI could encourage more precise trust calibration and thereby improve diagnostic accuracy. The significance of this study lies in its clear identification of the limitations of existing trust calibration. The study indicates that applying supportive measures with trust calibration, rather than utilizing only trust calibration, could improve diagnostic accuracy. As this study is formative, further studies incorporating an appropriate sample size and methods for trust calibration are necessary.</p></sec></sec></body><back><ack><p>This work was supported by JSPS KAKENHI, grant JP21K10355. We used ChatGPT4 to draft the point-by-point response for the revision process. 
In case of any issues with the generated text, the authors made the necessary corrections, and the final responsibility lies with the authors. We disclosed the original ChatGPT transcripts in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></ack><notes><sec><title>Data Availability</title><p>The datasets generated and analyzed during this study are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">PGY</term><def><p>postgraduate year</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Watari</surname><given-names>T</given-names> </name><name name-style="western"><surname>Tokuda</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mitsuhashi</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Factors and impact of physicians&#x2019; diagnostic errors in malpractice claims in Japan</article-title><source>PLoS ONE</source><year>2020</year><volume>15</volume><issue>8</issue><fpage>e0237145</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0237145</pub-id><pub-id pub-id-type="medline">32745150</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>H</given-names> </name><name name-style="western"><surname>Meyer</surname><given-names>AND</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>EJ</given-names> </name></person-group><article-title>The frequency of diagnostic 
errors in outpatient care: estimations from three large observational studies involving US adult populations</article-title><source>BMJ Qual Saf</source><year>2014</year><month>09</month><volume>23</volume><issue>9</issue><fpage>727</fpage><lpage>731</lpage><pub-id pub-id-type="doi">10.1136/bmjqs-2013-002627</pub-id><pub-id pub-id-type="medline">24742777</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aoki</surname><given-names>T</given-names> </name><name name-style="western"><surname>Watanuki</surname><given-names>S</given-names> </name></person-group><article-title>Multimorbidity and patient-reported diagnostic errors in the primary care setting: multicentre cross-sectional study in Japan</article-title><source>BMJ Open</source><year>2020</year><month>08</month><day>20</day><volume>10</volume><issue>8</issue><fpage>e039040</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2020-039040</pub-id><pub-id pub-id-type="medline">32819954</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Shimizu</surname><given-names>T</given-names> </name></person-group><article-title>Impact of a commercial artificial intelligence-driven patient self-assessment solution on waiting times at general internal medicine outpatient departments: retrospective study</article-title><source>JMIR Med Inform</source><year>2020</year><month>08</month><day>31</day><volume>8</volume><issue>8</issue><fpage>e21056</fpage><pub-id pub-id-type="doi">10.2196/21056</pub-id><pub-id pub-id-type="medline">32865504</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Parikh</surname><given-names>RB</given-names> </name><name name-style="western"><surname>Teeple</surname><given-names>S</given-names> </name><name name-style="western"><surname>Navathe</surname><given-names>AS</given-names> </name></person-group><article-title>Addressing bias in artificial intelligence in health care</article-title><source>JAMA</source><year>2019</year><month>12</month><day>24</day><volume>322</volume><issue>24</issue><fpage>2377</fpage><lpage>2378</lpage><pub-id pub-id-type="doi">10.1001/jama.2019.18058</pub-id><pub-id pub-id-type="medline">31755905</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schwitzguebel</surname><given-names>AJP</given-names> </name><name name-style="western"><surname>Jeckelmann</surname><given-names>C</given-names> </name><name name-style="western"><surname>Gavinio</surname><given-names>R</given-names> </name><name name-style="western"><surname>Levallois</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bena&#x00EF;m</surname><given-names>C</given-names> </name><name name-style="western"><surname>Spechbach</surname><given-names>H</given-names> </name></person-group><article-title>Differential diagnosis assessment in ambulatory care with an automated medical history-taking device: pilot randomized controlled trial</article-title><source>JMIR Med Inform</source><year>2019</year><month>11</month><day>4</day><volume>7</volume><issue>4</issue><fpage>e14044</fpage><pub-id pub-id-type="doi">10.2196/14044</pub-id><pub-id pub-id-type="medline">31682590</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kawamura</surname><given-names>R</given-names> </name><name 
name-style="western"><surname>Harada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sugimoto</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nagase</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Katsukura</surname><given-names>S</given-names> </name><name name-style="western"><surname>Shimizu</surname><given-names>T</given-names> </name></person-group><article-title>Incidence of diagnostic errors among unexpectedly hospitalized patients using an automated medical history-taking system with a differential diagnosis generator: retrospective observational study</article-title><source>JMIR Med Inform</source><year>2022</year><month>01</month><day>27</day><volume>10</volume><issue>1</issue><fpage>e35225</fpage><pub-id pub-id-type="doi">10.2196/35225</pub-id><pub-id pub-id-type="medline">35084347</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Friedman</surname><given-names>CP</given-names> </name><name name-style="western"><surname>Elstein</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Wolf</surname><given-names>FM</given-names> </name><etal/></person-group><article-title>Enhancement of clinicians&#x2019; diagnostic reasoning by computer-based consultation: a multisite study of 2 systems</article-title><source>JAMA</source><year>1999</year><month>11</month><day>17</day><volume>282</volume><issue>19</issue><fpage>1851</fpage><lpage>1856</lpage><pub-id pub-id-type="doi">10.1001/jama.282.19.1851</pub-id><pub-id pub-id-type="medline">10573277</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harada</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Katsukura</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kawamura</surname><given-names>R</given-names> </name><name name-style="western"><surname>Shimizu</surname><given-names>T</given-names> </name></person-group><article-title>Efficacy of artificial-intelligence-driven differential-diagnosis list on the diagnostic accuracy of physicians: an open-label randomized controlled study</article-title><source>Int J Environ Res Public Health</source><year>2021</year><month>02</month><day>21</day><volume>18</volume><issue>4</issue><fpage>2086</fpage><pub-id pub-id-type="doi">10.3390/ijerph18042086</pub-id><pub-id pub-id-type="medline">33669930</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jabbour</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fouhey</surname><given-names>D</given-names> </name><name name-style="western"><surname>Shepard</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Measuring the impact of AI in the diagnosis of hospitalized patients: a randomized clinical vignette survey study</article-title><source>JAMA</source><year>2023</year><month>12</month><day>19</day><volume>330</volume><issue>23</issue><fpage>2275</fpage><lpage>2284</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.22295</pub-id><pub-id pub-id-type="medline">38112814</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Okamura</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yamada</surname><given-names>S</given-names> </name></person-group><article-title>Adaptive trust calibration for human-AI collaboration</article-title><source>PLoS 
ONE</source><year>2020</year><volume>15</volume><issue>2</issue><fpage>e0229132</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0229132</pub-id><pub-id pub-id-type="medline">32084201</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="web"><article-title>International Classification of Diseases 11th revision</article-title><source>World Health Organization</source><access-date>2024-11-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://icd.who.int/en">https://icd.who.int/en</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="web"><source>Orphanet</source><access-date>2023-07-01</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.orpha.net/consor/cgi-bin/index.php">https://www.orpha.net/consor/cgi-bin/index.php</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="web"><source>UpToDate</source><access-date>2023-07-01</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.uptodate.com/contents/search">https://www.uptodate.com/contents/search</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="web"><source>DynaMed</source><year>1995</year><access-date>2023-07-01</access-date><publisher-name>EBSCO Information Services</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.dynamed.com">https://www.dynamed.com</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Otaka</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Katsukura</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Shimizu</surname><given-names>T</given-names> </name></person-group><article-title>Prevalence of atypical presentations among outpatients and associations with diagnostic error</article-title><source>Diagnosis (Berl)</source><year>2024</year><month>02</month><day>1</day><volume>11</volume><issue>1</issue><fpage>40</fpage><lpage>48</lpage><pub-id pub-id-type="doi">10.1515/dx-2023-0060</pub-id><pub-id pub-id-type="medline">38059495</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goddard</surname><given-names>K</given-names> </name><name name-style="western"><surname>Roudsari</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wyatt</surname><given-names>JC</given-names> </name></person-group><article-title>Automation bias: a systematic review of frequency, effect mediators, and mitigators</article-title><source>J Am Med Inform Assoc</source><year>2012</year><volume>19</volume><issue>1</issue><fpage>121</fpage><lpage>127</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2011-000089</pub-id><pub-id pub-id-type="medline">21685142</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khera</surname><given-names>R</given-names> </name><name name-style="western"><surname>Simon</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Ross</surname><given-names>JS</given-names> </name></person-group><article-title>Automation bias and assistive AI: risk of harm from AI-driven clinical decision support</article-title><source>JAMA</source><year>2023</year><month>12</month><day>19</day><volume>330</volume><issue>23</issue><fpage>2255</fpage><lpage>2257</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.22557</pub-id><pub-id 
pub-id-type="medline">38112824</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Okamura</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yamada</surname><given-names>S</given-names> </name></person-group><article-title>Empirical evaluations of framework for adaptive trust calibration in human-AI cooperation</article-title><source>IEEE Access</source><year>2020</year><volume>8</volume><fpage>220335</fpage><lpage>220351</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2020.3042556</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yanagita</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Shikino</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ishizuka</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Improving diagnostic accuracy using a clinical decision support system for medical students during history-taking: a randomized clinical trial</article-title><source>BMC Med Educ</source><year>2023</year><month>05</month><day>25</day><volume>23</volume><issue>1</issue><fpage>383</fpage><pub-id pub-id-type="doi">10.1186/s12909-023-04370-6</pub-id><pub-id pub-id-type="medline">37231512</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Djulbegovic</surname><given-names>B</given-names> </name><name name-style="western"><surname>Hozo</surname><given-names>I</given-names> </name><name name-style="western"><surname>Beckstead</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tsalatsanis</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Pauker</surname><given-names>SG</given-names> </name></person-group><article-title>Dual processing model of medical decision-making</article-title><source>BMC Med Inform Decis Mak</source><year>2012</year><month>09</month><day>3</day><volume>12</volume><fpage>94</fpage><pub-id pub-id-type="doi">10.1186/1472-6947-12-94</pub-id><pub-id pub-id-type="medline">22943520</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Norman</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sherbino</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dore</surname><given-names>K</given-names> </name><etal/></person-group><article-title>The etiology of diagnostic errors: a controlled trial of system 1 versus system 2 reasoning</article-title><source>Acad Med</source><year>2014</year><month>02</month><volume>89</volume><issue>2</issue><fpage>277</fpage><lpage>284</lpage><pub-id pub-id-type="doi">10.1097/ACM.0000000000000105</pub-id><pub-id pub-id-type="medline">24362377</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mamede</surname><given-names>S</given-names> </name><name name-style="western"><surname>van Gog</surname><given-names>T</given-names> </name><name name-style="western"><surname>van den Berge</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Effect of availability bias and reflective reasoning on diagnostic accuracy among internal medicine residents</article-title><source>JAMA</source><year>2010</year><month>09</month><day>15</day><volume>304</volume><issue>11</issue><fpage>1198</fpage><lpage>1203</lpage><pub-id pub-id-type="doi">10.1001/jama.2010.1276</pub-id><pub-id 
pub-id-type="medline">20841533</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Original ChatGPT transcripts.</p><media xlink:href="formative_v8i1e58666_app1.docx" xlink:title="DOCX File, 57 KB"/></supplementary-material></app-group></back></article>