<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e90814</article-id><article-id pub-id-type="doi">10.2196/90814</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Automatic Speech Recognition and Large Language Models for Multilingual Pathology Report Generation: Proof-of-Concept Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Lin</surname><given-names>Kuan-Hsun</given-names></name><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chang</surname><given-names>Chia-Ping</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kuo</surname><given-names>Chen-Tsung</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hsu</surname><given-names>Chien-Yeh</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hung</surname><given-names>Shih-Hsin</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lien</surname><given-names>Chung-Yueh</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Siang Hao</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yeh</surname><given-names>Yi-Chen</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Chu</surname><given-names>Yuan-Chia</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff8">8</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Information Management, Taipei Veterans General Hospital</institution><addr-line>No. 201, Sec. 
2, Shipai Road, Beitou District</addr-line><addr-line>Taipei</addr-line><country>Taiwan</country></aff><aff id="aff2"><institution>Department of Information Management, National Taipei University of Nursing and Health Sciences</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><aff id="aff3"><institution>Department of Pathology and Laboratory Medicine, Taipei Veterans General Hospital</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><aff id="aff4"><institution>Master Program in Global Health and Health Security, College of Public Health, Taipei Medical University</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><aff id="aff5"><institution>Department of Nursing, Taipei Veterans General Hospital</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><aff id="aff6"><institution>Department of Nursing, Chang Jung Christian University</institution><addr-line>Tainan</addr-line><country>Taiwan</country></aff><aff id="aff7"><institution>School of Medicine, National Yang Ming Chiao Tung University</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><aff id="aff8"><institution>Big Data Center, Taipei Veterans General Hospital</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Steenstra</surname><given-names>Ivan</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Chatzimina</surname><given-names>Maria</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Naliyatthaliyazchayil</surname><given-names>Parvati Menon</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Karyukin</surname><given-names>Vladislav</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Yuan-Chia Chu, PhD, Department of Information Management, Taipei Veterans General Hospital, No. 201, Sec. 2, Shipai Road, Beitou District, Taipei, 112201, Taiwan, +886 986-680623; <email>xd.yuanchia@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>13</day><month>5</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e90814</elocation-id><history><date date-type="received"><day>04</day><month>01</month><year>2026</year></date><date date-type="rev-recd"><day>23</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>24</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Kuan-Hsun Lin, Chia-Ping Chang, Chen-Tsung Kuo, Chien-Yeh Hsu, Shih-Hsin Hung, Chung-Yueh Lien, Siang Hao Lee, Yi-Chen Yeh, Yuan-Chia Chu. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 13.5.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e90814"/><abstract><sec><title>Background</title><p>Accurate transcription of pathology gross examination dictation is important for clinical documentation, but multilingual dictation remains challenging in settings where clinicians mix Chinese and English while final pathology reports are written in English.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate whether a Whisper-based automatic speech recognition (ASR) pipeline guided by contextual system messages and combined with open-source large language models (LLMs; Qwen2:72b, Llama3.1:70b, Gemma2:27b) could improve multilingual (Chinese-English) pathology dictation transcription accuracy and generate clinically appropriate English gross description reports.</p></sec><sec sec-type="methods"><title>Methods</title><p>We conducted a controlled proof-of-concept study using 125 simulated mixed Chinese-English pathology gross examination audio recordings created by physicians or pathologists. Audio recordings were transcribed using Whisper ASR with and without a contextual system message. The ASR transcripts were then converted into English gross description reports using 3 open-source LLMs: Qwen2:72b, Llama3.1:70b, and Gemma2:27b. Outcomes included character error rate, Bilingual Evaluation Understudy, Recall-Oriented Understudy for Gisting Evaluation (ROUGE)-1, ROUGE-2, ROUGE-L, Metric for Evaluation of Translation with Explicit Ordering, pathologist Win-Tie-Lose rankings, report-level error categories, inference time, and interrater agreement.</p></sec><sec sec-type="results"><title>Results</title><p>The ASR contextual system message reduced the mean character error rate from 0.344 (SD 0.176; 95% CI 0.313&#x2010;0.375) to 0.066 (SD 0.100; 95% CI 0.048&#x2010;0.084; <italic>P</italic>&#x003C;.001). Qwen2:72b achieved the highest automated metric scores, including a Bilingual Evaluation Understudy of 0.644 (SD 0.307), ROUGE-1 of 0.866 (SD 0.163), ROUGE-2 of 0.771 (SD 0.235), ROUGE-L of 0.842 (SD 0.178), and Metric for Evaluation of Translation with Explicit Ordering of 0.805 (SD 0.214). Pathologist-coded total error rates were 16.8% (21/125) for Qwen2:72b, 45.6% (57/125) for Llama3.1:70b, and 92.8% (116/125) for Gemma2:27b. 
The exact agreement between the 2 pathologists across full ranking categories was 76.8% (96/125; Cohen &#x03BA;=0.668), and agreement on the top-ranked model or tied top group was 81.6% (102/125; Cohen &#x03BA;=0.722).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>In this proof-of-concept evaluation, contextual prompting improved ASR transcription accuracy, and Qwen2:72b generated the most accurate English pathology reports among the evaluated LLMs. However, the study used simulated audio recordings, a local vocabulary prompt, and report-level rather than term-level clinical annotation. LLM-generated reports should therefore be considered draft documentation requiring pathologist verification, and prospective validation in real clinical workflows is needed before clinical deployment.</p></sec></abstract><kwd-group><kwd>speech recognition software</kwd><kwd>natural language processing</kwd><kwd>electronic health records</kwd><kwd>multilingual</kwd><kwd>pathology</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Automatic speech recognition (ASR) technologies are increasingly being implemented in health care to improve workflow efficiency by automating the transcription of spoken input into text [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Accurate transcription of medical records is fundamental to patient care and clinical decision-making. However, the intricate language used in medical settings, including technical terms, abbreviations, and context-specific expressions, presents significant barriers to achieving high transcription accuracy [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. These limitations are particularly problematic in high-stakes environments such as pathology, where documentation errors can result in delays or incorrect diagnoses [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. Errors in transcription can lead to misinterpretations, impacting patient safety and clinical outcomes.</p><p>Despite advancements in ASR technologies, transcription accuracy remains a challenge, particularly when dealing with specialized medical terminology and complex clinical workflows [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. Current ASR systems often fall short, resulting in transcription errors that compromise the quality and reliability of electronic health records [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. Therefore, improving the accuracy and reliability of medical transcription systems is imperative for enhancing clinical documentation and ensuring patient safety [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>]. Moreover, directly using spoken language transcriptions in medical documents may be inappropriate, as clinicians&#x2019; speech often includes filler words, abbreviations, or incomplete pronunciations for the sake of efficiency. Converting spoken language transcriptions into the appropriate structured format remains a significant challenge. This challenge is even greater in multilingual settings. For example, in Taiwan, health care professionals frequently mix Chinese and English in spoken communication, while medical documents are typically written in English. 
As a result, spoken language transcriptions cannot be used verbatim in medical documents.</p><p>Recent advancements in large language models (LLMs) offer a promising solution to these challenges. LLMs, with their sophisticated contextual understanding and ability to process vast amounts of data, have demonstrated the potential to enhance ASR outputs by reducing transcription errors and improving language comprehension in specialized domains [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. While there is growing evidence supporting the use of LLMs in natural language processing, their application in medical transcription, especially in combination with ASR systems, remains underexplored. This study addresses this gap by integrating the Whisper ASR system with LLMs, including Qwen2:72b [<xref ref-type="bibr" rid="ref18">18</xref>], Llama3.1:70b [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>], and Gemma2:27b [<xref ref-type="bibr" rid="ref21">21</xref>], to improve transcription accuracy and generate clinically appropriate pathology reports.</p><p>A key aspect of this study is the use of system messages&#x2014;predefined instructions designed to guide the LLMs in accurately interpreting and generating clinical documentation [<xref ref-type="bibr" rid="ref22">22</xref>]. These messages are tailored to address the challenges of medical transcription, such as recognizing specialized terms and avoiding irrelevant content. By leveraging system messages, the study aims to enhance the accuracy of the Whisper ASR system, particularly in environments requiring high levels of precision, such as the documentation of gross pathology findings.</p><p>This study, conducted in Taiwan, focuses on the gross examination of pathology specimens, a process where precision in documenting specimen characteristics is essential. Traditionally, this documentation has been manual, which can be both time consuming and prone to errors. By integrating artificial intelligence&#x2013;powered ASR technology into the workflow, the process can be streamlined, allowing medical personnel to dictate their findings while handling specimens [<xref ref-type="bibr" rid="ref23">23</xref>]. This approach not only improves workflow efficiency but also reduces the potential for transcription errors, ultimately contributing to better clinical outcomes.</p><p>This study aimed to evaluate, in a controlled proof-of-concept setting, whether a Whisper-based ASR pipeline guided by system messages and combined with open-source LLMs could improve multilingual pathology dictation transcription and generate clinically appropriate English gross description reports. We hypothesized that system messages would reduce ASR transcription error and that LLM performance would differ across models in report quality, error profile, and inference time.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design and Setting</title><p>This proof-of-concept study evaluated a hybrid ASR-LLM pipeline for multilingual pathology gross description documentation (<xref ref-type="fig" rid="figure1">Figure 1</xref>). The study used 125 simulated mixed Chinese-English audio recordings created by physicians or pathologists to reflect common gross examination dictation patterns in Taiwan. The recordings were not collected in clinical care and did not contain real patient voices, patient data, or identifiable information. 
Each audio file was transcribed by Whisper ASR with and without a contextual system message. The resulting mixed-language transcripts were subsequently converted into English pathology gross description reports by 3 LLMs. Two pathologists evaluated the LLM-generated reports for clinical appropriateness and coded report-level errors. The study was designed as a controlled formative evaluation of technical feasibility and report quality rather than a clinical effectiveness or deployment trial.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Study workflow diagram. This figure provides an overview of the controlled proof-of-concept study design, including simulated audio creation, automatic speech recognition (ASR) transcription with and without contextual system messages, large language model (LLM)&#x2013;based English report generation, and evaluation by automated metrics and pathologist review. AI: artificial intelligence; BLEU: Bilingual Evaluation Understudy; ROUGE: Recall-Oriented Understudy for Gisting Evaluation; METEOR: Metric for Evaluation of Translation with Explicit Ordering.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e90814_fig01.png"/></fig></sec><sec id="s2-2"><title>Model Selection and System Integration</title><sec id="s2-2-1"><title>Whisper ASR System</title><p>We selected the Whisper ASR system for its capability in transcribing multilingual medical dialogues and its zero-shot accuracy across diverse language contexts&#x2014;a crucial feature in environments involving both Chinese and English medical terms [<xref ref-type="bibr" rid="ref8">8</xref>]. To quantify its effectiveness, we compared transcription results using the character error rate (CER), both with and without the application of system messages. These system messages are preconfigured prompts designed to enhance transcription precision.</p></sec><sec id="s2-2-2"><title>LLM Selection</title><p>The Whisper ASR system was integrated with LLMs through a Flask-based application programming interface (API). Three open-source LLMs were evaluated: Qwen2:72b, Llama3.1:70b, and Gemma2:27b. All models received the same ASR transcript and the same LLM system message for each case. The LLM stage transformed the mixed Chinese-English ASR transcript into a standardized English gross description report. The LLMs were instructed to normalize terminology and formatting but not to add information beyond the transcript. No model-specific fine-tuning was performed.</p></sec><sec id="s2-2-3"><title>System Messages</title><p>Two types of system messages were used (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). For ASR, the system message was provided to the Whisper transcription service as a contextual prompt and vocabulary guide at the time of transcription. CER was calculated on the resulting ASR transcript before any LLM-based report generation. Therefore, the CER comparison reflects transcripts generated with versus without the ASR contextual prompt, rather than postprocessing by the downstream LLM. 
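</p><p>As a minimal sketch of how such a contextual prompt can be supplied at transcription time, the example below uses the open-source whisper package, in which the <italic>initial_prompt</italic> argument is one way to pass contextual vocabulary; the model size, file name, and vocabulary list are hypothetical illustrations rather than the study&#x2019;s actual configuration.</p><preformat># Sketch: Whisper transcription with and without a contextual vocabulary prompt.
# The prompt text, model size, and file name are hypothetical examples only.
import whisper

model = whisper.load_model("large-v2")

# Contextual system message: dictation context plus expected local vocabulary.
context_prompt = (
    "Pathology gross examination dictation, mixed Chinese and English. "
    "Expected terms: specimen, mucosa, serosa, resection margin, cm."
)

with_prompt = model.transcribe("case_001.wav", initial_prompt=context_prompt)
without_prompt = model.transcribe("case_001.wav")

print(with_prompt["text"])
print(without_prompt["text"])</preformat><p>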
For LLM report generation, a separate system message instructed the models to convert the mixed-language transcript into an English pathology report, preserve only information present in the transcript, avoid unsupported interpretation, and exclude nonreport text.</p></sec><sec id="s2-2-4"><title>Computational Environment</title><p>All inference tasks were performed on 2 NVIDIA A100 graphics processing units (40 GB each), with the Ollama platform used for local model deployment. The models were run using Ollama default generation settings, including the default temperature of 0.8. Other generation hyperparameters were not exhaustively optimized or varied, and this is acknowledged as a reproducibility limitation.</p></sec></sec><sec id="s2-3"><title>Validation Process</title><sec id="s2-3-1"><title>CER Analysis for Whisper ASR Transcription</title><p>The transcription accuracy of Whisper ASR was evaluated using CER. Physician- or pathologist-prepared ground truth transcripts were used as the reference standard. For each audio recording, CER was calculated by comparing the ASR transcript with the corresponding ground truth transcript. CER was measured separately for transcripts generated with and without the ASR contextual system message, before any LLM-based report generation.</p></sec><sec id="s2-3-2"><title>Expert Ranking and Win-Tie-Lose Analysis for LLM-Generated Reports</title><p>Two pathologists independently evaluated the reports generated by Qwen2:72b, Llama3.1:70b, and Gemma2:27b for each transcription task. Rankings were based on overall accuracy and suitability for pathology gross description reporting. Ties were permitted when reports were judged clinically similar. The resulting rankings were analyzed using a Win-Tie-Lose framework. Interrater agreement was assessed using exact agreement and Cohen &#x03BA;.</p></sec><sec id="s2-3-3"><title>Error Type Analysis for LLM-Generated Reports</title><p>Physician- or pathologist-coded report-level errors were categorized as irrelevant text, Chinese character output, other factual or report errors, and total error. Irrelevant text referred to comments, instructions, or conversational content not belonging in a pathology report. Chinese character output indicated that Chinese characters remained in the English report. Other errors included clinically relevant inaccuracies such as incorrect organ names, measurements, margins, or descriptive statements. Total error indicated the presence of any of these error categories in a report. Error rates were summarized as counts, percentages, and 95% CIs.</p></sec><sec id="s2-3-4"><title>Evaluation Using Bilingual Evaluation Understudy, Recall-Oriented Understudy for Gisting Evaluation, and Metric for Evaluation of Translation With Explicit Ordering Metrics</title><p>Automated report generation performance was evaluated using Bilingual Evaluation Understudy (BLEU), Recall-Oriented Understudy for Gisting Evaluation (ROUGE)-1, ROUGE-2, ROUGE-L, and Metric for Evaluation of Translation with Explicit Ordering (METEOR) scores, with physician-prepared reference report summaries as the ground truth. These metrics were used to provide reproducible overlap-based comparisons across LLMs. As overlap metrics do not fully capture clinical correctness, they were interpreted alongside pathologist rankings and report-level error analysis.</p></sec><sec id="s2-3-5"><title>Statistical Analysis</title><p>Continuous metrics are summarized as mean, SD, median, range, and 95% CI. 
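</p><p>As an illustration of how the overlap metrics described in the previous subsection can be computed per report, the sketch below uses the open-source nltk and rouge-score packages; the example texts are hypothetical, and the study&#x2019;s exact tooling and tokenization are not specified here.</p><preformat># Sketch: per-report BLEU, ROUGE, and METEOR against a reference report.
# Example texts are hypothetical; actual tooling and tokenization may differ.
# meteor_score requires the NLTK wordnet data: nltk.download("wordnet")
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

reference = "The specimen consists of a segment of colon measuring 15 cm."
candidate = "The specimen consists of a colon segment measuring 15 cm."
ref_tokens, cand_tokens = reference.split(), candidate.split()

bleu = sentence_bleu([ref_tokens], cand_tokens,
                     smoothing_function=SmoothingFunction().method1)
meteor = meteor_score([ref_tokens], cand_tokens)

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"])
rouge_f = {k: v.fmeasure for k, v in scorer.score(reference, candidate).items()}

print(bleu, meteor, rouge_f)</preformat><p>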
For CER and automated text generation metrics, 95% CIs around the mean were estimated using the 2-tailed <italic>t</italic> distribution across the 125 paired recordings or generated reports. As CER values with and without ASR system messages were obtained from the same audio recordings, paired analyses were used to compare transcription performance. LLM-generated reports were compared across models using paired tests and nonparametric tests where appropriate. Report-level error rates are presented as counts, percentages, and Wilson 95% CIs. Interrater agreement between the 2 pathologists was assessed using exact agreement and Cohen &#x03BA;.</p></sec><sec id="s2-3-6"><title>Inference Speed Measurement</title><p>To compare the inference speed of the 3 LLMs&#x2014;Gemma2:27b, Qwen2:72b, and Llama3.1:70b&#x2014;under conditions approximating a clinical workflow, we measured inference time as a complete cycle from audio input through the ASR system to the LLM&#x2019;s output. Each model was tested using 125 audio samples across 10 repeated runs to ensure consistent and reliable results. The measurements included the mean, median, and maximum inference times, as well as the 95% CIs for each model.</p></sec></sec><sec id="s2-4"><title>Ethical Considerations</title><p>This study used only simulated audio recordings created for research and system evaluation purposes. The recordings did not include human participants, patient audio, patient data, or identifiable information. Therefore, ethics approval and informed consent were not required. Future studies using real clinical recordings or patient-related data would require appropriate institutional review and privacy safeguards.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>System Integration and Performance Analysis</title><sec id="s3-1-1"><title>Overview</title><p>The integration of the Whisper ASR system with LLMs was systematically evaluated for accuracy and clinical applicability (<xref ref-type="fig" rid="figure1">Figure 1</xref>). The evaluation included 3 main components: the transcription accuracy of the Whisper ASR system, the Win-Tie-Lose analysis of the pathology reports generated by the 3 LLMs, and the error type analysis of these reports. We analyzed the differences between the transcriptions generated by Whisper ASR and the ground truth transcriptions using the CER, comparing the effects of including or excluding system messages. Expert pathologists evaluated the pathology reports produced by the 3 LLMs&#x2014;Qwen2:72b, Llama3.1:70b, and Gemma2:27b&#x2014;and assigned rankings based on the accuracy and clinical relevance of the content. Additionally, a detailed error type analysis was conducted on the pathology reports generated by the LLMs, comparing differences among the models.</p></sec><sec id="s3-1-2"><title>Transcription Accuracy of the Whisper ASR System</title><p><xref ref-type="table" rid="table1">Table 1</xref> presents the CER results for Whisper ASR with and without contextual system messages. Without system messages, the mean CER was 0.344 (SD 0.176; median 0.293; 95% CI 0.313&#x2010;0.375). With system messages, the mean CER decreased to 0.066 (SD 0.100; median 0.024; 95% CI 0.048&#x2010;0.084), representing a mean paired reduction of 0.278 (<italic>P</italic>&#x003C;.001). 
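</p><p>A worked sketch of this per-recording CER computation and paired comparison is shown below; the jiwer and scipy packages and the example transcripts are illustrative assumptions rather than the study&#x2019;s confirmed tooling.</p><preformat># Sketch: per-recording CER and a paired comparison across ASR conditions.
# CER = character-level edit distance / reference character count.
# The short transcript lists stand in for the 125 study recordings.
from jiwer import cer
from scipy import stats

ground_truth = [
    "specimen labeled stomach, ulcer 3.5 cm at lesser curvature",
    "gallbladder, 8 cm in length, wall thickness 0.3 cm",
    "segment of colon, tumor 4 cm from the distal margin",
]
with_prompt = [
    "specimen labeled stomach, ulcer 3.5 cm at lesser curvature",
    "gallbladder, 8 cm in length, wall thickness 0.3 cm",
    "segment of colon, tumor 4 cm from distal margin",
]
without_prompt = [
    "specimen labeled stomach, also 3.5 cm at less curvature",
    "gall bladder, 8 cm in length, wall thickness is 3 cm",
    "segment of colon, tumor 4 cm from this margin",
]

cer_with = [cer(gt, hyp) for gt, hyp in zip(ground_truth, with_prompt)]
cer_without = [cer(gt, hyp) for gt, hyp in zip(ground_truth, without_prompt)]

# Paired test because both conditions are scored on the same recordings.
t_stat, p_value = stats.ttest_rel(cer_without, cer_with)
print(cer_with, cer_without, t_stat, p_value)</preformat><p>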
On the basis of this improvement, transcripts generated with the ASR contextual system message were used as inputs for subsequent LLM report generation.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Character error rate (CER) with and without an automatic speech recognition (ASR) system message (N=125).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">ASR condition</td><td align="left" valign="bottom">CER, mean (SD; 95% CI)<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom">Median (range)</td><td align="left" valign="bottom"><italic>P</italic> value<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">With ASR system message</td><td align="left" valign="top">0.066 (0.100; 0.048&#x2010;0.084)</td><td align="left" valign="top">0.024 (0.000&#x2010;0.538)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Without ASR system message</td><td align="left" valign="top">0.344 (0.176; 0.313&#x2010;0.375)</td><td align="left" valign="top">0.293 (0.137&#x2010;1.040)</td><td align="left" valign="top">Reference</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Character error rate values were calculated against physician- or pathologist-prepared ground truth transcripts.</p></fn><fn id="table1fn2"><p><sup>b</sup><italic>P</italic> value reflects a paired comparison because both conditions were evaluated on the same 125 recordings.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-1-3"><title>Objective Metric Evaluation Across LLMs</title><p>The performance of the LLM-generated pathology reports was assessed using BLEU, ROUGE-1, ROUGE-2, ROUGE-L, and METEOR scores, and the score distributions are shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>. Full descriptive statistics are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Qwen2:72b achieved the highest scores across all metrics, with mean scores of 0.644 (SD 0.307; 95% CI 0.590&#x2010;0.699), 0.866 (SD 0.163; 95% CI 0.837&#x2010;0.895), 0.771 (SD 0.235; 95% CI 0.729&#x2010;0.813), 0.842 (SD 0.178; 95% CI 0.811&#x2010;0.874), and 0.805 (SD 0.214; 95% CI 0.767&#x2010;0.843) for BLEU, ROUGE-1, ROUGE-2, ROUGE-L, and METEOR, respectively. Pairwise Wilcoxon tests showed that Qwen2:72b outperformed Llama3.1:70b and Gemma2:27b across all 5 automated metrics (all <italic>P</italic>&#x003C;.001).</p><p>Llama3.1:70b showed intermediate performance, with mean scores of 0.314, 0.717, 0.533, 0.675, and 0.604 for BLEU, ROUGE-1, ROUGE-2, ROUGE-L, and METEOR, respectively. Gemma2:27b had the lowest scores, with mean scores of 0.076, 0.227, 0.133, 0.203, and 0.203 for BLEU, ROUGE-1, ROUGE-2, ROUGE-L, and METEOR, respectively. Kruskal-Wallis tests across all 3 models were significant for all 5 metrics (all <italic>P</italic>&#x003C;.001).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Distribution of Bilingual Evaluation Understudy (BLEU), Recall-Oriented Understudy for Gisting Evaluation (ROUGE), and Metric for Evaluation of Translation with Explicit Ordering (METEOR) scores across large language models. This figure presents the distributions of BLEU, ROUGE-1, ROUGE-2, ROUGE-L, and METEOR scores for Qwen2:72b, Llama3.1:70b, and Gemma2:27b. 
Full descriptive statistics are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e90814_fig02.png"/></fig></sec><sec id="s3-1-4"><title>Win-Tie-Lose Distribution Across Models</title><p>The Win-Tie-Lose framework was applied to assess the clinical relevance of outputs from the 3 LLMs (<xref ref-type="fig" rid="figure3">Figure 3</xref>). Qwen2:72b achieved the highest proportion of win classifications, whereas Gemma2:27b had the highest proportion of lose classifications. The 2 pathologists showed substantial agreement in the ranking assessment (<xref ref-type="table" rid="table2">Table 2</xref>): exact agreement across the full ranking categories was 76.8% (96/125), with an unweighted Cohen &#x03BA; of 0.668. Agreement on the top-ranked model or tied top group was 81.6% (102/125), with a Cohen &#x03BA; of 0.722.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Model performance distribution: Win-Tie-Lose analysis. This figure illustrates the Win-Tie-Lose distribution for each large language model (Qwen2:72b, Llama3.1:70b, and Gemma2:27b) based on the 2 pathologists&#x2019; evaluations. L: lose; T: tie; W: win.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e90814_fig03.png"/></fig><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Interrater agreement for pathologist rankings (N=125).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Agreement analysis</td><td align="left" valign="bottom">Agreement, n (%)</td><td align="left" valign="bottom">Cohen &#x03BA;</td><td align="left" valign="bottom">Interpretation</td></tr></thead><tbody><tr><td align="left" valign="top">Full ranking category</td><td align="left" valign="top">96 (76.8)</td><td align="left" valign="top">0.668</td><td align="left" valign="top">Substantial agreement</td></tr><tr><td align="left" valign="top">Top-ranked model or tied top group</td><td align="left" valign="top">102 (81.6)</td><td align="left" valign="top">0.722</td><td align="left" valign="top">Substantial agreement</td></tr></tbody></table></table-wrap></sec><sec id="s3-1-5"><title>Error Type Distribution Across Models</title><p>Error type analysis is shown in <xref ref-type="fig" rid="figure4">Figure 4</xref> and summarized numerically in <xref ref-type="table" rid="table3">Table 3</xref>. Qwen2:72b had the lowest total error rate (21/125, 16.8%; 95% CI 11.3%&#x2010;24.3%), followed by Llama3.1:70b (57/125, 45.6%; 95% CI 37.1%&#x2010;54.3%) and Gemma2:27b (116/125, 92.8%; 95% CI 86.9%&#x2010;96.2%). These findings indicate that Qwen2:72b had the most favorable report-level error profile among the evaluated models.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Error type distribution. 
This figure visualizes the distribution of pathologist-coded report-level error categories across the 3 large language models, including irrelevant text, Chinese character output, other factual or report errors, and total errors.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e90814_fig04.png"/></fig><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Pathologist-coded report-level error rates (N=125).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Model and error category</td><td align="left" valign="bottom">Values, n (%)</td><td align="left" valign="bottom">95% CI</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">Qwen2:72b</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Irrelevant text</td><td align="left" valign="top">2 (1.6)</td><td align="left" valign="top">0.4&#x2010;5.6</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Chinese characters</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0.0&#x2010;3.0</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other factual or report error</td><td align="left" valign="top">20 (16.0)</td><td align="left" valign="top">10.6&#x2010;23.4</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Total error</td><td align="left" valign="top">21 (16.8)</td><td align="left" valign="top">11.3&#x2010;24.3</td></tr><tr><td align="left" valign="top" colspan="4">Llama3.1:70b</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Irrelevant text</td><td align="left" valign="top">38 (30.4)</td><td align="left" valign="top">23.0&#x2010;38.9</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Chinese characters</td><td align="left" valign="top">4 (3.2)</td><td align="left" valign="top">1.3&#x2010;7.9</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other factual or report error</td><td align="left" valign="top">28 (22.4)</td><td align="left" valign="top">16.0&#x2010;30.5</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Total error</td><td align="left" valign="top">57 (45.6)</td><td align="left" valign="top">37.1&#x2010;54.3</td></tr><tr><td align="left" valign="top" colspan="4">Gemma2:27b</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Irrelevant text</td><td align="left" valign="top">102 (81.6)</td><td align="left" valign="top">73.9&#x2010;87.4</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Chinese characters</td><td align="left" valign="top">14 (11.2)</td><td align="left" valign="top">6.8&#x2010;17.9</td></tr><tr><td align="left" valign="top" colspan="2"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other factual or report error</td><td align="left" valign="top">108 (86.4)</td><td align="left" valign="top">79.3&#x2010;91.3</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Total error</td><td align="left" valign="top">116 (92.8)</td><td align="left" valign="top">86.9&#x2010;96.2</td></tr></tbody></table></table-wrap></sec></sec><sec id="s3-2"><title>Impact of LLM Integration on Whisper ASR System for Pathology Report Generation</title><p>In <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, we present examples of pathology reports generated by the 3 LLM models alongside their corresponding Whisper ASR transcriptions from the audio recordings. These examples illustrate both successful terminology correction and failure modes such as irrelevant text, Chinese character output, and incorrect factual or report content.</p></sec><sec id="s3-3"><title>Inference Speed Comparison</title><p>The mean inference times for the models were similar, ranging from 5.17 to 5.43 seconds (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Qwen2:72b and Llama3.1:70b showed longer maximum inference times than Gemma2:27b, indicating that latency outliers should be considered in future real-time implementations.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This proof-of-concept study evaluated a hybrid ASR-LLM pipeline for multilingual pathology report generation. The main findings were that ASR contextual prompting substantially reduced CER, Qwen2:72b achieved the best automated text generation scores and the lowest pathologist-coded report-level error rate, and 2 pathologists showed substantial agreement in their model ranking assessments. These results support the technical feasibility of combining ASR and LLMs for multilingual pathology documentation, while also highlighting the need for human verification and real-world validation.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>Prior systematic reviews of speech recognition for clinical documentation have found that speech recognition can reduce report turnaround time and support documentation efficiency, but evidence is heterogeneous and documentation errors remain a major concern [<xref ref-type="bibr" rid="ref13">13</xref>]. Hodgson and Coiera [<xref ref-type="bibr" rid="ref24">24</xref>] reported speech recognition accuracy ranging from 88.9% to 96.0% across included studies and emphasized the need to evaluate error types and clinical outcomes. Johnson et al [<xref ref-type="bibr" rid="ref25">25</xref>] similarly noted that implementation depends on workflow, training, templates, accents, and system selection. Our study is consistent with this literature in showing improved transcription accuracy with contextual guidance but extends prior work by evaluating a multilingual pathology-specific ASR-LLM pipeline rather than ASR alone.</p><p>Few prior studies have specifically evaluated ASR for pathology gross examination dictation, and even fewer have examined mixed Chinese-English dictation followed by LLM-based English report generation. 
In another non-English medical context, Lee et al [<xref ref-type="bibr" rid="ref26">26</xref>] compared cloud-based speech recognition APIs for Korean medical terminology using real physician-patient conversations and found medical term recognition accuracies of 75.1%, 50.9%, and 57.9% across 3 APIs. Compared with that study, our controlled simulation used a different evaluation setting and metric, achieving a low mean CER with ASR contextual prompting; however, Lee et al [<xref ref-type="bibr" rid="ref26">26</xref>] used real clinical conversations and formal term-level medical terminology evaluation. This contrast highlights both the promise of contextual prompting in a pathology-specific pipeline and the need for future real-world validation with term-level concept annotation.</p></sec><sec id="s4-3"><title>Interpretation and Implications</title><p>The superior performance of Qwen2:72b suggests that model selection is important for multilingual medical documentation tasks. However, automated overlap metrics such as BLEU, ROUGE, and METEOR do not fully capture clinical correctness, unsupported inference, omission, or hallucination. The pathologist-coded error analysis therefore provides an important complement to automated metrics. Although LLMs sometimes corrected plausible ASR errors, this behavior may also introduce unsupported assumptions. Generated reports should therefore be treated as draft documentation requiring pathologist verification rather than autonomous clinical records [<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>The fixed vocabulary used in the ASR contextual prompt likely contributed to the CER improvement. This is clinically useful in a specialized setting such as pathology gross examination, where recurrent organ names, margins, measurements, and descriptive terms are common. At the same time, this design limits generalizability. Other hospitals, specialties, languages, accents, and staff dictation styles would require local vocabulary adaptation and prospective validation.</p></sec><sec id="s4-4"><title>Limitations</title><p>This study has several limitations. First, all audio recordings were simulated by physicians or pathologists in a controlled setting. Although this design enabled reproducible evaluation without patient data, it does not fully capture background noise, speaker variability, interruptions, overlapping speakers, or the operational complexity of real pathology grossing rooms. Second, only 2 pathologists evaluated the generated reports. We added interrater agreement statistics, but future studies should include more raters from multiple institutions. Third, the original annotation schema captured report-level errors but did not label every organ, measurement, margin, unit, or bilingual expression separately. Therefore, formal concept-level accuracy and critical-term preservation could not be calculated retrospectively without additional annotation. Fourth, LLM generation used local Ollama defaults, and not all hyperparameters were exhaustively optimized. Finally, this study did not evaluate integration with electronic health record systems, turnaround time in live clinical workflows, or downstream patient safety outcomes.</p></sec><sec id="s4-5"><title>Conclusions</title><p>This controlled proof-of-concept study suggests that contextual ASR prompting combined with LLM-based report generation may support multilingual pathology documentation. 
Qwen2:72b showed the strongest overall performance among the evaluated models, with the highest automated metric scores and the lowest report-level error rate. However, the findings should be interpreted as formative evidence rather than clinical validation. Future work should evaluate the pipeline prospectively in real pathology workflows, include broader speaker and environmental variability, perform term-level clinical concept annotation, and define human verification safeguards before clinical deployment.</p></sec></sec></body><back><ack><p>The authors thank the Big Data Center at Taipei Veterans General Hospital for assistance with data preparation and Dr Shang-Liang Wu for valuable input on statistical analysis. They also acknowledge the computational resources provided by TVGH Cloud 1. During manuscript revision, generative artificial intelligence (GenAI) tools, including ChatGPT, Codex, and OpenAI image generation tools (GPT Image), were used to assist with language editing, grammar refinement, organization of reviewer responses, formatting of revision materials, and preparation of original schematic visual elements for Figure 1. GenAI was not used to generate study data, perform independent scientific interpretation, make authorship decisions, or replace author oversight. All statistical analyses, numerical results, references, figure content, and manuscript revisions were reviewed and verified by the authors. No patient data or identifiable information was provided to GenAI tools. The large language model systems evaluated in the study were distinct from those used for writing and figure-preparation assistance.</p></ack><notes><sec><title>Funding</title><p>This study was supported by grants from the National Science and Technology Council, Taiwan (NSTC 113-2320-B-075-010; NSTC 114-2320-B-075-009), and Taipei Veterans General Hospital (V114E-006-2; V114C-149; V115E-005-2; V115C-093).</p></sec><sec><title>Data Availability</title><p>The datasets used and/or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>KHL and YCY conceptualized and designed the study and drafted the initial manuscript. CPC contributed to data acquisition and provided critical feedback on the methodology and analysis. CTK assisted with data analysis and interpretation and supported manuscript drafting. CYH contributed to statistical analysis and interpretation of results and made substantial revisions to the manuscript. SHH coordinated data collection and verified data integrity. SHL and CYL conducted data validation and assisted with the literature review. YCY and YCC revised and refined the manuscript, ensuring consistency and clarity. All authors reviewed and approved the final manuscript. 
YCY and YCC shared senior author and corresponding author responsibilities.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb2">ASR</term><def><p>automatic speech recognition</p></def></def-item><def-item><term id="abb3">BLEU</term><def><p>Bilingual Evaluation Understudy</p></def></def-item><def-item><term id="abb4">CER</term><def><p>character error rate</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">METEOR</term><def><p>Metric for Evaluation of Translation with Explicit Ordering</p></def></def-item><def-item><term id="abb7">ROUGE</term><def><p>Recall-Oriented Understudy for Gisting Evaluation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Latif</surname><given-names>S</given-names> </name><name name-style="western"><surname>Qadir</surname><given-names>J</given-names> </name><name name-style="western"><surname>Qayyum</surname><given-names>A</given-names> </name><name name-style="western"><surname>Usama</surname><given-names>M</given-names> </name><name name-style="western"><surname>Younis</surname><given-names>S</given-names> </name></person-group><article-title>Speech technology for healthcare: opportunities, challenges, and state of the art</article-title><source>IEEE Rev Biomed Eng</source><year>2021</year><volume>14</volume><fpage>342</fpage><lpage>356</lpage><pub-id pub-id-type="doi">10.1109/RBME.2020.3006860</pub-id><pub-id pub-id-type="medline">32746367</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Falcetta</surname><given-names>FS</given-names> </name><name name-style="western"><surname>de Almeida</surname><given-names>FK</given-names> </name><name name-style="western"><surname>Lemos</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Goldim</surname><given-names>JR</given-names> </name><name name-style="western"><surname>da Costa</surname><given-names>CA</given-names> </name></person-group><article-title>Automatic documentation of professional health interactions: a systematic review</article-title><source>Artif Intell Med</source><year>2023</year><month>03</month><volume>137</volume><fpage>102487</fpage><pub-id pub-id-type="doi">10.1016/j.artmed.2023.102487</pub-id><pub-id pub-id-type="medline">36868684</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hull</surname><given-names>M</given-names> </name></person-group><article-title>Medical language proficiency: a discussion of interprofessional language competencies and potential for patient risk</article-title><source>Int J Nurs Stud</source><year>2016</year><month>02</month><volume>54</volume><fpage>158</fpage><lpage>172</lpage><pub-id pub-id-type="doi">10.1016/j.ijnurstu.2015.02.015</pub-id><pub-id pub-id-type="medline">25863658</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Ching</surname><given-names>T</given-names> </name><name name-style="western"><surname>Himmelstein</surname><given-names>DS</given-names> </name><name name-style="western"><surname>Beaulieu-Jones</surname><given-names>BK</given-names> </name><etal/></person-group><article-title>Opportunities and obstacles for deep learning in biology and medicine</article-title><source>J R Soc Interface</source><year>2018</year><month>04</month><volume>15</volume><issue>141</issue><fpage>20170387</fpage><pub-id pub-id-type="doi">10.1098/rsif.2017.0387</pub-id><pub-id pub-id-type="medline">29618526</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shafer</surname><given-names>G</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>H</given-names> </name><name name-style="western"><surname>Suresh</surname><given-names>G</given-names> </name></person-group><article-title>Diagnostic errors in the neonatal intensive care unit: state of the science and new directions</article-title><source>Semin Perinatol</source><year>2019</year><month>12</month><volume>43</volume><issue>8</issue><fpage>151175</fpage><pub-id pub-id-type="doi">10.1053/j.semperi.2019.08.004</pub-id><pub-id pub-id-type="medline">31488330</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blease</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Bell</surname><given-names>SK</given-names> </name></person-group><article-title>Patients as diagnostic collaborators: sharing visit notes to promote accuracy and safety</article-title><source>Diagnosis (Berl)</source><year>2019</year><month>08</month><day>27</day><volume>6</volume><issue>3</issue><fpage>213</fpage><lpage>221</lpage><pub-id pub-id-type="doi">10.1515/dx-2018-0106</pub-id><pub-id pub-id-type="medline">31039128</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lippi</surname><given-names>G</given-names> </name><name name-style="western"><surname>Chance</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Church</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Preanalytical quality improvement: from dream to reality</article-title><source>Clin Chem Lab Med</source><year>2011</year><month>07</month><volume>49</volume><issue>7</issue><fpage>1113</fpage><lpage>1126</lpage><pub-id pub-id-type="doi">10.1515/CCLM.2011.600</pub-id><pub-id pub-id-type="medline">21517699</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Adedeji</surname><given-names>A</given-names> </name><name name-style="western"><surname>Joshi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Doohan</surname><given-names>B</given-names> </name></person-group><article-title>The sound of healthcare: improving medical transcription ASR accuracy with large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 12, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.07658</pub-id></nlm-citation></ref><ref 
id="ref9"><label>9</label><nlm-citation citation-type="thesis"><person-group person-group-type="author"><name name-style="western"><surname>Strohm</surname><given-names>L</given-names> </name></person-group><article-title>The augmented radiologist - challenges and opportunities for widescale implementation of AI-based applications in Dutch radiology departments [Master&#x2019;s thesis]</article-title><year>2019</year><access-date>2026-05-02</access-date><publisher-name>Utrecht University</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://studenttheses.uu.nl/bitstream/handle/20.500.12932/33716/Thesis_LeaStrohm%20-%20Digital.pdf?sequence=3&#x0026;isAllowed=y">https://studenttheses.uu.nl/bitstream/handle/20.500.12932/33716/Thesis_LeaStrohm%20-%20Digital.pdf?sequence=3&#x0026;isAllowed=y</ext-link></comment></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kumah-Crystal</surname><given-names>YA</given-names> </name><name name-style="western"><surname>Pirtle</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Whyte</surname><given-names>HM</given-names> </name><name name-style="western"><surname>Goode</surname><given-names>ES</given-names> </name><name name-style="western"><surname>Anders</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Lehmann</surname><given-names>CU</given-names> </name></person-group><article-title>Electronic health record interactions through voice: a review</article-title><source>Appl Clin Inform</source><year>2018</year><month>07</month><volume>9</volume><issue>3</issue><fpage>541</fpage><lpage>552</lpage><pub-id pub-id-type="doi">10.1055/s-0038-1666844</pub-id><pub-id pub-id-type="medline">30040113</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Luo</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>L</given-names> </name><name name-style="western"><surname>Adelgais</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name></person-group><article-title>Assessing the effectiveness of automatic speech recognition technology in emergency medicine settings: a comparative study of four AI-powered engines</article-title><source>J Healthc Inform Res</source><year>2025</year><volume>9</volume><issue>3</issue><fpage>494</fpage><lpage>512</lpage><pub-id pub-id-type="doi">10.1007/s41666-025-00193-w</pub-id><pub-id pub-id-type="medline">40726745</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kaswan</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Gaur</surname><given-names>L</given-names> </name><name name-style="western"><surname>Dhatterwal</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>R</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Gaur</surname><given-names>L</given-names> </name><name name-style="western"><surname>Solanki</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Wamba</surname><given-names>SF</given-names> </name><name name-style="western"><surname>Jhanjhi</surname><given-names>NZ</given-names> </name></person-group><article-title>AI-based natural language processing for the generation of meaningful information electronic health record (EHR) data</article-title><source>Advanced AI Techniques and Applications in Bioinformatics</source><year>2021</year><publisher-name>CRC Press</publisher-name><fpage>41</fpage><lpage>86</lpage><pub-id pub-id-type="doi">10.1201/9781003126164</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>L</given-names> </name><name name-style="western"><surname>Blackley</surname><given-names>SV</given-names> </name><name name-style="western"><surname>Kowalski</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Analysis of errors in dictated clinical documents assisted by speech recognition software and professional transcriptionists</article-title><source>JAMA Netw Open</source><year>2018</year><month>07</month><volume>1</volume><issue>3</issue><fpage>e180530</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2018.0530</pub-id><pub-id pub-id-type="medline">30370424</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bowman</surname><given-names>S</given-names> </name></person-group><article-title>Impact of electronic health record systems on information integrity: quality and safety implications</article-title><source>Perspect Health Inf Manag</source><year>2013</year><volume>10</volume><issue>Fall</issue><fpage>1c</fpage><pub-id pub-id-type="medline">24159271</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blijleven</surname><given-names>V</given-names> </name><name name-style="western"><surname>Koelemeijer</surname><given-names>K</given-names> </name><name name-style="western"><surname>Wetzels</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jaspers</surname><given-names>M</given-names> </name></person-group><article-title>Workarounds emerging from electronic health record system usage: consequences for patient safety, effectiveness of care, and efficiency of care</article-title><source>JMIR Hum Factors</source><year>2017</year><month>10</month><day>5</day><volume>4</volume><issue>4</issue><fpage>e27</fpage><pub-id pub-id-type="doi">10.2196/humanfactors.7978</pub-id><pub-id pub-id-type="medline">28982645</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Recording for eyes, not echoing to ears: contextualized spoken-to-written conversion of ASR transcripts</article-title><source>AAAI&#x2019;25/IAAI&#x2019;25/EAAI&#x2019;25: Proceedings of the Thirty-Ninth AAAI Conference on Artificial Intelligence and Thirty-Seventh Conference on Innovative Applications of 
Artificial Intelligence and Fifteenth Symposium on Educational Advances in Artificial Intelligence</source><year>2025</year><publisher-name>AAAI Press</publisher-name><pub-id pub-id-type="doi">10.1609/aaai.v39i23.34642</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kheddar</surname><given-names>H</given-names> </name><name name-style="western"><surname>Hemis</surname><given-names>M</given-names> </name><name name-style="western"><surname>Himeur</surname><given-names>Y</given-names> </name></person-group><article-title>Automatic speech recognition using advanced deep learning approaches: a survey</article-title><source>Inf Fusion</source><year>2024</year><month>09</month><volume>109</volume><fpage>102422</fpage><pub-id pub-id-type="doi">10.1016/j.inffus.2024.102422</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Hui</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Qwen2 technical report</article-title><source>arXiv</source><comment>Preprint posted online on Jul 15, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.10671</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Dubey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Grattafiori</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jauhri</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The Llama 3 herd of models</article-title><source>arXiv</source><comment>Preprint posted online on Jul 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.21783</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adams</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Truhn</surname><given-names>D</given-names> </name><name name-style="western"><surname>Busch</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Llama 3 challenges proprietary state-of-the-art large language models in radiology board-style examination questions</article-title><source>Radiology</source><year>2024</year><month>08</month><volume>312</volume><issue>2</issue><fpage>e241191</fpage><pub-id pub-id-type="doi">10.1148/radiol.241191</pub-id><pub-id pub-id-type="medline">39136566</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>Gemma Team</collab></person-group><article-title>Gemma 2: improving open language models at a practical size</article-title><source>arXiv</source><comment>Preprint posted online on Jul 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2408.00118</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Zhou</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>A scoping review of using large language models (LLMs) to investigate electronic health records (EHRs)</article-title><source>arXiv</source><comment>Preprint posted online on  May 5, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2405.03066</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="thesis"><person-group person-group-type="author"><name name-style="western"><surname>Pellecchia</surname><given-names>R</given-names> </name></person-group><article-title>Leveraging AI via speech-to-text and LLM integration for improved healthcare decision-making in primary care [Master's thesis]</article-title><year>2024</year><access-date>2026-05-03</access-date><publisher-name>Politecnico Milano</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.politesi.polimi.it/handle/10589/218053">https://www.politesi.polimi.it/handle/10589/218053</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hodgson</surname><given-names>T</given-names> </name><name name-style="western"><surname>Coiera</surname><given-names>E</given-names> </name></person-group><article-title>Risks and benefits of speech recognition for clinical documentation: a systematic review</article-title><source>J Am Med Inform Assoc</source><year>2016</year><month>04</month><volume>23</volume><issue>e1</issue><fpage>e169</fpage><lpage>e179</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocv152</pub-id><pub-id pub-id-type="medline">26578226</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lapkin</surname><given-names>S</given-names> </name><name name-style="western"><surname>Long</surname><given-names>V</given-names> </name><etal/></person-group><article-title>A systematic review of speech recognition technology in health care</article-title><source>BMC Med Inform Decis Mak</source><year>2014</year><month>10</month><day>28</day><volume>14</volume><fpage>94</fpage><pub-id pub-id-type="doi">10.1186/1472-6947-14-94</pub-id><pub-id pub-id-type="medline">25351845</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Park</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Min</surname><given-names>J</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>J</given-names> </name></person-group><article-title>Accuracy of cloud-based speech recognition open application programming interface for medical terms of Korean</article-title><source>J Korean Med Sci</source><year>2022</year><month>05</month><day>9</day><volume>37</volume><issue>18</issue><fpage>e144</fpage><pub-id pub-id-type="doi">10.3346/jkms.2022.37.e144</pub-id><pub-id pub-id-type="medline">35535371</pub-id></nlm-citation></ref><ref 
id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Asgari</surname><given-names>E</given-names> </name><name name-style="western"><surname>Monta&#x00F1;a-Brown</surname><given-names>N</given-names> </name><name name-style="western"><surname>Dubois</surname><given-names>M</given-names> </name><etal/></person-group><article-title>A framework to assess clinical safety and hallucination rates of LLMs for medical text summarisation</article-title><source>NPJ Digit Med</source><year>2025</year><month>05</month><day>13</day><volume>8</volume><issue>1</issue><fpage>274</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01670-7</pub-id><pub-id pub-id-type="medline">40360677</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Contextual system messages used for Whisper automatic speech recognition transcription and large language model&#x2013;based pathology report generation.</p><media xlink:href="formative_v10i1e90814_app1.docx" xlink:title="DOCX File, 38 KB"/></supplementary-material></app-group></back></article>