<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e85169</article-id><article-id pub-id-type="doi">10.2196/85169</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Comparing Large Language Models and Traditional Machine Translation Tools for Translating Medical Consultation Summaries: Quantitative Pilot Feasibility Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Li</surname><given-names>Andy</given-names></name><degrees>BEE, BCS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhou</surname><given-names>Wei</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Hoda</surname><given-names>Rashina</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Bain</surname><given-names>Chris</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Poon</surname><given-names>Peter</given-names></name><degrees>MBBS, PGDipPM, CHIA</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Faculty of Information Technology, Monash University</institution><addr-line>Room 221, Clayton Campus, 20 Exhibition Walk</addr-line><addr-line>Clayton</addr-line><country>Australia</country></aff><aff id="aff2"><institution>Faculty of Medicine, Nursing and Health Sciences, Monash University</institution><addr-line>Wellington Rd, Clayton VIC 3800</addr-line><addr-line>Monash</addr-line><country>Australia</country></aff><aff id="aff3"><institution>Supportive and Palliative Care Unit, Monash Health</institution><addr-line>Monash</addr-line><country>Australia</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Abudalfa</surname><given-names>Shadi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Karyukin</surname><given-names>Vladislav</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Rashina Hoda, PhD, Faculty of Information Technology, Monash University, Room 221, Clayton Campus, 20 Exhibition Walk, Clayton, 3800, Australia, +61 3 9902 9970; <email>Rashina.hoda@monash.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>13</day><month>4</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e85169</elocation-id><history><date 
date-type="received"><day>02</day><month>10</month><year>2025</year></date><date date-type="rev-recd"><day>06</day><month>02</month><year>2026</year></date><date date-type="accepted"><day>16</day><month>02</month><year>2026</year></date></history><copyright-statement>&#x00A9; Andy Li, Wei Zhou, Rashina Hoda, Chris Bain, Peter Poon. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 13.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e85169"/><abstract><sec><title>Background</title><p>Translation of medical consultation summaries is essential for equitable health care communication in culturally and linguistically diverse populations. 
While machine translation (MT) tools and large language models (LLMs) are widely accessible, their feasibility and safety for health care contexts remain underexplored.</p></sec><sec><title>Objective</title><p>This pilot study investigates the feasibility and limitations of using LLMs and traditional MT tools to translate medical consultation summaries from English into the most common languages other than English spoken in Australia&#x2014;Arabic, Chinese (simplified written form), and Vietnamese.</p></sec><sec sec-type="methods"><title>Methods</title><p>Two simulated summaries&#x2014;a simple patient-facing summary and a complex clinician-oriented interprofessional letter&#x2014;were translated using 3 LLMs (GPT-4o, Llama-3.1, and Gemma-2) and 3 MT tools (Google Translate, Microsoft Bing Translator, and DeepL). Translations were benchmarked against professional third-party interpreter translations using Bilingual Evaluation Understudy, Character-level F-score, and Metric for Evaluation of Translation with Explicit Ordering metrics.</p></sec><sec sec-type="results"><title>Results</title><p>The translation performance varied across languages, tools, and summary complexity when assessed using automatic evaluation metrics. Traditional MT tools outperformed LLMs on surface-level metrics, while LLMs showed relative strengths in semantic similarity for Vietnamese and Chinese. Arabic translations improved with complex input, suggesting morphological advantages. The metric-based evaluation highlighted feasibility but also risks, particularly in Chinese clinical contexts.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This pilot study provides formative evidence of opportunities and limitations in applying artificial intelligence translation for health care communication. 
Findings underscore the importance of human oversight; domain-specific evaluation metrics; and further formative and clinical research to guide the safe, equitable use of artificial intelligence translation tools.</p></sec></abstract><kwd-group><kwd>feasibility study</kwd><kwd>pilot evaluation</kwd><kwd>translation</kwd><kwd>large language models</kwd><kwd>machine translation</kwd><kwd>consultation summary</kwd><kwd>responsible artificial intelligence</kwd><kwd>responsible AI</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Machine translation (MT) has seen rapid evolution in recent years, particularly with the advent of large language models (LLMs). Traditional neural MT tools, such as Google Translate [<xref ref-type="bibr" rid="ref1">1</xref>], Microsoft Bing Translator [<xref ref-type="bibr" rid="ref2">2</xref>], and DeepL [<xref ref-type="bibr" rid="ref3">3</xref>], have been widely used in general and domain-specific applications. These tools rely heavily on sequence-to-sequence architectures and large-scale parallel corpora, showing strong performance for high-resource language pairs with abundant linguistic data. However, the effectiveness of MT is often constrained in domains that demand specialized vocabulary and contextual precision, such as medical communication [<xref ref-type="bibr" rid="ref4">4</xref>]. 
While studies on digital translation in clinical care have reported improvements in communication quality and efficiency, they also highlighted limitations of MT in accuracy and further noted that direct, word-for-word translation, without accounting for cultural and contextual nuances, may lead to patient misunderstanding or even emotional distress [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>In contrast, LLMs such as GPT [<xref ref-type="bibr" rid="ref7">7</xref>], Gemma [<xref ref-type="bibr" rid="ref8">8</xref>], and Llama [<xref ref-type="bibr" rid="ref9">9</xref>] have emerged as versatile alternatives capable of performing a wide range of natural language tasks [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref16">16</xref>], including language translations [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. These models leverage extensive pretraining on diverse corpora, enabling them to capture broader contextual relationships and discourse-level semantics across languages. Previous studies have shown that LLMs can outperform traditional MT tools in document-level linguistic translation by better preserving coherence and semantic intent [<xref ref-type="bibr" rid="ref17">17</xref>]. Moreover, recent work has shown that LLMs can act as effective quality estimators for translation output, even without explicit references [<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>Despite these advances, the application of LLMs in medical translation remains underexplored and presents unique challenges [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. Medical texts require not only accurate translation of terminology and clinical concepts but also contextual sensitivity, as even small errors can lead to harm [<xref ref-type="bibr" rid="ref4">4</xref>]. 
Terminological consistency, expansion of abbreviations, and the handling of multilingual clinical guidelines add layers of complexity [<xref ref-type="bibr" rid="ref21">21</xref>]. These challenges are heightened in multilingual health care settings such as Australia, where languages such as Arabic, Chinese, and Vietnamese are widely spoken but often underrepresented in medical translation tools [<xref ref-type="bibr" rid="ref22">22</xref>]. The accuracy and safety of using general-purpose LLMs in such contexts remain largely untested.</p><p>This pilot feasibility study builds on existing research by providing an early-stage, empirical comparison of LLMs and traditional MT tools in translating palliative care consultation summaries&#x2014;a use case that is both medically sensitive and linguistically nuanced [<xref ref-type="bibr" rid="ref23">23</xref>]. Our study focuses on 3 of the most spoken languages other than English in Australia&#x2014;Arabic, Chinese, and Vietnamese&#x2014;each posing different linguistic and morphological challenges [<xref ref-type="bibr" rid="ref22">22</xref>]. While Vietnamese, like many other languages, is relatively undersupported in traditional MT tools (eg, not supported by DeepL), LLMs have the potential to fill these gaps through flexible, prompt-based translation.</p><p>We designed 2 types of simulated summaries in English to reflect real-world use cases: a simple summary for patients, written in lay language and a complex, clinician-targeted version featuring domain-specific jargon, abbreviations, and nuanced medical reasoning. Translations were generated using default prompt settings for each LLM and default web versions of each MT tool and evaluated against professional third-party medical interpreter translations. 
Automatic evaluation metrics&#x2014;Bilingual Evaluation Understudy (BLEU) [<xref ref-type="bibr" rid="ref24">24</xref>], Character-level F-score (CHR-F) [<xref ref-type="bibr" rid="ref25">25</xref>], and Metric for Evaluation of Translation with Explicit Ordering (METEOR) [<xref ref-type="bibr" rid="ref26">26</xref>]&#x2014;were used to capture surface-level overlap, morphological robustness, and semantic similarity, respectively [<xref ref-type="bibr" rid="ref27">27</xref>]. This multifaceted analysis allowed us to assess the models not only in terms of fidelity to reference but also their ability to generalize across languages and document complexity.</p><p>By comparing performance across tool types, summary complexity, and languages, this formative study provides insights into the current feasibility and limitations of LLM-based translation for medical applications. Our findings highlight language-specific challenges, the shortcomings of current evaluation metrics for clinical accuracy, and the implications of deploying AI translation tools in health care communication. Importantly, translation errors in clinical contexts can compromise patient safety, reduce provider efficiency, and hinder equitable access to care [<xref ref-type="bibr" rid="ref28">28</xref>].</p><p>This study is intentionally designed as a <italic>pilot feasibility investigation</italic>. Its goal is not to benchmark optimal system performance or establish clinical readiness but to explore practical risks, limitations, and methodological challenges of using LLMs and MT tools for medical translation and to inform the design of future, clinically grounded evaluation studies.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>This pilot feasibility study presents an early-stage comparison between popular LLMs and traditional MT tools in translating medical documents. 
We selected the latest and most capable variants of 3 state-of-the-art LLMs: GPT-4o (OpenAI), Gemma-2-27B (Google), and Llama-3.1-405B (Meta). For traditional MT tools, we selected 3 widely used services: Google Translate, Microsoft Bing Translator, and DeepL. LLMs have demonstrated strong capabilities in general-purpose translation, and commercial translation products based on LLMs have already been introduced into the market. However, the extent to which LLMs can be reliably and safely used for translating medical texts, especially in safety-critical areas such as digital health, remains largely unexplored. This study aims to investigate this gap by evaluating translation quality across both LLMs and traditional MT tools. Our design was intentionally small scale and exploratory, with the goal of generating formative evidence to inform larger clinical studies.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study used entirely fictitious medical consultation summaries that were constructed by an experienced palliative care clinician and did not contain any real patient information, personally identifiable data, or clinical records. As a result, formal human research ethics approval was not required.</p></sec><sec id="s2-3"><title>Original Summaries in English</title><p>To simulate realistic clinical use cases, we created 2 types of fictitious consultation summaries in English. The first is a short and relatively simple summary with minimal medical terminology intended for patient-facing medical documents. It summarizes a simulated palliative care consultation of a patient with progressive dementia who recently recovered from a urinary tract infection. It captures the necessary components of a patient-facing summary, including medical history, recent care updates, and future recommendations. 
The second summary is longer and more complex, featuring extensive medical jargon, including abbreviations and acronyms, typically used for communication purposes between clinicians (eg, correspondence letter between the specialist and family physicians) or toward health care provider documents such as discharge summaries. These summaries were designed to reflect common clinical communication scenarios while ensuring no real patient data were used. The complex summary described a patient with advanced acute myeloid leukemia and treatment complications, requiring specialized care.</p><p>It is important to emphasize that the summaries were entirely fictitious and meticulously constructed by a qualified palliative care expert with over 25 years of clinical experience. No personally identifiable information or sensitive patient data were included. This ensured compliance with ethical standards and minimized risks to participants, in line with formative study expectations.</p></sec><sec id="s2-4"><title>Selected Languages</title><p>The target languages for translation were Arabic, Chinese (simplified), and Vietnamese, which were chosen because they are the 3 most commonly spoken languages other than English in Australia, as reported by the Australian Bureau of Statistics [<xref ref-type="bibr" rid="ref22">22</xref>]. Due to DeepL not supporting Vietnamese at the time of the study, its outputs for that language were not included in our analysis. The language set was chosen to reflect the culturally and linguistically diverse populations most relevant to the Australian health care system.</p></sec><sec id="s2-5"><title>Reference Translations</title><p>The reference translations for both summaries were generated by a professional third-party translation service with extensive experience in translating medical documents. The service is certified under ISO 17100, an international standard that establishes requirements to ensure the quality of translation services. 
All translators involved were accredited by the National Accreditation Authority for Translators and Interpreters. Each translation underwent a rigorous quality assurance process, which included a review by an independent translator (ie, a linguist not involved in the original translation task or providing verification) to identify typographical errors, formatting inconsistencies, untranslated or mistranslated content, and placeholder artifacts that may arise during document handling or after editing.</p><p>To ensure that reference translations were reflective of industry practice and standards, we engaged accredited translation services. However, it is important to note that the contemporary professional translation workflow commonly incorporates machine-generated drafts as a starting point for human postediting. There are no regulations that prohibit the use of machine-generated translations, and ISO 17100 also incorporates guidelines for using machine-generated translations as preliminary drafts. Consequently, while every effort was made to ensure the quality and accuracy of the translations, obtaining translations that are purely human generated from first principles is increasingly difficult in modern professional settings. While the translation provider indicated that the Chinese and Vietnamese translations were produced entirely by human translators, the first draft of the Arabic translation was generated with the assistance of artificial intelligence (AI)&#x2013;based translation tools and later reviewed by the human translator. This further underscores the increasingly widespread adoption and normalization of generative AI technologies to assist professional translation services.</p></sec><sec id="s2-6"><title>Generated Translations</title><p>For each summary, 2 sets of translations were generated as shown in the translation task matrix (<xref ref-type="table" rid="table1">Table 1</xref>). The first set consisted of 3 LLM-generated translations. 
This experiment aimed to mimic laypeople using LLMs as a translation service; therefore, the basic web platform was used instead of the developer&#x2019;s application programming interface, which means all parameters (including system prompt) were set as the model&#x2019;s default values. All LLM-generated translations also used the same basic prompt: &#x201C;Can you translate this document into Arabic/Chinese/Vietnamese, make sure no information is lost. &#x2018;<italic>DOCUMENT</italic>&#x2019;,&#x201D; where <italic>DOCUMENT</italic> is the original summary. All LLM experiments in this study were conducted using a <italic>zero-shot setting</italic>, without any examples, prompt tuning, or system-level customization. The single, minimal prompt was intentionally chosen to reflect typical lay-user (eg, patients) behavior, rather than to evaluate the maximum achievable capability of each model. No terminology constraints, abbreviation handling rules, unit normalization, or do-not-translate rules were provided. 
Therefore, the results should be interpreted as reflecting realistic, nonexpert use, not optimized or clinician-engineered performance.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Translation task matrix (summary type&#x00D7;language&#x00D7;system).<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Summary type and language</td><td align="left" valign="bottom">GPT</td><td align="left" valign="bottom">Gemma</td><td align="left" valign="bottom">Llama</td><td align="left" valign="bottom">Google Translate</td><td align="left" valign="bottom">Microsoft Bing</td><td align="left" valign="bottom">DeepL</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="7">Simple</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Arabic</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Chinese</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Vietnamese</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" 
valign="top"/></tr><tr><td align="left" valign="top" colspan="7">Complex</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Arabic</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Chinese</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Vietnamese</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top"/></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>DeepL did not support Vietnamese translation at the time of this study. A total of 34 translations were produced across 36 combinations.</p></fn></table-wrap-foot></table-wrap><p>The second set of translations was produced using traditional MT tools&#x2014;Google Translate, Microsoft Bing Translator, and DeepL&#x2014;by inputting the same English summaries into their publicly accessible web interfaces. As with the LLMs, no customization or advanced application programming interface use was applied, ensuring that the outputs reflect a typical user experience. 
This approach reflects feasibility testing rather than optimized performance.</p></sec><sec id="s2-7"><title>Statistical Analysis</title><p>We used 3 widely used automatic metrics from the MT domain to obtain a comprehensive assessment for each generated summary. These were BLEU, CHR-F, and METEOR. Although originally developed for MT, these metrics remain widely applied to LLM outputs and were used here to provide indicative, rather than definitive, measures of quality [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. Currently, there are no standardized evaluation metrics specifically designed for LLM-generated translations, although recent studies suggest LLMs themselves may serve as quality estimators [<xref ref-type="bibr" rid="ref18">18</xref>]. A brief description of each metric used and its limitations are listed in <xref ref-type="table" rid="table2">Table 2</xref>. The values of each metric range from 0 to 1, where 1 indicates an identical match between the reference translation and generated translation, while 0 indicates no overlap at all.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Description and limitations of each statistical metric used in the evaluation.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Metric</td><td align="left" valign="bottom">Description</td><td align="left" valign="bottom">Limitation</td></tr></thead><tbody><tr><td align="left" valign="top">BLEU (Bilingual Evaluation Understudy)</td><td align="left" valign="top">Measures the word level of overlapping between candidate and reference translations. 
It is the most common metric in machine translation.</td><td align="left" valign="top">Lacks acceptance of synonyms, paraphrasing, and overall semantic meaning.</td></tr><tr><td align="left" valign="top">CHR-F (Character-level F-score)</td><td align="left" valign="top">Measures the character level of overlapping between candidate and reference translations, making it particularly effective for languages with complex morphology or segmentation issues.</td><td align="left" valign="top">Lacks acceptance of synonyms, paraphrasing, and overall semantic meaning.</td></tr><tr><td align="left" valign="top">METEOR (Metric for Evaluation of Translation with Explicit Ordering)</td><td align="left" valign="top">Aligns words between candidate and reference translations, incorporating synonyms, stemming, and paraphrasing, offering a balance between surface and semantic matching, and is also sensitive to syntax.</td><td align="left" valign="top">Struggles with complex semantics or meanings that require deep understanding.</td></tr></tbody></table></table-wrap><p>Specifically, BLEU measures the <italic>surface similarity at the word level compared to the reference translation</italic>. A high score in BLEU in the health setting means the medical terminology and linguistics for symptoms or diagnosis are captured accurately. CHR-F measures the <italic>surface similarity at the character level compared to the reference translation</italic>, which handles morphological variation better than BLEU. A high CHR-F score means that the generated translation is capturing medical jargon, especially abbreviations, better. METEOR measures <italic>similarity at the semantic level</italic>. 
A high METEOR score indicates that even when a medical condition or advice is expressed differently, the core semantic content is still preserved.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p><xref ref-type="table" rid="table3">Table 3</xref> presents a comparative evaluation of translation quality for 2 English medical consultation summaries&#x2014;simple (layperson-friendly) and complex (clinician-oriented)&#x2014;translated into Arabic, Chinese, and Vietnamese using 3 LLMs (GPT-4o, Llama-3.1, and Gemma-2) and 3 traditional MT tools (Google Translate, Microsoft Bing Translator, and DeepL). Translation outputs were scored using 3 established automatic evaluation metrics: BLEU (for surface-level n-gram overlap), CHR-F (for character-level fidelity), and METEOR (for semantic similarity and paraphrasing tolerance).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Translation performance comparison across 3 automatic evaluation metrics&#x2014;Bilingual Evaluation Understudy (BLEU), Character-level F-score (CHR-F), and Metric for Evaluation of Translation with Explicit Ordering (METEOR)&#x2014;for 3 LLMs (GPT-4o, Llama-3.1, and Gemma-2) and 3 traditional MT tools (Google Translate, Microsoft Bing Translator, and DeepL).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">GPT</td><td align="left" valign="bottom">Llama</td><td align="left" valign="bottom">Gemma</td><td align="left" valign="bottom">Microsoft Bing Translator</td><td align="left" valign="bottom">Google Translate</td><td align="left" valign="bottom">DeepL</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="7">Arabic</td></tr><tr><td align="left" valign="top" colspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Simple</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BLEU</td><td align="left" valign="top">0.6536</td><td align="left" valign="top">0.5947</td><td align="left" valign="top">0.6051</td><td align="char" char="." valign="top">0.6405</td><td align="char" char="." valign="top">0.6528</td><td align="char" char="." valign="top">0.6523</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CHR-F</td><td align="left" valign="top">0.5064</td><td align="left" valign="top">0.4562</td><td align="left" valign="top">0.4693</td><td align="char" char="." valign="top">0.5078</td><td align="char" char="." valign="top">0.5151</td><td align="char" char="." valign="top">0.5156</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>METEOR</td><td align="left" valign="top">0.3166</td><td align="left" valign="top">0.2568</td><td align="left" valign="top">0.2810</td><td align="char" char="." valign="top">0.3289</td><td align="char" char="." valign="top">0.3399</td><td align="char" char="." valign="top">0.3048</td></tr><tr><td align="left" valign="top" colspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Complex</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BLEU</td><td align="left" valign="top">0.6300</td><td align="left" valign="top">0.6029</td><td align="left" valign="top">0.6006</td><td align="char" char="." 
valign="top">0.6149</td><td align="char" char="." valign="top">0.6787</td><td align="char" char="." valign="top">0.6692</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CHR-F</td><td align="left" valign="top">0.4957</td><td align="left" valign="top">0.4726</td><td align="left" valign="top">0.4768</td><td align="char" char="." valign="top">0.5005</td><td align="char" char="." valign="top">0.5907</td><td align="char" char="." valign="top">0.5423</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>METEOR</td><td align="left" valign="top">0.3343</td><td align="left" valign="top">0.3575</td><td align="left" valign="top">0.3682</td><td align="char" char="." valign="top">0.4012</td><td align="char" char="." valign="top">0.4988</td><td align="char" char="." valign="top">0.4171</td></tr><tr><td align="left" valign="top" colspan="7">Chinese</td></tr><tr><td align="left" valign="top" colspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Simple</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BLEU</td><td align="left" valign="top">0.4266</td><td align="left" valign="top">0.4164</td><td align="left" valign="top">0.5227</td><td align="char" char="." valign="top">0.5018</td><td align="char" char="." valign="top">0.4358</td><td align="char" char="." 
valign="top">0.4322</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CHR-F</td><td align="left" valign="top">0.4165</td><td align="left" valign="top">0.3910</td><td align="left" valign="top">0.4392</td><td align="char" char="." valign="top">0.4193</td><td align="char" char="." valign="top">0.3709</td><td align="char" char="." valign="top">0.3844</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>METEOR</td><td align="left" valign="top">0.4698</td><td align="left" valign="top">0.5033</td><td align="left" valign="top">0.5678</td><td align="char" char="." valign="top">0.5169</td><td align="char" char="." valign="top">0.4564</td><td align="char" char="." valign="top">0.5097</td></tr><tr><td align="left" valign="top" colspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Complex</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BLEU</td><td align="left" valign="top">0.2662</td><td align="left" valign="top">0.2422</td><td align="left" valign="top">0.2466</td><td align="char" char="." valign="top">0.2393</td><td align="char" char="." valign="top">0.2596</td><td align="char" char="." 
valign="top">0.2603</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CHR-F</td><td align="left" valign="top">0.2871</td><td align="left" valign="top">0.2387</td><td align="left" valign="top">0.2576</td><td align="char" char="." valign="top">0.2427</td><td align="char" char="." valign="top">0.2729</td><td align="char" char="." valign="top">0.2526</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>METEOR</td><td align="left" valign="top">0.3055</td><td align="left" valign="top">0.3347</td><td align="left" valign="top">0.3394</td><td align="char" char="." valign="top">0.3294</td><td align="char" char="." valign="top">0.3349</td><td align="char" char="." valign="top">0.3331</td></tr><tr><td align="left" valign="top" colspan="7">Vietnamese</td></tr><tr><td align="left" valign="top" colspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Simple</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BLEU</td><td align="left" valign="top">0.7188</td><td align="left" valign="top">0.7517</td><td align="left" valign="top">0.7458</td><td align="char" char="." valign="top">0.7464</td><td align="char" char="." 
valign="top">0.7719</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CHR-F</td><td align="left" valign="top">0.5956</td><td align="left" valign="top">0.5979</td><td align="left" valign="top">0.5957</td><td align="char" char="." valign="top">0.5864</td><td align="char" char="." valign="top">0.6184</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>METEOR</td><td align="left" valign="top">0.5566</td><td align="left" valign="top">0.5894</td><td align="left" valign="top">0.5489</td><td align="char" char="." valign="top">0.5373</td><td align="char" char="." valign="top">0.5908</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Complex</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BLEU</td><td align="left" valign="top">0.6857</td><td align="left" valign="top">0.7441</td><td align="left" valign="top">0.7118</td><td align="char" char="." valign="top">0.7059</td><td align="char" char="." valign="top">0.7325</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CHR-F</td><td align="left" valign="top">0.5494</td><td align="left" valign="top">0.6055</td><td align="left" valign="top">0.6130</td><td align="char" char="." valign="top">0.5862</td><td align="char" char="." 
valign="top">0.6252</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>METEOR</td><td align="left" valign="top">0.4652</td><td align="left" valign="top">0.4787</td><td align="left" valign="top">0.5399</td><td align="char" char="." valign="top">0.5222</td><td align="char" char="." valign="top">0.5744</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Not available.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Simple vs Complex Summary Translation Performance</title><p>We observed a consistent pattern across the Chinese and Vietnamese languages, where simple summaries&#x2014;written in layperson language with minimal technical jargon&#x2014;achieved higher scores across most metrics compared with complex summaries. This trend was less pronounced or even reversed in Arabic. For example, in Vietnamese, BLEU scores for the simple summary were high across all models, with Google Translate achieving the highest score of 0.7719, followed closely by Llama (0.7517) and Microsoft Bing Translator (0.7464). CHR-F and METEOR scores also reflected strong alignment, with Google Translate again leading in METEOR score (0.5908).</p><p>However, for the complex summary, BLEU scores dipped across all models (eg, GPT declined from 0.7188 to 0.6857), although the degradation for Vietnamese was relatively minor, suggesting good resilience for Vietnamese translation. A dramatic drop was observed in Chinese performance. For instance, Gemma&#x2019;s BLEU score fell from 0.5227 (simple) to 0.2466 (complex). CHR-F and METEOR scores showed a similar drop, with CHR-F scores dropping from 0.4392 to 0.2576 and METEOR scores dropping from 0.5678 to 0.3394. 
This suggests a struggle in translating complex, technical content due to syntactic and terminological challenges.</p><p>In Arabic, an interesting reverse trend emerged. For instance, Google Translate&#x2019;s BLEU score increased from 0.6528 (simple) to 0.6787 (complex). The METEOR score also jumped significantly, from 0.3399 to 0.4988. This may be due to Arabic&#x2019;s rich morphology, where longer, more context-rich sentences provide better clues for disambiguation and grammatical accuracy.</p></sec><sec id="s3-3"><title>LLMs vs Traditional MT Tools</title><p>Across all languages, traditional MT tools (Google Translate and Microsoft Bing Translator) generally outperformed LLMs such as GPT-4o, Llama, and Gemma on standard metrics, particularly for complex summaries.</p><p>For example, in Arabic complex summaries, Google Translate achieved a BLEU score of 0.6787 and a METEOR score of 0.4988, outperforming GPT (BLEU: 0.6300; METEOR: 0.3343), Llama, and Gemma. Similar trends were seen in CHR-F, where Google Translate scored 0.5907, the highest among all systems.</p><p>This reflects the focus of traditional MT tools on token-level alignment and training objectives that favor metrics such as BLEU and CHR-F, rewarding close n-gram matches. On the other hand, LLMs often prioritize fluency and coherence, producing paraphrased outputs that, while semantically accurate, may score lower due to structural differences.</p><p>However, Llama outperformed GPT-4o in Vietnamese and Chinese METEOR scores, and Gemma did the same except for Vietnamese simple summaries. 
For example, in Vietnamese simple summaries, Llama outperformed GPT with a METEOR score of 0.5894 vs 0.5566, suggesting that specific LLMs may better capture semantic nuances in certain linguistic contexts.</p></sec><sec id="s3-4"><title>Language-Specific Performance Trends</title><p>The 3 target languages demonstrated distinct translation characteristics in response to summary complexity and model type:</p><list list-type="bullet"><list-item><p>Arabic: translation quality improved with complex summaries across most models. For instance, GPT&#x2019;s CHR-F score went from 0.5064 (simple) to 0.4957 (complex)&#x2014;a small drop, but METEOR scores rose from 0.3166 to 0.3343 and other models (such as Llama and Gemma) saw larger METEOR score gains. This supports the hypothesis that longer, more redundant input helps Arabic models resolve morphological ambiguities.</p></list-item><list-item><p>Chinese: performance dropped sharply from simple to complex summaries. BLEU scores for Gemma fell from 0.5227 to 0.2466 and METEOR fell from 0.5678 to 0.3394. This suggests transliteration issues, syntactic mismatches, and rare term handling present challenges in Chinese clinical text translation. Similar challenges in Chinese medical translation have been reported in previous studies, where syntactic ambiguity, segmentation issues, and limited domain-specific parallel corpora reduce translation accuracy [<xref ref-type="bibr" rid="ref21">21</xref>].</p></list-item><list-item><p>Vietnamese: scores declined modestly from simple to complex, showing the least performance degradation. For example, Llama&#x2019;s BLEU score only dropped from 0.7517 to 0.7441 and CHR-F score actually increased from 0.5979 to 0.6055. 
This suggests that Vietnamese translation is robust, possibly due to better multilingual representation in modern models.</p></list-item></list></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This pilot feasibility study compared LLMs and traditional MT tools for translating medical consultation summaries into Arabic, Chinese, and Vietnamese. Translation performance varied substantially by language, document complexity, and system type.</p><p>Overall, traditional MT tools achieved higher scores on surface-level metrics such as BLEU and CHR-F, particularly for complex, clinician-oriented summaries, reflecting optimization for lexical fidelity. In contrast, LLMs demonstrated relative strengths in semantic similarity in selected language-summary combinations, as reflected by METEOR scores, particularly for Vietnamese and Chinese simple summaries. These patterns suggest that while traditional MT tools may better preserve surface form, LLMs may capture broader semantic intent in some contexts.</p><p>Language-specific trends further illustrate the complexity of multilingual medical translation. Chinese translations showed a marked decline in performance for complex summaries, highlighting challenges related to segmentation and specialized terminology. Vietnamese translations were comparatively robust across summary complexity, while Arabic translations improved for more complex input, potentially due to increased contextual redundancy supporting disambiguation in morphologically rich language structures. Importantly, metric-based performance alone does not indicate clinical safety or appropriateness for health care use.</p></sec><sec id="s4-2"><title>Implications</title><p>These findings should be interpreted in light of how translation quality is evaluated and how AI translation tools are used in real-world health care settings. 
Differences observed between systems reflect not only model capability but also the interaction between evaluation metrics, linguistic structure, and user behavior.</p><p>Automatic translation metrics such as BLEU and CHR-F prioritize surface-level similarity and therefore favor systems optimized for n-gram alignment. METEOR partially accounts for paraphrasing but remains unable to capture clinical salience, pragmatic intent, or contextual appropriateness. As a result, metric differences should not be interpreted as indicators of clinical safety. In health care communication, mistranslating a medication name or dosage carries far greater risk than stylistic variation, yet current metrics penalize both equally [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. This reflects a broader challenge in evaluating AI-generated medical translations.</p><p>Differences between LLMs and traditional MT tools further illustrate this issue. Traditional MT systems generally achieved higher surface-level scores, while LLMs often produced more fluent and paraphrased outputs that diverged structurally from professional reference translations. Reliance on automatic metrics alone may therefore underestimate communicative strengths while simultaneously obscuring clinically meaningful errors that require expert judgment to identify.</p><p>The observed language-specific patterns underscore the need for language-aware evaluation approaches. Performance differences across Arabic, Chinese, and Vietnamese suggest that linguistic structure, morphology, and contextual density influence how effectively models disambiguate clinical meaning. These findings caution against assuming uniform translation performance across languages in health care contexts.</p><p>Finally, the results must be interpreted in the context of real-world use. 
The workflows evaluated in this study intentionally reflect nonexpert use, where patients or carers rely on publicly accessible tools without structured prompts, terminology constraints, or professional oversight. Translation quality in LLMs is known to be prompt sensitive [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>], and professional translation workflows increasingly involve human postediting of machine-generated drafts. Accordingly, the findings represent feasibility and risk under typical lay-user conditions rather than upper-bound system capability.</p></sec><sec id="s4-3"><title>Limitations and Future Directions</title><p>This study has several limitations that define its methodological scope and directly inform future research priorities.</p><p>First, the experimental design reflects lay-user translation behavior rather than professional or clinical workflows. We used a single minimal prompt for LLMs and default web interfaces for MT tools, without terminology constraints, glossary enforcement, abbreviation expansion, or structured output formats. While such safeguards are standard in clinical translation services, they are rarely applied by nonexpert users. Future work should systematically compare lay-user workflows with clinically realistic translation pipelines to distinguish model capability from misuse-related risk.</p><p>Second, no human expert adjudication was conducted. Without clinician or professional translator review, it is not possible to classify error types, assess severity, or determine potential impacts on patient understanding or clinical decision-making [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. 
Future studies should incorporate structured human-in-the-loop evaluation to identify safety-critical errors.</p><p>Third, the study is deliberately small in scale, using 2 simulated consultation summaries representing limited clinical contexts. This precludes generalization across specialties, documentation styles, or real-world variability. Larger multidomain datasets are required to assess generalizability.</p><p>Fourth, statistical reporting is limited to point estimates of automatic metrics. CIs, uncertainty estimation, and robustness analyses were not performed due to the small sample size. Future evaluations should adopt statistically robust analysis frameworks while avoiding overinterpretation of metric differences. While some studies indicate that LLMs themselves can be used as evaluators to grade the quality of translation, the research is still at a very early stage and lacks a comprehensive understanding of the limitations [<xref ref-type="bibr" rid="ref35">35</xref>].</p><p>Finally, existing automatic metrics are poorly aligned with clinical risk and cannot distinguish stylistic variation from safety-critical errors. 
There is a clear need for translation-specific clinical evaluation frameworks that integrate human judgment and explicitly assess dimensions such as accuracy of critical entities, hallucination rate, trustworthiness, and interpretability in multilingual health care contexts [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref37">37</xref>].</p><p>Together, these limitations and future directions underscore that the primary contribution of this study is not to establish clinical readiness but to clarify key risks, evaluation gaps, and methodological challenges that must be addressed before AI-assisted medical translation can be responsibly used in health care communication.</p></sec><sec id="s4-4"><title>Conclusion</title><p>This pilot feasibility study provides an early, structured comparison of LLMs and traditional MT tools for translating medical consultation summaries into Arabic, Chinese, and Vietnamese within a palliative care context. The findings highlight that while both system types can produce translations that appear usable under automatic evaluation metrics, performance varies substantially by language, document complexity, and evaluation method. Importantly, metric-based similarity does not equate to clinical safety, and unstructured use of AI translation tools&#x2014;particularly by nonexpert users&#x2014;poses tangible risks in health care communication.</p><p>This pilot feasibility study does not assess clinical safety or readiness for deployment. Instead, it clarifies key methodological challenges and evaluation gaps that must be addressed before AI-assisted medical translation can be responsibly integrated into health care practice. 
Human oversight and clinically grounded evaluation remain essential.</p></sec></sec></body><back><ack><p>Generative artificial intelligence tool ChatGPT by OpenAI was used to suggest language improvements within the manuscript.</p></ack><notes><sec><title>Funding</title><p>The authors declared no financial support was received for this work.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">BLEU</term><def><p>Bilingual Evaluation Understudy</p></def></def-item><def-item><term id="abb3">CHR-F</term><def><p>Character-level F-score</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">METEOR</term><def><p>Metric for Evaluation of Translation with Explicit Ordering</p></def></def-item><def-item><term id="abb6">MT</term><def><p>machine translation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>Google Translate</article-title><source>Google</source><access-date>2026-02-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://translate.google.com/?hl=bn&#x0026;sl=auto&#x0026;tl=bn&#x0026;op=translate">https://translate.google.com/?hl=bn&#x0026;sl=auto&#x0026;tl=bn&#x0026;op=translate</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><article-title>Bing Translator</article-title><source>Microsoft</source><access-date>2026-02-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.microsoft.com/en-us/bing/features/translator/?form=MA13FV">https://www.microsoft.com/en-us/bing/features/translator/?form=MA13FV</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation 
citation-type="web"><article-title>DeepL Translate</article-title><source>DeepL</source><access-date>2026-02-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.deepl.com/en/translator">https://www.deepl.com/en/translator</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Costa-Juss&#x00E0;</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Farr&#x00FA;s</surname><given-names>M</given-names> </name><name name-style="western"><surname>Serrano Pons</surname><given-names>J</given-names> </name></person-group><article-title>Machine translation in medicine: a quality analysis of statistical machine translation in the medical domain</article-title><access-date>2026-02-27</access-date><conf-name>1st Virtual International Conference on Advanced Research in Scientific Fields (ARSA-2012)</conf-name><conf-date>Dec 3-7, 2012</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://repositori.upf.edu/items/d591266b-d0aa-4bcf-817b-f34391b07fab">https://repositori.upf.edu/items/d591266b-d0aa-4bcf-817b-f34391b07fab</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>M&#x00FC;ller</surname><given-names>F</given-names> </name><name name-style="western"><surname>Schr&#x00F6;der</surname><given-names>D</given-names> </name><name name-style="western"><surname>Noack</surname><given-names>EM</given-names> </name></person-group><article-title>Overcoming language barriers in paramedic care with an app designed to improve communication with foreign-language patients: nonrandomized controlled pilot study</article-title><source>JMIR Form Res</source><year>2023</year><month>03</month><day>23</day><volume>7</volume><fpage>e43255</fpage><pub-id 
pub-id-type="doi">10.2196/43255</pub-id><pub-id pub-id-type="medline">36951895</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Olsavszky</surname><given-names>V</given-names> </name><name name-style="western"><surname>Bazari</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dai</surname><given-names>TB</given-names> </name><etal/></person-group><article-title>Digital translation platform (translatly) to overcome communication barriers in clinical care: pilot study</article-title><source>JMIR Form Res</source><year>2025</year><month>03</month><day>14</day><volume>9</volume><fpage>e63095</fpage><pub-id pub-id-type="doi">10.2196/63095</pub-id><pub-id pub-id-type="medline">39451122</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="web"><article-title>Hello GPT&#x2011;4o</article-title><source>OpenAI</source><access-date>2026-02-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/hello-gpt-4o/">https://openai.com/index/hello-gpt-4o/</ext-link></comment></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="web"><article-title>Gemma</article-title><source>Google DeepMind</source><access-date>2026-02-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://deepmind.google/models/gemma/">https://deepmind.google/models/gemma/</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="web"><article-title>Open-source AI models for any application | Llama 3</article-title><source>Meta</source><access-date>2026-03-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.llama.com/models/llama-3/">https://www.llama.com/models/llama-3/</ext-link></comment></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Soto-Ch&#x00E1;vez</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Bustos</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Fern&#x00E1;ndez-&#x00C1;vila</surname><given-names>DG</given-names> </name><name name-style="western"><surname>Mu&#x00F1;oz</surname><given-names>OM</given-names> </name></person-group><article-title>Evaluation of information provided to patients by ChatGPT about chronic diseases in Spanish language</article-title><source>Digit Health</source><year>2024</year><month>01</month><day>2</day><volume>10</volume><fpage>20552076231224603</fpage><pub-id pub-id-type="doi">10.1177/20552076231224603</pub-id><pub-id pub-id-type="medline">38188865</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Albogami</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Alfakhri</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alaqil</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Safety and quality of AI chatbots for drug-related inquiries: a real-world comparison with licensed pharmacists</article-title><source>Digit Health</source><year>2024</year><month>05</month><day>15</day><volume>10</volume><fpage>20552076241253523</fpage><pub-id pub-id-type="doi">10.1177/20552076241253523</pub-id><pub-id pub-id-type="medline">38757086</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Heo</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Park</surname><given-names>JH</given-names> </name></person-group><article-title>Performance of ChatGPT on the National Korean Occupational Therapy Licensing Examination</article-title><source>Digit Health</source><year>2024</year><month>02</month><day>29</day><volume>10</volume><fpage>20552076241236635</fpage><pub-id pub-id-type="doi">10.1177/20552076241236635</pub-id><pub-id pub-id-type="medline">38434792</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>Y</given-names> </name></person-group><article-title>Reliability of ChatGPT for performing triage task in the emergency department using the Korean Triage and Acuity Scale</article-title><source>Digit Health</source><year>2024</year><month>01</month><day>17</day><volume>10</volume><fpage>20552076241227132</fpage><pub-id pub-id-type="doi">10.1177/20552076241227132</pub-id><pub-id pub-id-type="medline">38250148</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ting</surname><given-names>YT</given-names> </name><name name-style="western"><surname>Hsieh</surname><given-names>TC</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>YF</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT incorporated chain-of-thought method in bilingual nuclear medicine physician board examinations</article-title><source>Digit 
Health</source><year>2024</year><month>01</month><day>5</day><volume>10</volume><fpage>20552076231224074</fpage><pub-id pub-id-type="doi">10.1177/20552076231224074</pub-id><pub-id pub-id-type="medline">38188855</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Hsiao</surname><given-names>HJ</given-names> </name><name name-style="western"><surname>Yeh</surname><given-names>PC</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>KC</given-names> </name><name name-style="western"><surname>Kao</surname><given-names>CH</given-names> </name></person-group><article-title>Performance of ChatGPT on Stage 1 of the Taiwanese medical licensing exam</article-title><source>Digit Health</source><year>2024</year><month>02</month><day>16</day><volume>10</volume><fpage>20552076241233144</fpage><pub-id pub-id-type="doi">10.1177/20552076241233144</pub-id><pub-id pub-id-type="medline">38371244</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alam</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sohail</surname><given-names>SS</given-names> </name></person-group><article-title>Integrating ChatGPT: enhancing postpartum mental healthcare with artificial intelligence (AI) support</article-title><source>Digit Health</source><year>2024</year><month>12</month><day>8</day><volume>10</volume><fpage>20552076241295565</fpage><pub-id pub-id-type="doi">10.1177/20552076241295565</pub-id><pub-id pub-id-type="medline">39655059</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lyu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ji</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Document-level machine translation with large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 5, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2304.02210</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kocmi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Federmann</surname><given-names>C</given-names> </name></person-group><article-title>Large language models are state-of-the-art evaluators of translation quality</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 28, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2302.14520</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wassie</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Molaei</surname><given-names>M</given-names> </name><name name-style="western"><surname>Moslem</surname><given-names>Y</given-names> </name></person-group><article-title>Domain-specific translation with open-source large language models: resource-oriented analysis</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 8, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2412.05862</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Rios</surname><given-names>M</given-names> 
</name></person-group><article-title>Instruction-tuned large language models for machine translation in the medical domain</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 29, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2408.16440</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Cardey</surname><given-names>S</given-names> </name><name name-style="western"><surname>Greenfield</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>X</given-names> </name></person-group><article-title>Designing a controlled language for the machine translation of medical protocols: the case of English to Chinese</article-title><access-date>2026-02-27</access-date><conf-name>6th Conference of the Association for Machine Translation in the Americas</conf-name><conf-date>Sep 28 to Oct 2, 2004</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://link.springer.com/chapter/10.1007/978-3-540-30194-3_5">https://link.springer.com/chapter/10.1007/978-3-540-30194-3_5</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><article-title>Cultural diversity of Australia</article-title><source>Australian Bureau of Statistics</source><year>2022</year><access-date>2026-02-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.abs.gov.au/articles/cultural-diversity-australia">https://www.abs.gov.au/articles/cultural-diversity-australia</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>W</given-names> </name><name 
name-style="western"><surname>Hoda</surname><given-names>R</given-names> </name><name name-style="western"><surname>Li</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bain</surname><given-names>C</given-names> </name><name name-style="western"><surname>Poon</surname><given-names>P</given-names> </name></person-group><article-title>Exploring the opportunities of large language models for summarizing palliative care consultations: a pilot comparative study</article-title><source>Digit Health</source><year>2024</year><month>11</month><day>20</day><volume>10</volume><fpage>20552076241293932</fpage><pub-id pub-id-type="doi">10.1177/20552076241293932</pub-id><pub-id pub-id-type="medline">39569395</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Papineni</surname><given-names>K</given-names> </name><name name-style="western"><surname>Roukos</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ward</surname><given-names>T</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>WJ</given-names> </name></person-group><article-title>BLEU: a method for automatic evaluation of machine translation</article-title><conf-name>ACL &#x2019;02: Proceedings of the 40th Annual Meeting on Association for Computational Linguistics</conf-name><conf-date>Jul 7-12, 2002</conf-date><pub-id pub-id-type="doi">10.3115/1073083.1073135</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Popovi&#x0107;</surname><given-names>M</given-names> </name></person-group><article-title>chrF: character n-gram F-score for automatic MT evaluation</article-title><conf-name>Proceedings of the Tenth Workshop on Statistical Machine Translation</conf-name><conf-date>Sep 17-18, 
2015</conf-date><pub-id pub-id-type="doi">10.18653/v1/W15-3049</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Banerjee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lavie</surname><given-names>A</given-names> </name></person-group><article-title>METEOR: an automatic metric for MT evaluation with improved correlation with human judgments</article-title><access-date>2026-02-28</access-date><conf-name>ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization</conf-name><conf-date>Jun 29, 2005</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/W05-0909/">https://aclanthology.org/W05-0909/</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Baek</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ki</surname><given-names>D</given-names> </name><name name-style="western"><surname>Park</surname><given-names>C</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>HG</given-names> </name><name name-style="western"><surname>Choo</surname><given-names>J</given-names> </name></person-group><article-title>Towards accurate translation via semantically appropriate application of lexical constraints</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 21, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2306.12089</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Du&#x0161;ek</surname><given-names>O</given-names> </name><name name-style="western"><surname>Haji&#x010D;</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hlav&#x00E1;&#x010D;ov&#x00E1;</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Machine translation of medical texts in the Khresmoi project</article-title><access-date>2026-02-28</access-date><conf-name>Proceedings of the Ninth Workshop on Statistical Machine Translation</conf-name><conf-date>Jun 26-27, 2014</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/W14-3326.pdf">https://aclanthology.org/W14-3326.pdf</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Elshin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Karpachev</surname><given-names>N</given-names> </name><name name-style="western"><surname>Gruzdev</surname><given-names>B</given-names> </name><etal/></person-group><article-title>From general LLM to translation: how we dramatically improve translation quality using human evaluation data for LLM finetuning</article-title><conf-name>Proceedings of the Ninth Conference on Machine Translation</conf-name><conf-date>Nov 15-16, 2024</conf-date><pub-id pub-id-type="doi">10.18653/v1/2024.wmt-1.17</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vieira</surname><given-names>LN</given-names> </name><name name-style="western"><surname>O&#x2019;Hagan</surname><given-names>M</given-names> </name><name name-style="western"><surname>O&#x2019;Sullivan</surname><given-names>C</given-names> </name></person-group><article-title>Understanding the societal impacts of machine translation: a critical 
review of the literature on medical and legal use cases</article-title><source>Inf Commun Soc</source><year>2021</year><volume>24</volume><issue>11</issue><fpage>1515</fpage><lpage>1532</lpage><pub-id pub-id-type="doi">10.1080/1369118X.2020.1776370</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ismayilli</surname><given-names>TM</given-names> </name></person-group><article-title>Navigating complexities in medical text translation: challenges, strategies, and solutions</article-title><source>Acta Glob Humanit Linguar</source><year>2024</year><volume>1</volume><issue>2</issue><fpage>170</fpage><lpage>176</lpage><pub-id pub-id-type="doi">10.69760/aghel.01024080</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Haddow</surname><given-names>B</given-names> </name><name name-style="western"><surname>Birch</surname><given-names>A</given-names> </name></person-group><article-title>Prompting large language model for machine translation: a case study</article-title><conf-name>ICML&#x2019;23: Proceedings of the 40th International Conference on Machine Learning</conf-name><conf-date>Jul 23-29, 2023</conf-date><pub-id pub-id-type="doi">10.5555/3618408.3620130</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Garcia</surname><given-names>X</given-names> </name><name name-style="western"><surname>Bansal</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Cherry</surname><given-names>C</given-names> </name><etal/></person-group><article-title>The unreasonable effectiveness of few-shot 
learning for machine translation</article-title><conf-name>ICML&#x2019;23: Proceedings of the 40th International Conference on Machine Learning</conf-name><conf-date>Jul 23-29, 2023</conf-date><pub-id pub-id-type="doi">10.5555/3618408.3618846</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eshbo&#x2019;riyevich</surname><given-names>AY</given-names> </name></person-group><article-title>Problems of translation of medical terms</article-title><source>International Conference on Linguistics, Literature and Translation (London)</source><year>2024</year><access-date>2026-02-28</access-date><volume>2</volume><fpage>1</fpage><lpage>5</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://top-conferences.us/index.php/ICLLT/article/view/226">https://top-conferences.us/index.php/ICLLT/article/view/226</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Croxford</surname><given-names>E</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>First</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Evaluating clinical AI summaries with large language models as judges</article-title><source>NPJ Digit Med</source><year>2025</year><month>11</month><day>5</day><volume>8</volume><issue>1</issue><fpage>640</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-02005-2</pub-id><pub-id pub-id-type="medline">41193667</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abbasian</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Khatibi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Azimi</surname><given-names>I</given-names> </name><etal/></person-group><article-title>Foundation metrics for evaluating effectiveness of healthcare conversations powered by generative AI</article-title><source>NPJ Digit Med</source><year>2024</year><month>03</month><day>29</day><volume>7</volume><issue>1</issue><fpage>82</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01074-z</pub-id><pub-id pub-id-type="medline">38553625</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Asgari</surname><given-names>E</given-names> </name><name name-style="western"><surname>Monta&#x00F1;a-Brown</surname><given-names>N</given-names> </name><name name-style="western"><surname>Dubois</surname><given-names>M</given-names> </name><etal/></person-group><article-title>A framework to assess clinical safety and hallucination rates of LLMs for medical text summarisation</article-title><source>NPJ Digit Med</source><year>2025</year><month>05</month><day>13</day><volume>8</volume><issue>1</issue><fpage>274</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01670-7</pub-id><pub-id pub-id-type="medline">40360677</pub-id></nlm-citation></ref></ref-list></back></article>