<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="case-report"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e72604</article-id><article-id pub-id-type="doi">10.2196/72604</article-id><article-categories><subj-group subj-group-type="heading"><subject>Case Report</subject></subj-group></article-categories><title-group><article-title>Korean Medical Consultation With Open-Weight Large Language Models: Pilot Comparative Evaluation of Retrieval-Augmented Generation With Metadata Filtering</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Choi</surname><given-names>Saeyoun</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kim</surname><given-names>Donghyun</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jeon</surname><given-names>Ji-Hwan</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Kim</surname><given-names>Minji</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Dong Hun</given-names></name><degrees>MPH</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ahn</surname><given-names>DaeHwan</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Eu Sun</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kim</surname><given-names>Yoon Ji</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Youk</surname><given-names>Hyun</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="aff" rid="aff8">8</xref></contrib></contrib-group><aff id="aff1"><institution>MAIN Corp</institution><addr-line>1 Gangwon-daehak-gil, Room 1201 (Bodeum-gwan)</addr-line><addr-line>Chuncheon City</addr-line><addr-line>Gangwon Province</addr-line><country>Republic of Korea</country></aff><aff id="aff2"><institution>Department of Industrial and Systems Engineering, Dongguk University</institution><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><aff id="aff3"><institution>InVisionLab Inc</institution><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><aff id="aff4"><institution>Department of Healthcare Management, Gachon 
University</institution><addr-line>Seongnam</addr-line><country>Republic of Korea</country></aff><aff id="aff5"><institution>KOI Healthcare Co, Ltd</institution><addr-line>Seongnam</addr-line><country>Republic of Korea</country></aff><aff id="aff6"><institution>Department of Preventive Medicine, College of Medicine, University of Ulsan</institution><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><aff id="aff7"><institution>Digital Health Laboratory, Wonju College of Medicine, Yonsei University</institution><addr-line>Wonju</addr-line><country>Republic of Korea</country></aff><aff id="aff8"><institution>Regional Trauma Center, Wonju Severance Christian Hospital</institution><addr-line>Wonju</addr-line><country>Republic of Korea</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Acharya</surname><given-names>Nirajan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Mohanadas</surname><given-names>Sadhasivam</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Hyun Youk, MD, MAIN Corp, 1 Gangwon-daehak-gil, Room 1201 (Bodeum-gwan), Chuncheon City, Gangwon Province, 24341, Republic of Korea, 82 10-9840-2120; <email>ceo@gkmain.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>30</day><month>4</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e72604</elocation-id><history><date date-type="received"><day>15</day><month>05</month><year>2025</year></date><date date-type="rev-recd"><day>01</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>02</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Saeyoun Choi, 
Donghyun Kim, Ji-Hwan Jeon, Minji Kim, Dong Hun Lee, DaeHwan Ahn, Eu Sun Lee, Yoon Ji Kim, Hyun Youk. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 30.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e72604"/><abstract><sec><title>Background</title><p>This study develops an open-source large language model&#x2013;based chatbot tailored for Korean health consultations. The chatbot was implemented using the retrieval-augmented generation (RAG) technique alongside metadata filtering to enhance its performance.</p></sec><sec><title>Objective</title><p>This study aims to analyze and compare the performance of a RAG-based chatbot with other leading language models in the context of Korean health consultations.</p></sec><sec sec-type="methods"><title>Methods</title><p>A 10.4 GB Korean medical document corpus (487,277 segments) was constructed from official websites of major Korean hospitals, public health sources, and medical textbooks. 
This study quantitatively compared 5 open-source large language models (Qwen3:4B, Mistral:7B, Llama-3.1:8B, Gpt-Oss:20B, and Gemma3:27B) in 3 configurations: baseline (model only), RAG-only, and RAG with metadata filtering. The RAG system used a specialized Korean embedding model (upskyy/bge-m3-korean) and an Elasticsearch store. Performance was assessed by an emergency medicine specialist using a validation set of 226 questions across 7 common diseases and scoring responses based on accuracy, safety, and helpfulness.</p></sec><sec sec-type="results"><title>Results</title><p>The application of RAG alone failed to yield statistically significant performance improvements and, in some cases (Llama-3.1:8B and Gemma3:27B), resulted in decreased scores. However, the combination of RAG with metadata filtering yielded statistically significant (<italic>P</italic>&#x003C;.05) performance increases in most models. Notably, the average score for Mistral:7B increased from 3.79, SD 0.08, to 4.10, SD 0.10, and Gpt-Oss:20B increased from 4.43, SD 0.05, to 4.51, SD 0.04, with the latter achieving the highest safety score (4.61, SD 0.03). The Gemma3:27B model, which possessed a high baseline performance (4.42, SD 0.03), was an exception, exhibiting no significant improvement (<italic>P</italic>=.14) even with filtering.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The effectiveness of RAG for specialized domains such as Korean medical consultation is highly dependent on a metadata filtering process that controls the quality of retrieved information; simple information augmentation is insufficient. Furthermore, the benefit of RAG is limited when a model&#x2019;s intrinsic knowledge (eg, Gemma3:27B) already meets or exceeds the quality of the external knowledge base. 
This finding indicates that performance enhancement strategies must account for both the retrieval mechanism&#x2019;s quality and the model&#x2019;s preexisting capabilities.</p></sec></abstract><kwd-group><kwd>large language model</kwd><kwd>LLM</kwd><kwd>retrieval-augmented generation</kwd><kwd>RAG</kwd><kwd>metadata filtering</kwd><kwd>health chatbot</kwd><kwd>Korean health care</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>With the rapid advancement of artificial intelligence (AI), medical chatbots are playing an increasingly pivotal role in providing personalized health care consultations and managing patient data [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. However, despite the overall progress of large language models (LLMs), a performance gap between English and Korean persists, as technical breakthroughs in high-resource languages do not inherently transfer to midresource languages such as Korean [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. These limitations act as structural constraints when directly applying general-purpose LLMs to the Korean health care environment, particularly in their failure to sufficiently reflect local medical systems, legal regulations, and ethical standards [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Concurrently, South Korea&#x2019;s medical ethics, guidelines, and legal norms are closely integrated with highly standardized prescription and treatment protocols, which are intrinsically linked to the National Health Insurance reimbursement system, licensing requirements, and clinical guidelines from professional societies [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. 
Consequently, standardized procedures and criteria often carry more weight than individual clinical discretion [<xref ref-type="bibr" rid="ref9">9</xref>]. As a result, recommendations from general-purpose GPT-based models&#x2014;primarily trained on United States or European guidelines&#x2014;may conflict with Korean regulations, potentially suggesting interventions that are prohibited, not covered by insurance, or legally sensitive [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. Such discrepancies pose significant risks, including information errors, misdiagnosis, and institutional misalignment, thereby compromising both patient safety and legal liability [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. This misalignment represents a substantial barrier, revealing the structural limitations of applying general LLMs directly to the Korean health care context.</p><p>Furthermore, Korean medical inquiries are characterized by patients describing their symptoms, progress, and concerns in detailed narrative forms [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. A primary challenge for medical chatbots is accurately extracting clinically essential information from such verbose text and matching it with the appropriate Korean medical guideline documents [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. Specifically, as Korean is an agglutinative language with high morphological complexity&#x2014;where medical vocabulary consists of a mixture of Hanja, loanwords, and native Korean terms&#x2014;specialized text processing and retrieval technologies optimized for the Korean language are required [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>].</p><p>Current LLM-based medical chatbots, including commercial models, primarily rely on pretrained internal knowledge. 
This often leads to &#x201C;hallucinations&#x201D;&#x2014;generating factually incorrect information&#x2014;and a failure to ensure reliability based on the latest domain-specific evidence [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. Specifically, when consultation content pertains directly to a specific country&#x2019;s insurance standards, local medical systems, or locally used products and ingredients, simple learning-based response mechanisms struggle to guarantee the precision and up-to-dateness required for medical safety [<xref ref-type="bibr" rid="ref10">10</xref>]. To address these limitations, the ecosystem has recently shifted toward state-of-the-art open-source LLMs such as Llama 3 (Meta), Mistral (Mistral AI), and Gemma (Google LLC), which allow for higher levels of customization, including on-premises deployment, domain-specific fine-tuning, and enhanced data privacy [<xref ref-type="bibr" rid="ref23">23</xref>]. Nevertheless, even high-performance open-source models require architectural reinforcement, such as systematic integration with external knowledge, to ensure accuracy and safety in the medical domain. In this context, retrieval-augmented generation (RAG) is emerging as a promising solution [<xref ref-type="bibr" rid="ref24">24</xref>]. By using dense retrieval techniques and Korean-specific embedding models, RAG enables chatbots to access refined medical databases, allowing this study to focus on generating fact-based responses grounded in prevalidated medical literature, guidelines, and local regulations [<xref ref-type="bibr" rid="ref25">25</xref>].</p><p>However, RAG alone faces limitations in sufficiently processing the unique characteristics of Korean medical inquiries. 
As these inquiries are often long and narrative, with the patient&#x2019;s core intent frequently obscured by peripheral context (such as emotions or treatment experiences), simple vector similarity-based retrieval often fails to select the most appropriate evidence documents [<xref ref-type="bibr" rid="ref26">26</xref>]. As a sophisticated enhancement to overcome this, metadata filtering has been proposed. This approach involves the LLM extracting structured metadata&#x2014;such as clinical department, disease category, age group, and specific drugs or procedures&#x2014;from the query and filtering the database accordingly before the retrieval stage, thereby narrowing the search space [<xref ref-type="bibr" rid="ref27">27</xref>]. This ensures that the system performs retrieval and generation only on the most relevant subsets of documents, which significantly improves response accuracy and consistency, especially for complex, multistep medical questions [<xref ref-type="bibr" rid="ref28">28</xref>]. Despite these developments, there is a lack of empirical research systematically comparing and evaluating the performance of various open-source LLM architectures that combine RAG with metadata filtering within the Korean medical context [<xref ref-type="bibr" rid="ref29">29</xref>]. Furthermore, benchmarks and evaluation frameworks that reflect the unique characteristics of Korean medical inquiries are not yet sufficiently established [<xref ref-type="bibr" rid="ref30">30</xref>]. 
We identified a research gap that necessitates an empirical validation of open-source LLM-based architectures integrating RAG and metadata filtering to design safe and reliable medical chatbots tailored to the Korean health care environment.</p></sec><sec id="s1-2"><title>Objectives</title><p>Therefore, we aimed to develop and comparatively evaluate a Korean medical consultation chatbot architecture that integrates RAG with LLM-based metadata filtering to better align responses with Korean clinical information needs and locally grounded evidence. We constructed a Korean medical document corpus (10.4 GB; 487,277 segments) and implemented a RAG pipeline using a Korean-specialized embedding model with an Elasticsearch-based retriever, augmented by metadata filtering that extracts structured clinical cues (eg, disease-, symptom-, or drug-related signals) to narrow the retrieval space before generation. We then compared 5 open-weight LLM backbones (Qwen3:4B, Mistral:7B, Llama-3.1:8B, Gpt-Oss:20B, and Gemma3:27B) under 3 configurations&#x2014;baseline (no retrieval), RAG-only, and RAG plus metadata filtering&#x2014;to determine when retrieval and filtering improve Korean health consultation responses. Performance was validated on 226 Korean consultation questions spanning 7 common disease categories, with an emergency medicine specialist rating each response for accuracy, safety, and helpfulness using a predefined scoring protocol.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data Collection</title><p>This study focused on collecting and preprocessing Korean medical consultation data. To ensure a comprehensive dataset, we incorporated a variety of medical fields and consultation scenarios. 
The main data sources included publicly available disease information from the official websites of major Korean hospitals (n=3537; eg, disease encyclopedias accessible via the public health information portal of Asan Medical Center), legally usable open-source textbooks (n=2336), public data from the Ministry of Food and Drug Safety (n=87,635), and health-related knowledge data from major Korean search engines (n=49,681). These datasets were rigorously curated and refined through the supervision of health care professionals among the coauthors.</p><p>The data preprocessing process involved four key steps: (1) cleaning the data by removing unnecessary characters, duplicate entries, and nonmedical content to ensure dataset integrity; (2) ensuring consistency by correcting spelling errors, managing synonyms, and standardizing medical terminology; (3) conducting a rigorous screening process to ensure the complete absence of any personally identifiable information (PII) through a dual-verification approach, combining automated rule-based detection with manual review by medical professionals; and (4) classifying data by disease categories, examination and procedure types, body regions, medical specialties, and consultation types to enhance the efficiency of subsequent analyses. As a result, we constructed a medical document corpus of 10.4 GB, segmented into 487,277 text units for efficient retrieval within the RAG system. An overview of the data collection and management web user interface is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><p>This high-quality Korean medical consultation dataset, constructed through this process, is crucial for developing the RAG-integrated small large language models. This curated dataset includes frequently asked medical questions from the general public and answers based on the clinical expertise of medical professionals. 
By using this data for performance evaluation, we aimed to more accurately assess our service&#x2019;s accuracy, safety, and helpfulness in a real-world medical consultation environment within Korea.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>The database construction process, data count, and web UI screen for database management. We built a 10.4 GB medical document corpus, divided into 487,277 text segments, for efficient searching in the RAG system. RAG: retrieval-augmented generation; UI: user interface.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e72604_fig01.png"/></fig></sec><sec id="s2-2"><title>Implementation of Default LLM</title><p>Concerning the LLM backbone, we built the LLM using Ollama. The models were loaded from the Ollama library, allowing for model selection&#x2014;including Qwen3:4B (Alibaba Group), Mistral:7B (Mistral AI), Llama-3.1:8B (Meta), Gpt-Oss:20B (OpenAI), and Gemma3:27B (Google LLC)&#x2014;depending on the test. The service was run on an NVIDIA A100 (NVIDIA Corp) with 40 GB of video random access memory in an Ubuntu 22.04 (Canonical Ltd) environment to ensure the models could operate at optimal performance. Each model is briefly summarized in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Overview and characteristics of LLMs<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> evaluated in this study. This table summarizes the LLMs<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> compared in this research. 
The overall process is illustrated in <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">LLM</td><td align="left" valign="bottom">Parameter</td><td align="left" valign="bottom">Description</td><td align="left" valign="bottom">Selection reason</td></tr></thead><tbody><tr><td align="left" valign="top">Qwen3:4B</td><td align="left" valign="top">4B</td><td align="left" valign="top">A multilingual LLM developed by China&#x2019;s Alibaba Group that also offers respectable performance in the Korean language.</td><td align="left" valign="top">Chosen to validate the practical service applicability of a small large language model.</td></tr><tr><td align="left" valign="top">Mistral:7B</td><td align="left" valign="top">7B</td><td align="left" valign="top">An LLM developed by France&#x2019;s Mistral AI, renowned for delivering high performance with a relatively small number of parameters.</td><td align="left" valign="top">A widely used small open-source model, often used as a performance benchmark for models in the 7B parameter class.</td></tr><tr><td align="left" valign="top">Llama-3.1:8B</td><td align="left" valign="top">8B</td><td align="left" valign="top">The latest generation open-source model from Meta. 
Widely recognized as a top-performing benchmark in the 8B class and is instruction-tuned.</td><td align="left" valign="top">Selected as a state-of-the-art benchmark for the 8B parameter class, given its widespread adoption and strong performance.</td></tr><tr><td align="left" valign="top">Gpt-Oss:20B</td><td align="left" valign="top">20B</td><td align="left" valign="top">OpenAI&#x2019;s first open-source model to have its weights made public, known for its solid performance.</td><td align="left" valign="top">To evaluate the performance of the latest architectures and verify the on-device applicability of small-scale models at the 20B parameter level.</td></tr><tr><td align="left" valign="top">Gemma-3:27B</td><td align="left" valign="top">27B</td><td align="left" valign="top">Google&#x2019;s latest generation instruction-tuned open-source model. Known for strong performance that competes with larger models.</td><td align="left" valign="top">To evaluate a recent, larger-scale model (27B) and compare its capabilities against the smaller models (4B-20B).</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>LLM: large language model.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Implementation of a chatbot using a specific LLM. The diagram outlines the overall system architecture and workflow applied to develop the Korean health consultation chatbot. LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e72604_fig02.png"/></fig><p>Concerning conversation flow design, in health counseling, responding accurately and appropriately to user inquiries is crucial. Therefore, we carefully structured the conversation flow. 
This includes defining a step-by-step dialogue structure that incorporates initial greetings, symptom inquiries, requests for additional information, and guidance to visit a medical institution in case of an emergency, enabling users to interact smoothly with the chatbot. The chatbot also includes a memory function to remember the context of previous conversations and provide context-aware responses.</p><p>Concerning information extraction, when a user uploads an image of a prescription or medical report, the chatbot analyzes the image to extract medication and disease codes and presents the relevant information in a tabular format. This feature allows users to easily check detailed information about their health status and prescribed medications.</p><p>Concerning RAG, to use the database constructed in the data collection phase, we built an RAG and application programming interface system, enabling the LLM to generate answers based on this data. This approach ensures that the chatbot provides reliable and evidence-based information, consequently delivering more accurate and trustworthy medical responses to the user. A more detailed explanation of RAG will be provided in the following section.</p></sec><sec id="s2-3"><title>About RAG</title><p>Concerning Korean embedding, in this study, we adopted the upskyy/BGE-M3-Korean embedding model [<xref ref-type="bibr" rid="ref31">31</xref>], which is specialized for the Korean language, to maximize the accuracy and efficiency of information retrieval. The key reason for selecting this model is that BGE-M3 possesses a &#x201C;multigranularity&#x201D; feature, making it specialized for Korean embeddings. This means it can process texts of various lengths&#x2014;from short user queries to long, specialized medical documents&#x2014;with consistently high performance. 
Furthermore, this model is fine-tuned on Korean data, enabling it to accurately capture complex Korean medical terminology and the subtle semantic nuances of user queries.</p><p>Concerning the Elasticsearch store configuration, for efficient information retrieval, we configured an Elasticsearch store, an open-source search engine designed to maximize search performance in large datasets using an inverted index structure [<xref ref-type="bibr" rid="ref32">32</xref>]. In this study, it was integrated with a Korean medical database, allowing for fast responses to user queries. This setup enabled quick searches across large-scale medical data and maximized the performance of the RAG technique. Elasticsearch is particularly effective in text-based query-answering systems, offering fast and reliable search results.</p><p>Concerning retriever configuration, we constructed the retriever by integrating the aforementioned Korean embedding model with the Elasticsearch store. When a user inputs a query, the retriever uses the embedding model to generate a vector representation and searches the Elasticsearch store for the most similar vectors. These retrieved documents are provided as input to our backbone model, which then generates an optimal answer based on that data. Through this process, the system provides information based on reliable evidence rather than offering simple conversational answers. This ensures the chatbot can deliver accurate and trustworthy medical information to the user.</p><p>Concerning metadata filtering, we enhanced search efficiency using metadata filtering technology. Due to the nature of our model, which must use large volumes of health care data such as textbooks and accumulated big data responses, we determined that metadata filtering technology was essential for speed optimization. 
Metadata filtering first recognizes specific disease names or symptoms from the user&#x2019;s query and uses relevant metadata to reconstruct the query more accurately. This improves search accuracy and helps users quickly access the information they want [<xref ref-type="bibr" rid="ref33">33</xref>]. <xref ref-type="table" rid="table2">Table 2</xref> shows examples of keywords that the metadata actually retrieves and the information brought in through them.</p><p><xref ref-type="fig" rid="figure3">Figure 3</xref> depicts the overall workflow, illustrating the conversion of user queries into vector representations via the Korean embedding model for semantic retrieval.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Comparison of data retrieved by the retriever with and without metadata filtering.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Data</td><td align="left" valign="bottom">Korean</td><td align="left" valign="bottom">English</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">Effect of metadata filtering: accurately includes &#x201C;loss of motivation&#x201D; and &#x201C;depressive feelings,&#x201D; which are the user&#x2019;s core symptoms, and the explanation that this causes &#x201C;psychosomatic symptoms.&#x201D;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Query</td><td align="left" valign="top">(&#xC911;&#xB7B5;) &#xCE58;&#xB8CC;&#xB294; &#xB418;&#xC5C8;&#xB294;&#xB370; &#xCCB4;&#xC911;&#xC774; &#xBE60;&#xC838; &#xAC71;&#xC815;&#xB418;&#xC5B4; &#xB300;&#xC7A5; &#xB0B4;&#xC2DC;&#xACBD; &#xAC80;&#xC0AC;&#xB97C; &#xD588;&#xB294;&#xB370; &#xACB0;&#xACFC;&#xB294; &#xC6A9;&#xC885; &#xD558;&#xB098;&#xB3C4; &#xC5C6;&#xC774; &#xAE68;&#xB057;&#xD55C;&#xAC83;&#xC73C;&#xB85C; &#xB098;&#xC654;&#xC2B5;&#xB2C8;&#xB2E4; &#xBB38;&#xC81C;&#xB294; 
&#xCCB4;&#xC911;&#xAC10;&#xC18C;&#xC640; &#xCEE8;&#xB514;&#xC158; &#xB09C;&#xC870;&#xC778;&#xB370; &#xAC74;&#xAC15;&#xC5D0; &#xC0C1;&#xAD00; &#xC5C6;&#xB294;&#xAC74;&#xC9C0;&#xC694;? &#xC57D;&#xAC04;&#xC758; &#xC6B0;&#xC6B8;&#xC99D;&#xACFC; &#xC758;&#xC695;&#xC774; &#xC880; &#xC0C1;&#xC2E4;&#xB41C; &#xAE30;&#xBD84;&#xC785;&#xB2C8;&#xB2E4; (&#xC911;&#xB7B5;)</td><td align="left" valign="top">(Abridged) treated... but worried about weight loss. Colonoscopy was clean... Problem is weight loss and poor condition... Also slight depression and loss of motivation... (abridged)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Extracted filter</td><td align="left" valign="top">&#xCCB4;&#xC911; &#xAC10;&#xC18C;, &#xCEE8;&#xB514;&#xC158; &#xB09C;&#xC870;, &#xC6B0;&#xC6B8;&#xC99D;, &#xC758;&#xC695; &#xC0C1;&#xC2E4;, &#xAC74;&#xAC15;&#xC2DD;&#xD488;</td><td align="left" valign="top">Weight loss, poor condition, depression, loss of motivation, health supplements</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Retriever without metadata filtering</td><td align="left" valign="top">[&#xBC15;&#xB9AC;&#xC131; &#xAC04;&#xC9C8;&#xC131; &#xD3D0;&#xB834;] (&#xC911;&#xB7B5;) &#xC0C1; &#xBD80;&#xC704; &#xC790;&#xCCB4;&#xB3C4; &#xD3D0;&#xD3EC; &#xB300;&#xC2DD;&#xC138;&#xD3EC;&#xB4E4;&#xC758; &#xD3D0;&#xD3EC; &#xB0B4; &#xC0BC;&#xCD9C;&#xC774; &#xC8FC; &#xC18C;&#xACAC;&#xC774;&#xACE0;, &#xAC04;&#xC9C8;&#xC740; &#xAC70;&#xC758; &#xBCC0;&#xD654;&#xAC00; &#xC5C6;&#xAC70;&#xB098; &#xC57D;&#xD558;&#xAC8C; &#xC5FC;&#xC99D; &#xC138;&#xD3EC; &#xCE68;&#xC724;&#xB9CC; &#xAD00;&#xCC30;&#xB41C;&#xB2E4;. &#xC81C;2&#xD615; &#xC0C1;&#xD53C;&#xC138;&#xD3EC;&#xAC00; &#xC99D;&#xAC00;&#xD560; &#xC218;&#xB3C4; &#xC788;&#xB2E4;. (&#xC911;&#xB7B5;)</td><td align="left" valign="top">[Desquamative interstitial pneumonia] (abridged) ...alveolar macrophages... 
interstitial changes are absent or mild... Type 2 epithelial cells may increase... (abridged)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Retriever with metadata filtering</td><td align="left" valign="top">[&#xC6B0;&#xC6B8;&#xC7A5;&#xC560;] (&#xC911;&#xB7B5;) &#xC758;&#xC695; &#xC800;&#xD558;&#xC640; &#xC6B0;&#xC6B8;&#xAC10;&#xC744; &#xC8FC;&#xC694; &#xC99D;&#xC0C1;&#xC73C;&#xB85C; &#xD558;&#xC5EC; &#xB2E4;&#xC591;&#xD55C; &#xC778;&#xC9C0; &#xBC0F; &#xC815;&#xC2E0; &#xC2E0;&#xCCB4;&#xC801; &#xC99D;&#xC0C1;&#xC744; &#xC77C;&#xC73C;&#xCF1C; &#xC77C;&#xC0C1; &#xAE30;&#xB2A5;&#xC758; &#xC800;&#xD558;&#xB97C; &#xAC00;&#xC838;&#xC624;&#xB294; &#xC9C8;&#xD658; &#xC8FC;&#xC758;&#xC0AC;&#xD56D;: &#xC6B0;&#xC6B8;&#xC99D;, &#xC989; &#xC6B0;&#xC6B8;&#xC7A5;&#xC560;&#xB294; &#xC758;&#xC695; &#xC800;&#xD558;&#xC640; &#xC6B0;&#xC6B8;&#xAC10;&#xC744; &#xC8FC;&#xC694; &#xC99D;&#xC0C1;&#xC73C;&#xB85C; &#xD558;&#xC5EC; &#xB2E4;&#xC591;&#xD55C; &#xC778;&#xC9C0; &#xBC0F; &#xC815;&#xC2E0; &#xC2E0;&#xCCB4;&#xC801; &#xC99D;&#xC0C1;&#xC744; &#xC77C;&#xC73C;&#xCF1C; &#xC77C;&#xC0C1; (&#xC911;&#xB7B5;)</td><td align="left" valign="top">[Depressive disorder] (abridged) a disease causing impairment in daily functioning with main symptoms of loss of motivation and depressive mood... Caution: Depression... causes various cognitive and psychosomatic symptoms... 
(abridged)</td></tr><tr><td align="left" valign="top" colspan="3">Effect of metadata filtering: &#x201C;all-out sprint&#x201D; or &#x201C;strong labor intensity,&#x201D; which are the user's key triggers, are directly linked to descriptions of physical overload symptoms such as &#x201C;decreased athletic ability&#x201D; and &#x201C;difficulty breathing.&#x201D;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Query</td><td align="left" valign="top">(&#xC911;&#xB7B5;) &#xC804;&#xB825;&#xC9C8;&#xC8FC; 2~30&#xCD08;.. &#xC0C1;&#xD558;&#xCC28;&#xAC19;&#xC740; &#xBB34;&#xAC70;&#xC6B4;&#xAC78; &#xB9CE;&#xC774; &#xB098;&#xB974;&#xB294; &#xB178;&#xB3D9;&#xAC15;&#xB3C4;&#xAC00; &#xAC15;&#xD55C; &#xC77C;&#xC744; &#xD560;&#xB54C; &#xC77C;&#xC2DC;&#xC801;&#xC73C;&#xB85C; &#xC55E;&#xC774; &#xD558;&#xB098;&#xB3C4; &#xBCF4;&#xC774;&#xC9C0; &#xC54A;&#xAC8C;&#xB418;&#xACE0; &#xADC0;&#xAC00; &#xBA39;&#xBA39;&#xD574;&#xC9C0;&#xB294; &#xC99D;&#xC0C1;&#xC774; &#xB098;&#xD0C0;&#xB098;&#xAC8C; &#xB418;&#xC5C8;&#xC2B5;&#xB2C8;&#xB2E4;.. &#xC2EC;&#xC9C0;&#xC5B4; &#xACFC;&#xC74C; &#xD6C4;&#xC5D0; &#xC911;&#xAC04;&#xC5D0; &#xC790;&#xB2E4; &#xAE6C;&#xC801;&#xB3C4; &#xC788;&#xB294;&#xB370; &#xBB3C;&#xB9C8;&#xC2DC;&#xB7EC;&#xAC00;&#xB2E4; &#xC55E;&#xC774; &#xC548;&#xBCF4;&#xC774;&#xAC8C;&#xB418;&#xC5B4; &#xC4F0;&#xB7EC;&#xC9C0;&#xB4EF; &#xD55C;&#xC801;&#xB3C4; &#xC788;&#xC2B5;&#xB2C8;&#xB2E4;. (&#xC911;&#xB7B5;)</td><td align="left" valign="top">(Abridged) 20-30 seconds of all-out sprinting... heavy labor like loading/unloading... temporarily can't see anything and ears feel clogged... Even after heavy drinking... woke up... couldn't see while going for water and collapsed... 
(abridged)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Extracted filter</td><td align="left" valign="top">&#xC804;&#xB825;&#xC9C8;&#xC8FC;, &#xC0C1;&#xD558;&#xCC28;, &#xB178;&#xB3D9;&#xAC15;&#xB3C4;, &#xC55E;&#xC774; &#xC548;&#xBCF4;&#xC774;&#xAC8C;, &#xADC0;&#xAC00; &#xBA39;&#xBA39;&#xD574;&#xC9C0;&#xB2E4;, &#xACFC;&#xC74C;, &#xD608;&#xC555; &#xB192;&#xB2E4;</td><td align="left" valign="top">All-out sprint, loading/unloading, labor intensity, can&#x2019;t see, ears clogged, heavy drinking, high blood pressure</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Retriever without metadata filtering</td><td align="left" valign="top">[&#xC720;&#xC2A4;&#xD0C0;&#xD0A4;&#xC624;&#xAD00;] (&#xC911;&#xB7B5;) &#xC5D8;&#xB9AC;&#xBCA0;&#xC774;&#xD130;&#xAC00; &#xC0C1;&#xC2B9;&#xD558;&#xAC70;&#xB098; &#xD558;&#xAC15;&#xD560; &#xB54C; &#xADC0;&#xAC00; &#xBA39;&#xBA39;&#xD574;&#xC9C0;&#xB294; &#xAC83;&#xC740; &#xC678;&#xBD80;&#xC640; &#xC911;&#xC774;&#xC758; &#xAE30;&#xC555;&#xC774; &#xB2EC;&#xB77C; &#xACE0;&#xB9C9;&#xC774; &#xBE44;&#xD2C0;&#xC5B4;&#xC9C0;&#xACE0; &#xC720;&#xC2A4;&#xD0C0;&#xD0A4;&#xC624;&#xAD00;&#xC774; &#xB9C9;&#xD788;&#xB294; &#xD604;&#xC0C1; &#xB54C;&#xBB38;&#xC785;&#xB2C8;&#xB2E4;. &#xACE0;&#xB9C9;&#xC740; &#xC678;&#xBD80; &#xD658;&#xACBD;&#xACFC; &#xC2E0;&#xCCB4; &#xB0B4;&#xBD80;&#xC758; &#xACBD;&#xACC4;&#xC5D0; &#xC704;&#xCE58;&#xD55C; &#xACE0;&#xB9C9;&#xC740; &#xAE30;&#xC555; &#xCC28;&#xC774;&#xC5D0; &#xC758;&#xD574; &#xB9C9;&#xC774; &#xD130;&#xC9C0;&#xB294; &#xAC83;&#xC744; &#xBC29;&#xC9C0;&#xD558;&#xAE30; &#xC704;&#xD574; &#xBE44;&#xD2C0;&#xC5B4;&#xC9C0;&#xBA74;&#xC11C; &#xBCF4;&#xD638;&#xD569;&#xB2C8;&#xB2E4;. (&#xC911;&#xB7B5;)</td><td align="left" valign="top">[Eustachian tube] (abridged) ears feeling clogged when an elevator ascends or descends is due to the eustachian tube closing... 
the eardrum twists to protect itself from the pressure difference... (abridged)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Retriever with metadata filtering</td><td align="left" valign="top">[&#xCCA0;&#xACB0;&#xD54D;&#xC131; &#xBE48;&#xD608;] (&#xC911;&#xB7B5;) &#xD53C;&#xB85C;&#xAC10;, &#xBB34;&#xAE30;&#xB825;&#xD568;, &#xC6B4;&#xB3D9;&#xB2A5;&#xB825; &#xC800;&#xD558; &#xB4F1;&#xC774; &#xB098;&#xD0C0;&#xB09C;&#xB2E4;. &#xD608;&#xC561;&#xC774; &#xC0B0;&#xC18C; &#xBD80;&#xC871; &#xC0C1;&#xD0DC;&#xAC00; &#xB418;&#xBA74; &#xC2EC;&#xC7A5;&#xC774;&#xB098; &#xD3D0;&#xAC00; &#xC774;&#xB97C; &#xBCF4;&#xCDA9;&#xD558;&#xAE30; &#xC704;&#xD574; &#xACFC;&#xB3C4;&#xD558;&#xAC8C; &#xD65C;&#xB3D9;&#xD574;&#xC57C; &#xD558;&#xBBC0;&#xB85C; &#xC7A5;&#xAE30;&#xC5D0; &#xBD80;&#xB2F4;&#xC774; &#xAC00;&#xD574;&#xC9C4;&#xB2E4;. &#xC774;&#xC5D0; &#xB530;&#xB77C; &#xC2EC;&#xC7A5;&#xBC15;&#xB3D9;&#xC774; &#xBE68;&#xB77C;&#xC9C0;&#xB294; &#xC2EC;&#xACC4;&#xD56D;&#xC9C4;(palpitation; &#xB450;&#xADFC;&#xAC70;&#xB9BC;)&#xC774;&#xB098;, &#xAC00;&#xC2B4;&#xC758; &#xD1B5;&#xC99D;, &#xD638;&#xD761;&#xACE4;&#xB780; (&#xC911;&#xB7B5;)</td><td align="left" valign="top">[Iron-deficiency anemia] (abridged) fatigue, lethargy, decreased athletic ability, etc., appear. If the blood lacks oxygen, the heart or lungs must overwork... causing palpitation, chest pain, difficulty breathing... 
(abridged)</td></tr><tr><td align="left" valign="top" colspan="3">Effect of metadata filtering: directly matches &#x201D;diabetes,&#x201D; the underlying condition of the mother provided by the user as key background information for the query.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Query</td><td align="left" valign="top">&#xC5B4;&#xBA38;&#xB2C8;&#xAC00; 67&#xC138;&#xC774;&#xC2E0;&#xB370; &#xC218;&#xC561;&#xC744; &#xB9DE;&#xC73C;&#xC168;&#xB294;&#xB370; &#xC218;&#xC561;&#xB9DE;&#xC744;&#xB54C; &#xCE74;&#xD14C;&#xD0C0; &#xB2C8;&#xB4E4;&#xC5D0; &#xD53C;&#xAC00; &#xB098;&#xC654;&#xB294;&#xB370; &#xD53C;&#xAC00; &#xC2EC;&#xD558;&#xAC8C; &#xAC80;&#xC815;&#xC0C9;&#xC774;&#xC600;&#xC2B5;&#xB2C8;&#xB2E4;. &#xC5B4;&#xB5A4; &#xC99D;&#xC0C1;&#xC774; &#xC788;&#xC73C;&#xBA74; &#xADF8;&#xB7F0;&#xC9C0; &#xC54C;&#xB824;&#xC8FC;&#xC138;&#xC694; &#xC5B4;&#xBA38;&#xB2C8;&#xB294; &#xACE0;&#xD608;&#xC555;, &#xACE0;&#xC9C0;&#xD608;&#xC99D;, &#xB2F9;&#xB1E8;&#xAC00; &#xC788;&#xC73C;&#xC2ED;&#xB2C8;&#xB2E4;</td><td align="left" valign="top">Mother is 67... received an IV drip... blood came out of the catheter needle... the blood was severely black. Please tell me what symptoms cause this. 
Mother has high blood pressure, hyperlipidemia, and diabetes.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Extracted filter</td><td align="left" valign="top">&#xACE0;&#xD608;&#xC555;, &#xACE0;&#xC9C0;&#xD608;&#xC99D;, &#xB2F9;&#xB1E8;, &#xC218;&#xC561;, &#xCE74;&#xD14C;&#xD0C0;, &#xD53C;, &#xAC80;&#xC815;&#xC0C9;</td><td align="left" valign="top">High blood pressure, hyperlipidemia, diabetes, IV drip, catheter, blood, black color</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Retriever without metadata filtering</td><td align="left" valign="top">[&#xC720;&#xC2A4;&#xD0C0;&#xD0A4;&#xC624;&#xAD00;] (&#xC911;&#xB7B5;) &#xC5D8;&#xB9AC;&#xBCA0;&#xC774;&#xD130;&#xAC00; &#xC0C1;&#xC2B9;&#xD558;&#xAC70;&#xB098; &#xD558;&#xAC15;&#xD560; &#xB54C; &#xADC0;&#xAC00; &#xBA39;&#xBA39;&#xD574;&#xC9C0;&#xB294; &#xAC83;&#xC740; &#xC678;&#xBD80;&#xC640; &#xC911;&#xC774;&#xC758; &#xAE30;&#xC555;&#xC774; &#xB2EC;&#xB77C; &#xACE0;&#xB9C9;&#xC774; &#xBE44;&#xD2C0;&#xC5B4;&#xC9C0;&#xACE0; &#xC720;&#xC2A4;&#xD0C0;&#xD0A4;&#xC624;&#xAD00;&#xC774; &#xB9C9;&#xD788;&#xB294; &#xD604;&#xC0C1; &#xB54C;&#xBB38;&#xC785;&#xB2C8;&#xB2E4;. &#xACE0;&#xB9C9;&#xC740; &#xC678;&#xBD80; &#xD658;&#xACBD;&#xACFC; &#xC2E0;&#xCCB4; &#xB0B4;&#xBD80;&#xC758; &#xACBD;&#xACC4;&#xC5D0; &#xC704;&#xCE58;&#xD55C; &#xACE0;&#xB9C9;&#xC740; &#xAE30;&#xC555; &#xCC28;&#xC774;&#xC5D0; &#xC758;&#xD574; &#xB9C9;&#xC774; &#xD130;&#xC9C0;&#xB294; &#xAC83;&#xC744; &#xBC29;&#xC9C0;&#xD558;&#xAE30; &#xC704;&#xD574; &#xBE44;&#xD2C0;&#xC5B4;&#xC9C0;&#xBA74;&#xC11C; &#xBCF4;&#xD638;&#xD569;&#xB2C8;&#xB2E4;. (&#xC911;&#xB7B5;)</td><td align="left" valign="top">[Seminal vesicle] (abridged) often discovered incidentally... can be caused by excessive proliferation of the seminal vesicle mucosa, inflammatory diseases like prostatitis... Hematospermia is treated symptomatically... 
(abridged)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Retriever with metadata filtering</td><td align="left" valign="top">[&#xC815;&#xB0AD;] (&#xC911;&#xB7B5;) &#xBCF8;&#xC778;&#xB3C4; &#xC798; &#xBAA8;&#xB974;&#xACE0; &#xC9C0;&#xB0B4;&#xB2E4;&#xAC00; &#xC6B0;&#xC5F0;&#xCC2E;&#xAC8C; &#xBC1C;&#xACAC;&#xB418;&#xB294; &#xACBD;&#xC6B0;&#xAC00; &#xB9CE;&#xC740;&#xB370;, &#xC815;&#xB0AD; &#xC810;&#xB9C9;&#xC758; &#xACFC;&#xB2E4;&#xD55C; &#xC99D;&#xC2DD;, &#xC804;&#xB9BD;&#xC120;&#xC5FC; &#xB4F1;&#xC758; &#xC5FC;&#xC99D;&#xC131; &#xC9C8;&#xD658;&#xC5D0; &#xC758;&#xD574; &#xBC1C;&#xC0DD;&#xD560; &#xC218; &#xC788;&#xC2B5;&#xB2C8;&#xB2E4;. &#xD608;&#xC815;&#xC561;&#xC99D;&#xC740; &#xC99D;&#xC0C1;&#xC5D0; &#xB530;&#xB77C; &#xB300;&#xC99D;&#xCE58;&#xB8CC;&#xB97C; &#xD558;&#xBA70;, &#xC2E0;&#xCCB4;&#xC5D0; &#xBC1C;&#xC0DD;&#xD55C; &#xB2E4;&#xB978; &#xC6D0;&#xC778;&#xC9C8;&#xD658;&#xC744; &#xCE58;&#xB8CC;&#xD558;&#xAC70;&#xB098; &#xC815;&#xB0AD;&#xC5FC;&#xC758; &#xACBD;&#xC6B0; &#xC815;&#xB0AD;&#xC744; &#xB9C8;&#xC0AC;&#xC9C0;&#xD558;&#xACE0; &#xD56D;&#xC0DD;&#xC81C;&#xB85C; &#xCE58;&#xB8CC;&#xD569;&#xB2C8;&#xB2E4;. (&#xC911;&#xB7B5;)</td><td align="left" valign="top">[Noninsulin-dependent diabetes mellitus] (abridged) ...it is rare to be completely cured... However, diabetes can be managed... If diabetes is well-managed, a healthy life can be maintained... (abridged)</td></tr></tbody></table></table-wrap><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>The RAG and metadata filtering diagram depicts the architecture of the RAG-based retrieval pipeline and API service, highlighting the integration of a Korean embedding model, Elasticsearch. 
API: application programming interface; LLM: large language model; RAG: retrieval-augmented generation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e72604_fig03.png"/></fig></sec><sec id="s2-4"><title>Evaluation for AI Model Performance</title><p>To validate this chatbot system, we reconstructed 226 representative questions based on question and answer data provided by a medical consultation platform, focusing on 7 common diseases among Koreans: hypertension, acute bronchitis, diabetes, indigestion or gastritis, atopic dermatitis, allergic rhinitis, and reflux esophagitis. <xref ref-type="table" rid="table3">Table 3</xref> summarizes the information about the question list.</p><p>The main question types by disease include lifestyle, medication, symptoms, and treatment. For scoring, an emergency medicine specialist provided the necessary answer guidelines for each question. We quantitatively assessed each model&#x2019;s responses using 3 criteria: accuracy, safety, and helpfulness. We used the following prompt script to guide the assessment process (<xref ref-type="other" rid="box1">Textbox 1</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Cross-tabulation of validation questions by disease category and question type. 
Statistics of the Q&#x0026;A<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> validation dataset for the verification of this chatbot system, consisting of 226 representative questions.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Disease or type</td><td align="left" valign="bottom">Lifestyle</td><td align="left" valign="bottom">Medication</td><td align="left" valign="bottom">Symptoms</td><td align="left" valign="bottom">Treatment</td><td align="left" valign="bottom">Complex</td><td align="left" valign="bottom">Total</td></tr></thead><tbody><tr><td align="left" valign="top">Hypertension</td><td align="left" valign="top">8</td><td align="left" valign="top">9</td><td align="left" valign="top">11</td><td align="left" valign="top">7</td><td align="left" valign="top">0</td><td align="left" valign="top">35</td></tr><tr><td align="left" valign="top">Acute bronchitis</td><td align="left" valign="top">6</td><td align="left" valign="top">7</td><td align="left" valign="top">8</td><td align="left" valign="top">5</td><td align="left" valign="top">0</td><td align="left" valign="top">26</td></tr><tr><td align="left" valign="top">Diabetes</td><td align="left" valign="top">8</td><td align="left" valign="top">9</td><td align="left" valign="top">11</td><td align="left" valign="top">7</td><td align="left" valign="top">0</td><td align="left" valign="top">35</td></tr><tr><td align="left" valign="top">Indigestion or gastritis</td><td align="left" valign="top">6</td><td align="left" valign="top">7</td><td align="left" valign="top">8</td><td align="left" valign="top">5</td><td align="left" valign="top">0</td><td align="left" valign="top">26</td></tr><tr><td align="left" valign="top">Atopic dermatitis</td><td align="left" valign="top">3</td><td align="left" valign="top">7</td><td align="left" valign="top">8</td><td align="left" valign="top">5</td><td align="left" valign="top">0</td><td align="left" 
valign="top">23</td></tr><tr><td align="left" valign="top">Allergic rhinitis</td><td align="left" valign="top">6</td><td align="left" valign="top">7</td><td align="left" valign="top">8</td><td align="left" valign="top">5</td><td align="left" valign="top">0</td><td align="left" valign="top">25</td></tr><tr><td align="left" valign="top">Reflux esophagitis</td><td align="left" valign="top">7</td><td align="left" valign="top">7</td><td align="left" valign="top">8</td><td align="left" valign="top">5</td><td align="left" valign="top">0</td><td align="left" valign="top">26</td></tr><tr><td align="left" valign="top">Etc</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">30</td><td align="left" valign="top">30</td></tr><tr><td align="left" valign="top">Total</td><td align="left" valign="top">42</td><td align="left" valign="top">53</td><td align="left" valign="top">62</td><td align="left" valign="top">39</td><td align="left" valign="top">30</td><td align="left" valign="top">226</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Q&#x0026;A: question and answer.</p></fn></table-wrap-foot></table-wrap><boxed-text id="box1"><title> Prompt script and criteria for the quantitative evaluation of artificial intelligence (AI) responses.</title><p>Prompt script:</p><p>You are an expert evaluating the performance of a medical consultation AI system. 
Use the SCORING REFERENCE to assess the AI ANSWER&#x2019;s content according to the following criteria:</p><p>Accuracy (1&#x2010;5 points)</p><p>Safety (1&#x2010;5 points)</p><p>Helpfulness (1&#x2010;5 points)</p><p>Please assign a score to the AI ANSWER for each criterion.</p><p>QUESTION: [Question statement]</p><p>SCORING REFERENCE: [Medically correct answer to serve as a reference for scoring]</p><p>AI ANSWER: [AI model&#x2019;s response]</p></boxed-text><p>We established accuracy, safety, and helpfulness as the core criteria for evaluating AI responses. Accuracy comprehensively evaluates whether the AI&#x2019;s answer is medically consistent with the model answer reviewed by a medical expert, and whether it clearly aligns with the user&#x2019;s question intent. In parallel, safety assesses whether the answer contains dangerous information that could be harmful to the user, and whether it ensures the information&#x2019;s reliability by including essential warning statements, such as &#x201C;AI advice cannot replace professional medical consultation.&#x201D; Helpfulness measures how easily the user can understand the accurate and safe information and receive practical help from it. In other words, an answer is considered good only when it goes beyond informational correctness and also possesses user-centered delivery and practicality.</p></sec><sec id="s2-5"><title>Ethical Considerations</title><p>All data used in this study were sourced exclusively from publicly accessible online platforms, including the official websites of major Korean hospitals, open-source medical textbooks, the Ministry of Food and Drug Safety, and Korean health search engines. These sources provide general disease-level medical knowledge intended for public health education and do not contain any patient-level records, clinical notes, or PII. 
No internal, proprietary, or private electronic health records were accessed.</p><p>As detailed in the Methods section, all collected data underwent a stringent verification process involving both automated screening and manual review by medical professionals to guarantee the total absence of any unintended PII. Accordingly, this study qualifies as research using publicly available information with no collection or recording of personal identifiers, and is exempt from Institutional Review Board review pursuant to Article 13, Paragraph 3 of the Enforcement Rule of the Bioethics and Safety Act in the Republic of Korea [<xref ref-type="bibr" rid="ref34">34</xref>].</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>The proposed model scored highest across accuracy, safety, and helpfulness. It significantly outperformed other models in terms of accuracy, safety, and helpfulness. The following table and graph visually represent the quantitative evaluation results for each model. To compare the performance differences between configurations (baseline vs RAG vs RAG+metadata filtering), we performed a paired <italic>t</italic> test for each model&#x2019;s scores across the 226 validation questions.</p><p><xref ref-type="table" rid="table4">Table 4</xref> shows the results of analyzing the impact of RAG and metadata filtering technologies on the medical consultation response performance of various language models. The general trend indicates that applying RAG technology alone results in minimal performance improvement or even a slight decline. In contrast, when RAG and metadata filtering are applied together, most models show a significant performance enhancement.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Analysis of quantitative evaluation results. 
This table presents the performance of each language model in terms of accuracy, safety, and helpfulness for medical consultation responses. <italic>P</italic> values were derived from 2-tailed paired <italic>t</italic> tests comparing each variant to its respective baseline model. Statistical significance was set at <italic>P</italic>&#x003C;.05.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Models</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">Safety</td><td align="left" valign="bottom">Helpfulness</td><td align="left" valign="bottom">Average</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Qwen3:4B, mean (SD)</td><td align="left" valign="top">4.23 (0.06)</td><td align="left" valign="top">4.25 (0.05)</td><td align="left" valign="top">4.27 (0.05)</td><td align="left" valign="top">4.25 (0.04)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">Qwen3:4B+RAG<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup>, mean (SD)</td><td align="left" valign="top">4.27 (0.06)</td><td align="left" valign="top">4.22 (0.05)</td><td align="left" valign="top">4.37 (0.05)</td><td align="left" valign="top">4.28 (0.04)</td><td align="left" valign="top">.40</td></tr><tr><td align="left" valign="top">Qwen3:4B+RAG+metadata filtering, mean (SD)</td><td align="left" valign="top">4.41 (0.07)</td><td align="left" valign="top">4.25 (0.06)</td><td align="left" valign="top">4.47 (0.06)</td><td align="left" valign="top">4.37 (0.05)</td><td align="left" valign="top">.02</td></tr><tr><td align="left" valign="top">Mistral:7B, mean (SD)</td><td align="left" valign="top">3.51 (0.10)</td><td align="left" valign="top">3.74 (0.09)</td><td align="left" valign="top">4.14 (0.07)</td><td align="left" valign="top">3.79 (0.08)</td><td 
align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Mistral:7B+RAG, mean (SD)</td><td align="left" valign="top">3.58 (0.13)</td><td align="left" valign="top">3.80 (0.12)</td><td align="left" valign="top">4.04 (0.10)</td><td align="left" valign="top">3.80 (0.11)</td><td align="left" valign="top">.73</td></tr><tr><td align="left" valign="top">Mistral:7B+RAG+metadata filtering, mean (SD)</td><td align="left" valign="top">3.92 (0.12)</td><td align="left" valign="top">4.13 (0.10)</td><td align="left" valign="top">4.25 (0.09)</td><td align="left" valign="top">4.10 (0.10)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Llama3.1:8B, mean (SD)</td><td align="left" valign="top">3.52 (0.08)</td><td align="left" valign="top">3.61 (0.09)</td><td align="left" valign="top">3.98 (0.06)</td><td align="left" valign="top">3.70 (0.07)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Llama3.1:8B+RAG, mean (SD)</td><td align="left" valign="top">3.50 (0.11)</td><td align="left" valign="top">3.58 (0.11)</td><td align="left" valign="top">3.85 (0.08)</td><td align="left" valign="top">3.64 (0.09)</td><td align="left" valign="top">.98</td></tr><tr><td align="left" valign="top">Llama3.1:8B+RAG+metadata filtering, mean (SD)</td><td align="left" valign="top">3.67 (0.10)</td><td align="left" valign="top">3.73 (0.11)</td><td align="left" valign="top">3.98 (0.07)</td><td align="left" valign="top">3.79 (0.09)</td><td align="left" valign="top">.04</td></tr><tr><td align="left" valign="top">Gpt-Oss:20B, mean (SD)</td><td align="left" valign="top">4.34 (0.04)</td><td align="left" valign="top">4.50 (0.04)</td><td align="left" valign="top">4.44 (0.06)</td><td align="left" valign="top">4.43 (0.05)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Gpt-Oss:20B+RAG, mean (SD)</td><td align="left" valign="top">4.41 (0.05)</td><td align="left" valign="top">4.57 
(0.04)</td><td align="left" valign="top">4.39 (0.06)</td><td align="left" valign="top">4.46 (0.05)</td><td align="left" valign="top">.15</td></tr><tr><td align="left" valign="top">Gpt-Oss:20B+RAG+metadata filtering, mean (SD)</td><td align="left" valign="top">4.46 (0.04)</td><td align="left" valign="top">4.61 (0.03)</td><td align="left" valign="top">4.45 (0.06)</td><td align="left" valign="top">4.51 (0.04)</td><td align="left" valign="top">.03</td></tr><tr><td align="left" valign="top">Gemma3:27B, mean (SD)</td><td align="left" valign="top">4.38 (0.03)</td><td align="left" valign="top">4.32 (0.05)</td><td align="left" valign="top">4.56 (0.03)</td><td align="left" valign="top">4.42 (0.03)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Gemma3:27B+RAG, mean (SD)</td><td align="left" valign="top">4.31 (0.05)</td><td align="left" valign="top">4.26 (0.05)</td><td align="left" valign="top">4.43 (0.04)</td><td align="left" valign="top">4.34 (0.03)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top">Gemma3:27B+RAG+metadata filtering, mean (SD)</td><td align="left" valign="top">4.49 (0.03)</td><td align="left" valign="top">4.34 (0.06)</td><td align="left" valign="top">4.61 (0.03)</td><td align="left" valign="top">4.48 (0.03)</td><td align="left" valign="top">.14</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Not available.</p></fn><fn id="table4fn2"><p><sup>b</sup>RAG: retrieval-augmented generation.</p></fn></table-wrap-foot></table-wrap><p>The minimal effect of applying RAG alone appears to stem from the difficulty of accurately finding relevant and useful information from vast external data sources. Indeed, the change in average scores for the Qwen3:4B (4.25, SD 0.04 &#x2192; 4.28, SD 0.04, <italic>P</italic>=.40) and Mistral:7B (3.79, SD 0.08 &#x2192; 3.80, SD 0.11, <italic>P</italic>=.73) models was not statistically significant. 
Furthermore, the Llama3.1:8B and Gemma3:27B models actually experienced a decline in scores.</p><p>In contrast, when metadata filtering was combined, a clear improvement was observed, particularly in the midsized parameter models. The average score for the Mistral:7B model increased significantly from 3.79, SD 0.08, to 4.10, SD 0.10, and the Qwen3:4B model&#x2019;s score rose from 4.25, SD 0.04, to 4.37, SD 0.05; both increases were statistically significant (<italic>P</italic>&#x003C;.05). Gpt-Oss:20B also demonstrated a significant (<italic>P</italic>&#x003C;.05) performance enhancement from 4.43, SD 0.05 to 4.51, SD 0.04, notably achieving the highest score of 4.61, SD 0.03 on the safety metric. This indicates that the effectiveness of RAG is highly dependent on the filtering process that controls the quality of the retrieved information.</p><p><xref ref-type="fig" rid="figure4">Figure 4</xref> compares the effects of RAG and metadata filtering against the original model. Each bar shows the average difference in accuracy, safety, and helpfulness compared to the baseline when RAG and metadata filtering are applied, with positive values indicating performance improvement. Overall, when the metadata filtering technique was included, the model showed improved performance in accuracy and helpfulness, which proves that it provides useful answers suited to the local Korean environment.</p><p>Statistical analyses were performed to evaluate the significance of performance changes. All continuous variables (accuracy, safety, and helpfulness) are reported as mean and SE of the mean. Differences between the baseline models and their RAG-augmented variants were assessed using a 2-tailed paired <italic>t</italic> test, assuming the scores for the same 226 questions are dependent. 
Statistical significance was defined as <italic>P</italic>&#x003C;.05.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Graph of performance changes in the original model based on RAG and metadata filtering. RAG: retrieval-augmented generation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e72604_fig04.png"/></fig></sec><sec id="s3-2"><title>Web Service Implementation</title><p>For this study, our development environment used Ubuntu 22.04 long-term support and an NVIDIA A100 40 GB graphics processing unit. We used the graphics processing unit for the computational operations of the LLMs. We applied Streamlit (Streamlit Inc) and LangChain (LangChain Inc) as development frameworks. Specifically, we used Streamlit to implement the web interface and LangChain to integrate and manage the LLMs.</p><p>The service developed through this process is named &#x201C;Geongangi.&#x201D; We aim for this service to provide information on disease symptoms, prevention, and health management, as well as posttreatment care conversations in the future. The intended target users are the general public and medical professionals. It can provide medical information to the general public and be used by medical professionals for the purpose of collecting patient information. <xref ref-type="fig" rid="figure5">Figure 5</xref> shows the &#x201C;Geongangi&#x201D; web service interface, illustrating the user-friendly design that supports both patient and health care provider interactions.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Screenshot of the implemented &#x201C;Geongangi&#x201D; web service interface. 
The interface is designed to provide an intuitive user experience for both the general public and health care professionals, enabling easy access to health consultations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e72604_fig05.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study aimed to determine under what conditions RAG and metadata filtering improve Korean health consultation responses across open-weight LLMs of varying scales. Our findings validate the hypothesis set in this study by revealing that retrieval augmentation yields meaningful gains only when retrieval quality is actively controlled through metadata filtering. Standalone RAG failed to produce statistically significant improvements in any model&#x2014;and, in 2 cases (Llama3.1:8B and Gemma3:27B), resulted in score decreases&#x2014;indicating that unfiltered retrieval over a large-scale Korean medical corpus introduces contextual noise that can degrade response quality. In contrast, RAG combined with metadata filtering produced statistically significant performance gains (<italic>P</italic>&#x003C;.05) in 4 of the 5 models evaluated. Notably, the magnitude of improvement was inversely associated with model scale: smaller models such as Mistral:7B showed the greatest absolute gain (3.79, SD 0.08 &#x2192; 4.10, SD 0.10), while the largest model, Gemma3:27B, showed no significant benefit (<italic>P</italic>=.14)&#x2014;suggesting that beyond a certain knowledge threshold, external retrieval provides diminishing returns. 
Collectively, these results confirm that effective RAG deployment in specialized medical domains requires not merely an augmentation of information, but a deliberate mechanism for aligning retrieved content with the clinical structure of the target health care environment.</p></sec><sec id="s4-2"><title>Interpretations and Implications</title><p>The failure of standalone RAG to improve performance can be attributed to the unique linguistic characteristics of Korean medical inquiries. As demonstrated in <xref ref-type="table" rid="table2">Table 2</xref>, patient queries are frequently long and narrative, embedding core clinical signals within surrounding emotional expressions, personal histories, and contextual background. This structure creates a semantic mismatch between the patient&#x2019;s actual clinical intent and the documents selected through vector similarity search alone; for example, a query describing weight loss, low mood, and loss of motivation was retrieved against a document on desquamative interstitial pneumonia rather than depressive disorder when filtering was absent. This finding is consistent with prior work identifying the limitations of dense retrieval in long, multitopic queries, where peripheral content dilutes the relevance signal and degrades retrieval precision.</p><p>Metadata filtering addresses this limitation not merely as a computational optimization, but as a mechanism that structurally aligns retrieval with the Korean clinical decision-making framework. South Korea&#x2019;s health care system is characterized by highly standardized diagnostic and treatment protocols governed by National Health Insurance reimbursement criteria and professional society guidelines, in which institutional standards often take precedence over individual clinical discretion. 
As Korean medical knowledge is thus organized around well-defined categories in accordance with international standard guidelines&#x2014;clinical department, disease classification, and drug and procedure type&#x2014;metadata filtering, which extracts these same clinical cues from the query before retrieval, naturally mirrors this organizational structure. This alignment explains why filtered RAG produced not only higher accuracy scores but also a meaningfully higher safety score in Gpt-Oss:20B (4.61, SD 0.03), whose reinforcement learning from human feedback&#x2013;based alignment appeared to synergize with retrieved documents containing explicit clinical warnings and contraindication information.</p><p>Model-specific performance patterns further reveal that the benefit of metadata filtering is modulated by a model&#x2019;s preexisting parametric knowledge. Mistral:7B, which showed the largest absolute improvement (3.79, SD 0.08 &#x2192; 4.10, SD 0.10), appears to be well-suited to instruction-following but lacks the domain-specific knowledge to generate accurate Korean medical responses independently; metadata-filtered retrieval effectively compensated for this gap, representing the highest return on retrieval investment relative to model size. In contrast, Llama3.1:8B showed a score decrease under RAG-only conditions, likely because its limited context usage capacity caused unfiltered, noisy documents to interfere with generation rather than support it. Gemma3:27B, which exhibited no significant improvement even under filtered RAG (<italic>P</italic>=.14), likely reflects a knowledge ceiling effect&#x2014;its 27B-scale pretraining corpus appears to have already internalized information equivalent to or exceeding the clinical scope of this study&#x2019;s document database, rendering external retrieval only marginally useful under current conditions. 
This effect may not generalize to domains requiring highly local, time-sensitive, or rare clinical knowledge not well-represented in general pretraining data.</p><p>These findings carry direct implications for practical deployment decisions. For resource-constrained settings that require on-premises deployment, smaller models (&#x2264;8B parameters) should be considered viable only in combination with high-quality metadata filtering, as their baseline medical knowledge is insufficient for safe clinical use. For midrange models (&#x2248;20B), the combination of filtered RAG and strong safety alignment offers a practical trade-off between computational cost and clinical reliability. For large models (&#x2265;27B), the marginal benefit of RAG may be limited unless the target domain involves highly specialized, locally specific, or frequently updated knowledge that is unlikely to be covered in pretraining data.</p></sec><sec id="s4-3"><title>Limitations</title><p>We identified several challenges and boundaries in this study that should be considered when interpreting the findings. First, the system continues to struggle with complex clinical inferences involving the simultaneous consideration of multiple medications or comorbid conditions, as the current metadata schema does not yet capture interaction-level clinical relationships. Second, the construction and maintenance of metadata tag sets requires substantial manual effort from domain experts, which may limit the system&#x2019;s scalability when extending to rare or highly specialized medical subspecialties underrepresented in the current corpus. Third, real-world user query analysis revealed frequent inquiries regarding diet, exercise, and lifestyle management, indicating that future iterations must expand data coverage beyond structured clinical literature to include validated nutritional and behavioral health information. 
Finally, this study did not systematically analyze the downstream impact of metadata extraction errors&#x2014;cases where the LLM incorrectly identified or omitted clinical cues from the query&#x2014;on final response quality, and the error propagation characteristics of such failures remain an important area for future investigation.</p></sec><sec id="s4-4"><title>Conclusions</title><p>This quantitative evaluation confirms that architectural reinforcement through metadata filtering is essential for the reliable application of AI in the Korean medical domain. By demonstrating a framework that bridges the gap between general-purpose LLMs and the specific requirements of the local health care system, this study provides a blueprint for safer medical consultation tools. As these systems evolve to integrate broader health-related data, they will play a vital role in enhancing clinical efficiency and providing patients with high-quality, verified medical information that adheres to regional standards.</p></sec></sec></body><back><ack><p>The authors declare the use of generative artificial intelligence (GenAI) in the research and writing process. According to the GAIDeT (Generative Artificial Intelligence Delegation Taxonomy; 2025), the following tasks were delegated to GenAI tools under full human supervision: literature search and systematization, code optimization, creation of algorithms for data analysis, text generation, proofreading and editing, summarizing text, translation, and reformatting. The GenAI tool used was Gemini (Google LLC). Responsibility for the final manuscript lies entirely with the authors. GenAI tools are not listed as authors and do not bear responsibility for the outcomes. Declaration submitted by SC. 
Additional note: all AI-generated suggestions were critically reviewed and edited by the authors.</p></ack><notes><sec><title>Funding</title><p>This work was supported by the Bio-Industrial Technology Development Program (RS-2025-02220286, &#x201C;(Division 2) Development of large language AI model-based techniques and platforms for nursery record generation and task automation&#x201D;) funded by the Ministry of Trade, Industry and Energy (MOTIE, Korea). The funder had no involvement in this study&#x2019;s design, data collection, analysis, interpretation, or the writing of this paper.</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: HY, SC, DK</p><p>Data curation: SC, DHA, ESL</p><p>Formal analysis: SC, DK</p><p>Funding acquisition: HY, YJK, JHJ</p><p>Investigation: ESL, SC</p><p>Methodology: SC, DK</p><p>Project administration: HY, SC, DK</p><p>Resources: HY, ESL</p><p>Software: SC, JHJ</p><p>Supervision: HY, DHL, MK</p><p>Validation: DK, SC</p><p>Visualization: SC, DK, HY</p><p>Writing &#x2013; original draft: SC, DK, HY</p><p>Writing &#x2013; review &#x0026; editing: SC, DK, HY, JHJ, MK, DHL, DHA, ESL, YJK</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">PII</term><def><p>personally identifiable information</p></def></def-item><def-item><term id="abb4">RAG</term><def><p>retrieval-augmented generation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clusmann</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kolbinger</surname><given-names>FR</given-names> </name><name name-style="western"><surname>Muti</surname><given-names>HS</given-names> </name><etal/></person-group><article-title>The future landscape of large language models in medicine</article-title><source>Commun Med (Lond)</source><year>2023</year><month>10</month><day>10</day><volume>3</volume><issue>1</issue><fpage>141</fpage><pub-id pub-id-type="doi">10.1038/s43856-023-00370-1</pub-id><pub-id pub-id-type="medline">37816837</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Shin</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yoo</surname><given-names>SH</given-names> 
</name></person-group><article-title>Performance of large language models in non-English medical ethics-related multiple choice questions: comparison of ChatGPT performance across versions and languages</article-title><source>BMC Med Ethics</source><year>2025</year><month>12</month><day>9</day><volume>26</volume><issue>1</issue><fpage>168</fpage><pub-id pub-id-type="doi">10.1186/s12910-025-01316-z</pub-id><pub-id pub-id-type="medline">41366422</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Son</surname><given-names>G</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Chiruzzo</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ritter</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name></person-group><article-title>KMMLU: measuring massive multitask language understanding in Korean</article-title><source>Assoc Comput Linguist</source><year>2025</year><fpage>4076</fpage><lpage>4104</lpage><pub-id pub-id-type="doi">10.18653/v1/2025.naacl-long.206</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Workum</surname><given-names>JD</given-names> </name><name name-style="western"><surname>van de Sande</surname><given-names>D</given-names> </name><name name-style="western"><surname>Gommers</surname><given-names>D</given-names> </name><name name-style="western"><surname>van Genderen</surname><given-names>ME</given-names> </name></person-group><article-title>Bridging the gap: a practical 
step-by-step approach to warrant safe implementation of large language models in healthcare</article-title><source>Front Artif Intell</source><year>2025</year><volume>8</volume><fpage>1504805</fpage><pub-id pub-id-type="doi">10.3389/frai.2025.1504805</pub-id><pub-id pub-id-type="medline">39931218</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qi</surname><given-names>W</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>L</given-names> </name></person-group><article-title>Epistemic and ethical limits of large language models in evidence-based medicine: from knowledge to judgment</article-title><source>Front Digit Health</source><year>2025</year><volume>7</volume><fpage>1706383</fpage><pub-id pub-id-type="doi">10.3389/fdgth.2025.1706383</pub-id><pub-id pub-id-type="medline">41641437</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Park</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Dean</surname><given-names>G</given-names> </name><name name-style="western"><surname>Ortiz</surname><given-names>EM</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>JI</given-names> </name></person-group><article-title>Overview of South Korean guidelines for approval of large language or multimodal models as medical devices: key features and areas for improvement</article-title><source>Korean J Radiol</source><year>2025</year><month>06</month><volume>26</volume><issue>6</issue><fpage>519</fpage><lpage>523</lpage><pub-id pub-id-type="doi">10.3348/kjr.2025.0257</pub-id><pub-id pub-id-type="medline">40288893</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>CW</given-names> </name></person-group><article-title>Access and reimbursement for artificial intelligence in radiology: a Korean perspective</article-title><source>Korean J Radiol</source><year>2026</year><month>01</month><volume>27</volume><issue>1</issue><fpage>4</fpage><lpage>6</lpage><pub-id pub-id-type="doi">10.3348/kjr.2025.1647</pub-id><pub-id pub-id-type="medline">41494669</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>DS</given-names> </name><name name-style="western"><surname>Bae</surname><given-names>G</given-names> </name><name name-style="western"><surname>Yoo</surname><given-names>SY</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>M</given-names> </name></person-group><article-title>Between government policy, clinical autonomy, and market demands: a qualitative study of the impact of the prescribing analysis system on behavior of physicians in South Korea</article-title><source>BMC Health Serv Res</source><year>2015</year><month>09</month><day>21</day><volume>15</volume><fpage>397</fpage><pub-id pub-id-type="doi">10.1186/s12913-015-1059-x</pub-id><pub-id pub-id-type="medline">26392282</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Artsi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><etal/></person-group><article-title>Challenges of implementing LLMs in clinical practice: perspectives</article-title><source>J Clin 
Med</source><year>2025</year><month>09</month><day>1</day><volume>14</volume><issue>17</issue><fpage>6169</fpage><pub-id pub-id-type="doi">10.3390/jcm14176169</pub-id><pub-id pub-id-type="medline">40943929</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gumilar</surname><given-names>KE</given-names> </name><name name-style="western"><surname>Indraprasta</surname><given-names>BR</given-names> </name><name name-style="western"><surname>Hsu</surname><given-names>YC</given-names> </name><etal/></person-group><article-title>Disparities in medical recommendations from AI-based chatbots across different countries/regions</article-title><source>Sci Rep</source><year>2024</year><month>07</month><day>24</day><volume>14</volume><issue>1</issue><fpage>17052</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-67689-0</pub-id><pub-id pub-id-type="medline">39048640</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hager</surname><given-names>P</given-names> </name><name name-style="western"><surname>Jungmann</surname><given-names>F</given-names> </name><name name-style="western"><surname>Holland</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Evaluation and mitigation of the limitations of large language models in clinical decision-making</article-title><source>Nat Med</source><year>2024</year><month>09</month><volume>30</volume><issue>9</issue><fpage>2613</fpage><lpage>2622</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03097-1</pub-id><pub-id pub-id-type="medline">38965432</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> 
</name><name name-style="western"><surname>Jeong</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Medical hallucination in foundation models and their impact on healthcare</article-title><source>medRxiv</source><comment>Preprint posted online on  Mar 3, 2025</comment><pub-id pub-id-type="doi">10.1101/2025.02.28.25323115</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>WB</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Solaiman</surname><given-names>B</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>IG</given-names> </name></person-group><article-title>Regulating artificial intelligence in medical care in South Korea</article-title><source>Research Handbook on Health, AI and the Law</source><year>2024</year><publisher-name>Edward Elgar Publishing Ltd</publisher-name><pub-id pub-id-type="doi">10.4337/9781802205657.ch22</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>YM</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>TH</given-names> </name></person-group><article-title>Korean clinical entity recognition from diagnosis text using BERT</article-title><source>BMC Med Inform Decis Mak</source><year>2020</year><month>09</month><day>30</day><volume>20</volume><issue>Suppl 7</issue><fpage>242</fpage><pub-id pub-id-type="doi">10.1186/s12911-020-01241-8</pub-id><pub-id pub-id-type="medline">32998724</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Watabe</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yanagisawa</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sayama</surname><given-names>K</given-names> </name><etal/></person-group><article-title>A patient-centered approach to developing and validating a natural language processing model for extracting patient-reported symptoms</article-title><source>Sci Rep</source><year>2025</year><month>07</month><day>29</day><volume>15</volume><issue>1</issue><fpage>27652</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-12845-3</pub-id><pub-id pub-id-type="medline">40730600</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koleck</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Tatonetti</surname><given-names>NP</given-names> </name><name name-style="western"><surname>Bakken</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Identifying symptom information in clinical notes using natural language processing</article-title><source>Nurs Res</source><year>2021</year><volume>70</volume><issue>3</issue><fpage>173</fpage><lpage>183</lpage><pub-id pub-id-type="doi">10.1097/NNR.0000000000000488</pub-id><pub-id pub-id-type="medline">33196504</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adam</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Keenan</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Wilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ghassemi</surname><given-names>M</given-names> </name></person-group><article-title>Clinical information extraction with large language models: a case study on organ procurement</article-title><source>AMIA Annu Symp Proc</source><year>2024</year><volume>2024</volume><fpage>115</fpage><lpage>123</lpage><pub-id pub-id-type="medline">40417525</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>A pre-trained BERT for Korean medical natural language processing</article-title><source>Sci Rep</source><year>2022</year><month>08</month><day>16</day><volume>12</volume><issue>1</issue><fpage>13847</fpage><pub-id pub-id-type="doi">10.1038/s41598-022-17806-8</pub-id><pub-id pub-id-type="medline">35974113</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yoo</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>HJ</given-names> </name></person-group><article-title>Real-time hybrid query transformation method for enhancing search system performance in Korean language applications</article-title><source>Teh glas (Online)</source><year>2026</year><month>02</month><day>24</day><volume>20</volume><issue>1</issue><fpage>108</fpage><lpage>117</lpage><pub-id pub-id-type="doi">10.31803/tg-20250410040449</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhuang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Can we trust AI doctors? A survey of medical hallucination in large language and large vision-language models</article-title><source>Assoc Comput Linguist</source><year>2025</year><fpage>6748</fpage><lpage>6769</lpage><pub-id pub-id-type="doi">10.18653/v1/2025.findings-acl.350</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ye</surname><given-names>J</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Enhancing large language models for improved accuracy and safety in medical question answering: comparative study</article-title><source>JMIR Med Educ</source><year>2025</year><month>12</month><day>2</day><volume>11</volume><fpage>e70190</fpage><pub-id pub-id-type="doi">10.2196/70190</pub-id><pub-id pub-id-type="medline">41329953</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nguyen</surname><given-names>NP</given-names> </name><name name-style="western"><surname>Tran</surname><given-names>P</given-names> </name></person-group><article-title>Bridging the mentorship divide: how large language models could reshape medical workforce equity</article-title><source>npj Digit Med</source><year>2026</year><month>01</month><day>9</day><volume>9</volume><issue>1</issue><fpage>29</fpage><pub-id 
pub-id-type="doi">10.1038/s41746-025-02167-z</pub-id><pub-id pub-id-type="medline">41513946</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>McCoy</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>A</given-names> </name></person-group><article-title>Improving large language model applications in biomedicine with retrieval-augmented generation: a systematic review, meta-analysis, and clinical development guidelines</article-title><source>J Am Med Inform Assoc</source><year>2025</year><month>04</month><day>1</day><volume>32</volume><issue>4</issue><fpage>605</fpage><lpage>615</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaf008</pub-id><pub-id pub-id-type="medline">39812777</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lewis</surname><given-names>P</given-names> </name><name name-style="western"><surname>Perez</surname><given-names>E</given-names> </name><name name-style="western"><surname>Piktus</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Retrieval-augmented generation for knowledge-intensive NLP tasks</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 12, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2005.11401</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>NF</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hewitt</surname><given-names>J</given-names> 
</name><etal/></person-group><article-title>Lost in the middle: how language models use long contexts</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 20, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2307.03172</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Poliakov</surname><given-names>M</given-names> </name><name name-style="western"><surname>Shvai</surname><given-names>N</given-names> </name></person-group><article-title>Multi-meta-RAG: improving RAG for multi-hop queries using database filtering with LLM-extracted metadata</article-title><year>2025</year><volume>2359</volume><fpage>334</fpage><lpage>342</lpage><pub-id pub-id-type="doi">10.1007/978-3-031-81372-6_25</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Bruni</surname><given-names>D</given-names> </name><name name-style="western"><surname>Avvenuti</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tonellotto</surname><given-names>N</given-names> </name><name name-style="western"><surname>Tesconi</surname><given-names>M</given-names> </name></person-group><article-title>AMAQA: a metadata-based QA dataset for RAG systems</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 12, 2026</comment><pub-id pub-id-type="doi">10.48550/arXiv.2505.13557</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Hwang</surname><given-names>G</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Chang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Roh</surname><given-names>HW</given-names> </name><name name-style="western"><surname>Park</surname><given-names>RW</given-names> </name></person-group><article-title>Performance of open-source large language models in psychiatry: usability study through comparative analysis of non-English records and English translations</article-title><source>J Med Internet Res</source><year>2025</year><month>08</month><day>18</day><volume>27</volume><fpage>e69857</fpage><pub-id pub-id-type="doi">10.2196/69857</pub-id><pub-id pub-id-type="medline">40825309</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kweon</surname><given-names>S</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>B</given-names> </name><name name-style="western"><surname>Chu</surname><given-names>G</given-names> </name><etal/></person-group><article-title>KorMedMCQA: multi-choice question answering benchmark for Korean healthcare professional licensing examinations</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 9, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2403.01469</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lian</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name></person-group><article-title>M3-embedding: multi-linguality, multi-functionality, multi-granularity text embeddings through self-knowledge distillation</article-title><source>Assoc Comput Linguist</source><year>2024</year><fpage>2318</fpage><lpage>2335</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.findings-acl.137</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Prabhu</surname><given-names>V</given-names> </name></person-group><article-title>A comprehensive study of elastic search</article-title><source>JRSE</source><year>2022</year><month>11</month><day>30</day><volume>4</volume><issue>11</issue><fpage>11</fpage><pub-id pub-id-type="doi">10.53469/jrse.2022.04(11).07</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Song</surname><given-names>J</given-names> </name></person-group><article-title>A chatbot based question and answer system for the auxiliary diagnosis of chronic diseases based on large language model</article-title><source>Sci Rep</source><year>2024</year><volume>14</volume><issue>1</issue><pub-id pub-id-type="doi">10.1038/s41598-024-67429-4</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><article-title>Enforcement Rule of the Bioethics and Safety Act [Article in Korean]</article-title><source>Korea Ministry of Government Legislation (National Law Information Center)</source><year>2025</year><month>06</month><day>2</day><access-date>2026-04-18</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.law.go.kr/LSW/lsLinkCommonInfo.do?lspttninfSeq=75929&#x0026;chrClsCd=010202">https://www.law.go.kr/LSW/lsLinkCommonInfo.do?lspttninfSeq=75929&#x0026;chrClsCd=010202</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Checklist 1</label><p>Tripod+LLM checklist.</p><media xlink:href="formative_v10i1e72604_app1.pdf" xlink:title="PDF File, 177 KB"/></supplementary-material></app-group></back></article>