<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e54803</article-id><article-id pub-id-type="doi">10.2196/54803</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Synthetic Data-Driven Approaches for Chinese Medical Abstract Sentence Classification: Computational Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Li</surname><given-names>Jiajia</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Wang</surname><given-names>Zikai</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Yu</surname><given-names>Longxuan</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Hui</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Song</surname><given-names>Haitao</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Shanghai Artificial Intelligence Research Institute Co., Ltd</institution><addr-line>Shanghai</addr-line><country>China</country></aff><aff id="aff2"><institution>Xiangfu Laboratory</institution><addr-line>Jiaxing</addr-line><country>China</country></aff><aff id="aff3"><institution>School of Chemistry and Chemical Engineering, Shanghai Jiao Tong University</institution><addr-line>Shanghai</addr-line><country>China</country></aff><aff id="aff4"><institution>Inner Mongolia Academy of Science and Technology</institution><addr-line>Hohhot</addr-line><country>China</country></aff><aff id="aff5"><institution>University of California San Diego</institution><addr-line>San Diego</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff6"><institution>Shanghai Civil Aviation College</institution><addr-line>Shanghai</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>Yifan</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Haitao Song, PhD, Shanghai Artificial Intelligence Research Institute Co., Ltd, No. 
930, Jianchuan Road, Minhang District, Shanghai, 200240, China, 8618817771407; <email>songhaitao@sairi.com.cn</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>19</day><month>3</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e54803</elocation-id><history><date date-type="received"><day>14</day><month>04</month><year>2024</year></date><date date-type="rev-recd"><day>28</day><month>11</month><year>2024</year></date><date date-type="accepted"><day>28</day><month>11</month><year>2024</year></date></history><copyright-statement>&#x00A9; Jiajia Li, Zikai Wang, Longxuan Yu, Hui Liu, Haitao Song. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 19.3.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e54803"/><abstract><sec><title>Background</title><p>Medical abstract sentence classification is crucial for enhancing medical database searches, literature reviews, and generating new abstracts. However, Chinese medical abstract classification research is hindered by a lack of suitable datasets. Given the vastness of Chinese medical literature and the unique value of traditional Chinese medicine, precise classification of these abstracts is vital for advancing global medical research.</p></sec><sec><title>Objective</title><p>This study aims to address the data scarcity issue by generating a large volume of labeled Chinese abstract sentences without manual annotation, thereby creating new training datasets. Additionally, we seek to develop more accurate text classification algorithms to improve the precision of Chinese medical abstract classification.</p></sec><sec sec-type="methods"><title>Methods</title><p>We developed 3 training datasets (dataset #1, dataset #2, and dataset #3) and a test dataset to evaluate our model. Dataset #1 contains 15,000 abstract sentences translated from the PubMed dataset into Chinese. Datasets #2 and #3, each with 15,000 sentences, were generated using GPT-3.5 from 40,000 Chinese medical abstracts in the CSL database. Dataset #2 used titles and keywords for pseudolabeling, while dataset #3 aligned abstracts with category labels. The test dataset includes 87,000 sentences from 20,000 abstracts. We used SBERT embeddings for deeper semantic analysis and evaluated our model using clustering (SBERT-DocSCAN) and supervised methods (SBERT-MEC). 
Extensive ablation studies and feature analyses were conducted to validate the model&#x2019;s effectiveness and robustness.</p></sec><sec sec-type="results"><title>Results</title><p>Our experiments involved training both clustering and supervised models on the 3 datasets, followed by comprehensive evaluation using the test dataset. The outcomes demonstrated that our models outperformed the baseline metrics. Specifically, when trained on dataset #1, the SBERT-DocSCAN model registered an impressive accuracy and <italic>F</italic><sub>1</sub>-score of 89.85% on the test dataset. Concurrently, the SBERT-MEC algorithm exhibited comparable performance with an accuracy of 89.38% and an identical <italic>F</italic><sub>1</sub>-score. Training on dataset #2 yielded similarly positive results for the SBERT-DocSCAN model, achieving an accuracy and <italic>F</italic><sub>1</sub>-score of 89.83%, while the SBERT-MEC algorithm recorded an accuracy of 86.73% and an <italic>F</italic><sub>1</sub>-score of 86.51%. Notably, training with dataset #3 allowed the SBERT-DocSCAN model to attain the best performance, with an accuracy and <italic>F</italic><sub>1</sub>-score of 91.30%, whereas the SBERT-MEC algorithm also showed robust performance, obtaining an accuracy of 90.39% and an <italic>F</italic><sub>1</sub>-score of 90.35%. Ablation analysis highlighted the critical role of integrated features and methodologies in improving classification efficiency.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Our approach addresses the challenge of limited datasets for Chinese medical abstract classification by generating novel datasets. 
The deployment of SBERT-DocSCAN and SBERT-MEC models significantly enhances the precision of classifying Chinese medical abstracts, even when using synthetic datasets with pseudolabels.</p></sec></abstract><kwd-group><kwd>medical abstract sentence classification</kwd><kwd>large language models</kwd><kwd>synthetic datasets</kwd><kwd>deep learning</kwd><kwd>Chinese medical</kwd><kwd>dataset</kwd><kwd>traditional Chinese medicine</kwd><kwd>global medical research</kwd><kwd>algorithm</kwd><kwd>robustness</kwd><kwd>efficiency</kwd><kwd>accuracy</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>In the realms of machine learning and artificial intelligence, the significance of data cannot be overstated, yet accessing authentic real-world datasets often presents substantial hurdles, including elevated costs, extended timeframes, and privacy issues. To navigate these obstacles, there is a growing pivot toward the utilization of synthetic datasets. While synthetic datasets have predominantly been associated with computer vision applications, the landscape is changing for natural language processing (NLP) due to advancements in text generation capabilities, largely attributed to the multihead self-attention mechanism integral to the transformer family of models. This trend began with the transformer itself [<xref ref-type="bibr" rid="ref1">1</xref>] and has since given rise to models such as Bidirectional Encoder Representations from Transformers (BERT) [<xref ref-type="bibr" rid="ref2">2</xref>], OpenAI GPT [<xref ref-type="bibr" rid="ref3">3</xref>], Transformer-XL [<xref ref-type="bibr" rid="ref4">4</xref>], OpenAI GPT-2 [<xref ref-type="bibr" rid="ref5">5</xref>], and Grover [<xref ref-type="bibr" rid="ref6">6</xref>]. In recent years, large language models have transformed the field of NLP, demonstrating exceptional performance in a wide range of tasks. 
These models generate increasingly coherent text, prompting researchers to explore the potential of synthetic datasets. Particularly noteworthy, Zellers et al [<xref ref-type="bibr" rid="ref6">6</xref>] claimed that their Grover model for conditional text generation outperforms human-generated text in both style and content, especially in the &#x201C;fake news&#x201D; and &#x201C;propaganda&#x201D; categories, as evaluated by human raters. This highlights the promise of synthetic datasets to improve NLP performance and the continued need to develop text generation techniques.</p></sec><sec id="s1-2"><title>Prior Work</title><sec id="s1-2-1"><title>Sentence-Level Text Classification</title><p>Text classification at the sentence level has been explored using various deep learning models, such as convolutional neural networks (CNNs) and recurrent neural networks (RNNs). Kim [<xref ref-type="bibr" rid="ref7">7</xref>] first proposed a single-layer CNN with pretrained word embeddings for text classification, achieving excellent results. Yang et al [<xref ref-type="bibr" rid="ref8">8</xref>] developed a 2-level attention mechanism using gate recurrent units for document classification, while Conneau et al [<xref ref-type="bibr" rid="ref9">9</xref>] introduced a CNN-nonstatic model with character-level CNNs and an average pooling layer. Recently, pretrained models, such as BERT [<xref ref-type="bibr" rid="ref2">2</xref>], RoBERTa [<xref ref-type="bibr" rid="ref10">10</xref>], XLNet [<xref ref-type="bibr" rid="ref11">11</xref>], and ALBERT [<xref ref-type="bibr" rid="ref12">12</xref>], have also been used for sentence-level text classification tasks. These transformer-based models generate sentence representations and can be combined with various classifiers to achieve state-of-the-art performance. 
Additionally, hybrid approaches using word-level and character-level CNNs initialized with ELMo [<xref ref-type="bibr" rid="ref13">13</xref>] or BERT embeddings have been explored to improve the robustness and performance of sentence-level text classification models. Overall, pretrained models have significantly advanced the state-of-the-art in sentence-level text classification, and further research in this area is expected to yield even more sophisticated models.</p></sec><sec id="s1-2-2"><title>Unsupervised Text Clustering</title><p>Unsupervised text clustering is an important task in NLP that groups similar text documents without relying on labeled examples. While traditional methods such as hierarchical agglomerative clustering [<xref ref-type="bibr" rid="ref14">14</xref>], k-means clustering [<xref ref-type="bibr" rid="ref15">15</xref>], nonnegative matrix factorization [<xref ref-type="bibr" rid="ref16">16</xref>], and latent dirichlet allocation [<xref ref-type="bibr" rid="ref17">17</xref>] have been widely used for this task, recent advances in pretrained embeddings have led to new and competitive methods. These include paragraph vector [<xref ref-type="bibr" rid="ref18">18</xref>] and USE+KMeans [<xref ref-type="bibr" rid="ref19">19</xref>], which have shown promising results in various text clustering benchmarks. More recently, BERT-based methods, such as BERT-EMD [<xref ref-type="bibr" rid="ref20">20</xref>], have been proposed for unsupervised text clustering. Additionally, SBERT [<xref ref-type="bibr" rid="ref21">21</xref>] and DocSCAN [<xref ref-type="bibr" rid="ref22">22</xref>] are the most recent methods that generate high-quality text embeddings and use graph-based clustering for unsupervised document clustering. 
Together, these developments highlight the continued importance of unsupervised text clustering in NLP research, with recent methods based on pretrained embeddings, and SBERT in particular, showing promising results.</p></sec><sec id="s1-2-3"><title>Synthetic Data</title><p>Synthetic data, generated using various techniques such as generative adversarial network (GAN)-based text generation and language model-based data augmentation, has become a popular way to expand small datasets in NLP. These synthetic datasets aim to improve model performance and generalization by providing additional examples for training. For instance, Xu et al [<xref ref-type="bibr" rid="ref23">23</xref>] used a GAN-based method to generate synthetic data for image captioning, which showed promising results. Similarly, Wang and Wan [<xref ref-type="bibr" rid="ref24">24</xref>] expanded GAN-based text generation to create synthetic datasets for sentiment analysis, which performed similarly to real data. Additionally, language model&#x2013;based data augmentation, such as using GPT-2 for auto-completion, has been effective in generating synthetic data for NLP tasks [<xref ref-type="bibr" rid="ref5">5</xref>]. Zhang et al [<xref ref-type="bibr" rid="ref25">25</xref>] improved Chinese text classification using a language model&#x2013;based data augmentation technique, while Zhou et al [<xref ref-type="bibr" rid="ref26">26</xref>] showed that their language model&#x2013;based data augmentation method improved low-resource language modeling. However, the use of synthetic data in NLP also has its limitations and potential drawbacks, which require further research. 
It is important to explore the advantages and disadvantages of synthetic datasets, as well as the different techniques used to generate them, to fully understand their impact on model performance and generalization.</p></sec></sec><sec id="s1-3"><title>The Goal of This Study</title><p>This paper aims to explore the creation and use of synthetic datasets to address the lack of real-world datasets for Chinese medical abstract classification, as there are currently no open-source datasets with sentence-level classification labels for Chinese medical abstracts, as shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. In response to this challenge, we have harnessed the capabilities of GPT-3.5 to generate distinct datasets for the classification of Chinese medical abstracts. Moreover, we designed and trained the clustering and supervised models for this task. Through this innovative approach, this study not only showcases the immense potential of synthetic datasets to bridge the gaps inherent in real-world datasets but also illuminates their profound impact on enhancing the performance of NLP tasks.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Goal of this study. It has been observed that Chinese medical abstract classification faces a significant number of limitations due to the lack of corresponding labeled datasets. 
Achieving accurate sentence-level classification in these abstracts will be instrumental in contributing Chinese medical information to the global medical field.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e54803_fig01.png"/></fig></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overall Framework</title><p>The methodology used in this paper is structured into three distinct sections, systematically addressing the creation and application of synthetic datasets for Chinese medical abstract classification:</p><list list-type="order"><list-item><p>Synthetic dataset generation: the process begins by translating the PubMed dataset [<xref ref-type="bibr" rid="ref27">27</xref>] into Chinese and generating 2 distinct synthetic abstract datasets based on provided titles, keywords, disciplines, and categories by GPT-3.5.</p></list-item><list-item><p>Unsupervised clustering: this step involves fine-tuning the SBERT model with the synthetic datasets and then using the DocSCAN algorithm to cluster the generated sentence embeddings in an unsupervised manner, termed SBERT-DocSCAN (<xref ref-type="fig" rid="figure2">Figure 2</xref>).</p></list-item><list-item><p>Supervised classification: a new supervised method, SBERT-MEC, is proposed to classify the synthetic data set, enhancing the ability to accurately categorize synthetic medical abstracts (<xref ref-type="fig" rid="figure2">Figure 2</xref>).</p></list-item></list><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Framework of the proposed methods for abstract classification. The unsupervised clustering method (SBERT-DocSCAN) leverages SBERT embeddings and DocSCAN to group sentences into abstract sections based on clustering. 
The supervised learning method (SBERT-MEC) uses a multiencoder cascade (MEC) to enhance classification accuracy by extracting and integrating features for abstract sentence classification.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e54803_fig02.png"/></fig></sec><sec id="s2-2"><title>Synthetic Dataset Generation</title><p>As there is currently no Chinese dataset for sentence-level classification in the medical abstract field, in this step, we used OpenAI&#x2019;s text generation model, GPT-3.5, to generate 3 types of small synthetic datasets, which only contain around 15,000 sentences each yet still perform well on classification tasks. The first one is the translated PubMed dataset, which is translated into Chinese from the PubMed 200k RCT dataset by DeepL [<xref ref-type="bibr" rid="ref28">28</xref>], and we choose 15,000 sentences with clear labels as dataset #1. The second dataset is created by using GPT-3.5 to generate abstracts based on title, keywords, discipline, and category from the CSL dataset [<xref ref-type="bibr" rid="ref29">29</xref>]. The third dataset is also generated using GPT-3.5, but instead, it is paraphrased with rewritten abstracts assigned pseudolabels as dissimilar to the original text as possible.</p><p>Generating a diverse training dataset using large language models is a challenge. Even when the generation temperature is set to a high value, these models still tend to generate highly repetitive datasets that lack the diversity required for effective language model training. To address this issue, we selected the CSL corpus as the base corpus, which contains over 40,000 Chinese medical abstracts. Although these abstracts are not annotated at the sentence level, 20,000 of them have clear structure division into the 4 parts mentioned above. We extracted these 20,000 abstracts and manually labeled them as the test dataset. 
For the remaining 20,000 abstracts without clear structure division, we removed some low-quality data from the corpus of abstracts, then extracted a portion of them as the input corpus for GPT-3.5, which includes corresponding titles, keywords, subjects, and categories. These 4 types of data were inputted into GPT-3.5 to generate abstracts in 4 parts: purpose, method, results, and conclusion. These abstracts were then cleaned to produce a dataset of 15,000 sentences with pseudolabels (dataset #2). Similarly, the structure-less abstracts were inputted into GPT-3.5, which rewrote them to generate clear structure abstracts in 4 parts, resulting in a dataset of 15,000 sentences with pseudolabels (dataset #3). The construction process of the above datasets is shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Construction process of the synthetic datasets used for sentence-level classification in the medical abstract field. Dataset #1 is derived by translating the PubMed 200k RCT dataset into Chinese using DeepL and selecting 15,000 sentences with clear labels. Dataset #2 is generated by GPT-3.5, using titles, keywords, disciplines, and categories from the CSL dataset to produce structured abstracts divided into 4 parts (objective, methods, results, and conclusions). Dataset #3 is created by paraphrasing structure-less abstracts using GPT-3.5, generating diverse and rewritten abstracts with pseudolabels. The test dataset comprises 20,000 manually labeled abstracts from the CSL corpus with clear structural divisions. 
For each of the models in the paper (eg, SBERT), we train them on the training sets separately to produce corresponding ones (eg, SBERT #1, SBERT #2, and SBERT #3).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e54803_fig03.png"/></fig></sec><sec id="s2-3"><title>Unsupervised Clustering</title><p>In the second part, we fine-tuned the sentence transformer and then used the DocSCAN algorithm to cluster the synthetic datasets. We chose the sbert-chinese-general-v2 model, which is a model pretrained on the SimCLUE dataset [<xref ref-type="bibr" rid="ref30">30</xref>], as the base model due to its outstanding performance on embedding Chinese sentences. We fine-tuned the pretrained model on the 3 datasets into 3 models, which is quite crucial as it leads the overall distribution to be more inclined toward the functional aspects of each sentence rather than the literal content. We then used these fine-tuned models to embed the input data, followed by clustering by the DocSCAN algorithm, which is an algorithm relying on the intuition that a datapoint and its nearest neighbors in representation space often share the same class label, and outperforms others on unsupervised classification tasks. 
Specifically, in this part, we generated embeddings for each sentence in the test set using the 3 fine-tuned sentence transformers in the first part and then used the DocSCAN algorithm to cluster these embeddings for unsupervised abstract classification, which uses the SCAN loss <inline-formula><mml:math id="ieqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="normal">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="italic">S</mml:mi></mml:mrow></mml:mrow></mml:mrow></mml:msub></mml:mrow></mml:mstyle></mml:math></inline-formula> to fine-tune the model <inline-formula><mml:math id="ieqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="italic">f</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="italic">x</mml:mi></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:math></inline-formula>, defined as:</p><disp-formula id="equWL1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="normal">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mo>|</mml:mo><mml:mi>D</mml:mi><mml:mo>|</mml:mo></mml:mrow></mml:mfrac><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>D</mml:mi></mml:mrow></mml:munder><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>x</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:mrow><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi 
mathvariant="normal">g</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x22C5;</mml:mo><mml:mi>f</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>k</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>&#x03BB;</mml:mi><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">g</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>which can be broken down into a consistency loss and an auxiliary loss. The consistency loss aims to ensure that the same label is assigned to a data point and its neighbors. Our model <inline-formula><mml:math id="ieqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="italic">f</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:math></inline-formula> computes a label for a given data point <inline-formula><mml:math id="ieqn4"><mml:mi>x</mml:mi></mml:math></inline-formula> from the dataset <italic>D</italic>, as well as for each data point <italic>k</italic> in the set of mined neighbors from <italic>x</italic> in <inline-formula><mml:math id="ieqn5"><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. 
To achieve this, we calculate the dot product between the output distribution (normalized by <inline-formula><mml:math id="ieqn6"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">f</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">m</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">x</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:math></inline-formula> of datapoint <inline-formula><mml:math id="ieqn7"><mml:mi>x</mml:mi></mml:math></inline-formula> and its neighbor <inline-formula><mml:math id="ieqn8"><mml:mi>k</mml:mi></mml:math></inline-formula>. The auxiliary loss is used to achieve regularization via entropy, scaled by a weight <inline-formula><mml:math id="ieqn9"><mml:mi mathvariant="normal">&#x03BB;</mml:mi></mml:math></inline-formula>. This loss encourages the model to distribute probability mass across all clusters <inline-formula><mml:math id="ieqn10"><mml:mi>C</mml:mi></mml:math></inline-formula>, where <inline-formula><mml:math id="ieqn11"><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> denotes the assigned probability of cluster <inline-formula><mml:math id="ieqn12"><mml:mi>i</mml:mi><mml:mi> </mml:mi></mml:math></inline-formula> in <inline-formula><mml:math id="ieqn13"><mml:mi>C</mml:mi></mml:math></inline-formula> by the model. Without this term, there could be a shortcut where all examples collapse into a single cluster. 
The entropy term ensures that the distribution of class labels resulting from applying DocSCAN tends to be roughly uniform.</p></sec><sec id="s2-4"><title>Supervised Classification</title><p>In the third part, we develop a supervised learning method, SBERT-MEC, which leverages the proposed multiencoder cascade (MEC) module for feature extraction, designed to enhance the classification of synthetic datasets through refined weak supervision. <xref ref-type="fig" rid="figure2">Figure 2</xref> shows the framework of SBERT-MEC and the modeling process. The first step of the SBERT-MEC is to input the sentence embeddings generated by SBERT into the MEC module, which includes 3 encoders: linear transformation, autoencoder model, and denoising autoencoder model. Formally, given a 768-dimensional sentence embedding <inline-formula><mml:math id="ieqn14"><mml:mi>V</mml:mi></mml:math></inline-formula>, the linear transformation <inline-formula><mml:math id="ieqn15"><mml:mi> </mml:mi><mml:mi>V</mml:mi></mml:math></inline-formula> transforms the sentence embedding <inline-formula><mml:math id="ieqn16"><mml:mi>V</mml:mi></mml:math></inline-formula> into <inline-formula><mml:math id="ieqn17"><mml:mi>W</mml:mi></mml:math></inline-formula> by adding a small random number <inline-formula><mml:math id="ieqn18"><mml:mi>n</mml:mi></mml:math></inline-formula>:</p><disp-formula id="equWL2"><mml:math id="eqn2"><mml:mi>W</mml:mi><mml:mo>=</mml:mo><mml:mi>L</mml:mi><mml:mfenced separators="|"><mml:mrow><mml:mi>V</mml:mi></mml:mrow></mml:mfenced><mml:mo>=</mml:mo><mml:mfenced 
separators="|"><mml:mrow><mml:msub><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>n</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>n</mml:mi><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mn>768</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:mfenced></mml:math></disp-formula><p>The autoencoder model can be represented as follows:</p><disp-formula id="equWL3"><mml:math id="eqn3"><mml:mi>W</mml:mi><mml:mi> </mml:mi><mml:mo>=</mml:mo><mml:mi> </mml:mi><mml:msub><mml:mrow><mml:mi>A</mml:mi><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mi>V</mml:mi><mml:mo>)</mml:mo></mml:math></disp-formula><p>where <inline-formula><mml:math id="ieqn19"><mml:mi mathvariant="normal">A</mml:mi><mml:mi mathvariant="normal">E</mml:mi></mml:math></inline-formula> is a fully connected network that maps <inline-formula><mml:math id="ieqn20"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="italic">V</mml:mi></mml:mrow><mml:mrow><mml:mrow/></mml:mrow></mml:mrow></mml:mrow></mml:mrow></mml:mstyle></mml:math></inline-formula> into a duplicate embedding <inline-formula><mml:math id="ieqn21"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="italic">W</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mrow/></mml:mrow></mml:mrow></mml:mrow></mml:mrow></mml:mstyle></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn22"><mml:mi>&#x03B8;</mml:mi></mml:math></inline-formula> is the parameters of <inline-formula><mml:math id="ieqn23"><mml:mi mathvariant="normal">A</mml:mi><mml:mi 
mathvariant="normal">E</mml:mi></mml:math></inline-formula>. The autoencoder is trained by duplicate pairs from the Quora dataset [<xref ref-type="bibr" rid="ref31">31</xref>]. The denoising autoencoder (<inline-formula><mml:math id="ieqn24"><mml:mi mathvariant="normal">D</mml:mi><mml:mi mathvariant="normal">A</mml:mi></mml:math></inline-formula>), in another way, adds Gaussian noise to the sentence embedding and forms a noisy embedding <inline-formula><mml:math id="ieqn25"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="italic">V</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="italic">N</mml:mi></mml:mrow></mml:mrow></mml:mrow></mml:msub></mml:mrow></mml:mstyle></mml:math></inline-formula>, and the training process forces output <inline-formula><mml:math id="ieqn26"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="italic">W</mml:mi></mml:mrow></mml:mrow></mml:mrow></mml:mstyle></mml:math></inline-formula> to be as close to the original embedding <inline-formula><mml:math id="ieqn27"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="italic">V</mml:mi></mml:mrow></mml:mrow></mml:mrow></mml:mstyle></mml:math></inline-formula> as possible:</p><disp-formula id="equWL4"><mml:math id="eqn4"><mml:msub><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>V</mml:mi><mml:mo>+</mml:mo><mml:mi>&#x03F5;</mml:mi><mml:mo>,</mml:mo><mml:mi> </mml:mi><mml:mi> </mml:mi><mml:mi>&#x03F5;</mml:mi><mml:mo>&#x223C;</mml:mo><mml:mi>N</mml:mi><mml:mfenced separators="|"><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mi> </mml:mi><mml:msup><mml:mrow><mml:mi>&#x03C3;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mi>I</mml:mi></mml:mrow></mml:mfenced></mml:math></disp-formula><disp-formula id="equWL5"><mml:math 
id="eqn5"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>W</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:msub><mml:mrow><mml:mi mathvariant="normal">D</mml:mi><mml:mi mathvariant="normal">A</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:math></disp-formula><p>To further enhance performance, 3 embeddings are input into the dynamic convolutional layer for mutual supervision. Specifically, for each embedding, the embeddings generated by the other 2 encoders are concatenated together and used as inputs to supervise this embedding. Finally, 3 output embeddings of the dynamic convolutional layers are concatenated with the original embedding and then put into a classifier to predict test data. This process is repeated for the 3 models fine-tuned in the first part respectively.</p><p>To summarize, first, we create synthetic datasets by different methods and leverage them to fine-tune sentence transformers. These datasets are small in size while effective in training classification tasks. Second, fine-tuned models are used on an unsupervised classification algorithm DocSCAN to analyze the performance of these datasets. Further, fine-tuned models are used on a weakly supervised algorithm SBERT-MEC proposed by us, which performs data augmentation on embeddings. This process demonstrates the performances of small synthetic datasets again, as well as the effectiveness of our algorithm.</p></sec><sec id="s2-5"><title>Ethical Considerations</title><p>This study did not involve human participants, animal subjects, or personally identifiable data. All data used in the research were either synthetic or publicly available, ensuring compliance with ethical guidelines. As such, no ethical approval or informed consent was required. 
The study adheres to principles of academic integrity and research transparency.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Dataset</title><p>For empirical evaluation, we crafted 3 datasets: 2 synthetic ones generated using GPT-3.5, and a pre-existing dataset. Each synthetic dataset comprises approximately 15,000 sentences pertinent to medical abstracts. These datasets underwent a meticulous generation process, including cleaning and pseudolabel assignment for sentences within each abstract section. Detailed methodology for creating the synthetic datasets is outlined in the section on Synthetic Dataset Generation. <xref ref-type="table" rid="table1">Table 1</xref> illustrates the distribution of sentences across various sections within each dataset.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Distribution of development datasets. Dataset #1 (translated PubMed dataset with labeled sentences), dataset #2 (GPT-3.5-generated abstracts with pseudolabels), and dataset #3 (paraphrased GPT-3.5-generated abstracts). 
Each dataset contains 15,000 sentences, divided into 4 sections: objective, methods, results, and conclusion.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset</td><td align="left" valign="bottom" colspan="4">Classes of sentences</td><td align="left" valign="bottom">Total</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Objective</td><td align="left" valign="bottom">Methods</td><td align="left" valign="bottom">Results</td><td align="left" valign="bottom">Conclusion</td><td align="left" valign="bottom"/></tr></thead><tbody><tr><td align="left" valign="top">Dataset #1</td><td align="left" valign="top">3750</td><td align="left" valign="top">3750</td><td align="left" valign="top">3750</td><td align="left" valign="top">3750</td><td align="left" valign="top">15,000</td></tr><tr><td align="left" valign="top">Dataset #2</td><td align="left" valign="top">3750</td><td align="left" valign="top">3750</td><td align="left" valign="top">3750</td><td align="left" valign="top">3750</td><td align="left" valign="top">15,000</td></tr><tr><td align="left" valign="top">Dataset #3</td><td align="left" valign="top">3537</td><td align="left" valign="top">4140</td><td align="left" valign="top">3826</td><td align="left" valign="top">3497</td><td align="left" valign="top">15,000</td></tr></tbody></table></table-wrap></sec><sec id="s3-2"><title>Baselines</title><p>We selected multiple algorithms of different kinds as baselines to demonstrate the performance of our algorithms. To begin with, we used classical non-BERT models. TextCNN is built upon the CNN paradigm, applying multiple convolutional filters and pooling operations to capture local features of varying lengths within the text. TextRNN [<xref ref-type="bibr" rid="ref32">32</xref>] leverages RNNs to model the sequential information in the text, capturing dependencies between words. 
TextRNN-Att [<xref ref-type="bibr" rid="ref33">33</xref>] extends TextRNN by incorporating attention mechanisms, enabling the model to focus on essential words. TextRCNN [<xref ref-type="bibr" rid="ref34">34</xref>] combines convolutional and RNNs, simultaneously considering word order and contextual information. DPCNN [<xref ref-type="bibr" rid="ref35">35</xref>] uses multiple layers of convolution and pooling operations to capture hierarchical text features. FastText [<xref ref-type="bibr" rid="ref36">36</xref>] is a simple and efficient text classification method based on the bag-of-words model, representing text at the word level.</p><p>Additionally, we used BERT-based methods. BERT is a transformer-based pretrained language model, capable of learning rich contextual representations. BERTCNN [<xref ref-type="bibr" rid="ref37">37</xref>] and BERT-RNN [<xref ref-type="bibr" rid="ref8">8</xref>] integrate CNNs and RNNs, respectively, on top of BERT, combining the contextual representations from BERT with the ability to extract local features. BERT-RCNN [<xref ref-type="bibr" rid="ref38">38</xref>] combines BERT with both convolutional and RNNs, simultaneously considering word order and contextual information. BERT-DPCNN [<xref ref-type="bibr" rid="ref39">39</xref>] incorporates a deep pyramid CNN on top of BERT, combining the contextual representations from BERT with multilevel feature extraction capabilities. ERNIE [<xref ref-type="bibr" rid="ref40">40</xref>] is another transformer-based pretrained language model, built upon BERT with further improvements and optimizations, with better performance in Chinese contexts.</p><p>In our ablation study, we conducted an experiment using the k-means algorithm to cluster SBERT embeddings for comparative analysis with DocSCAN. Additionally, we examined the impact of omitting the dynamic convolutional layer in the MEC module. 
Subsequent experiments involving BERT serve as an extension of this analysis, further exploring the effects of removing 3 encoders within MEC.</p></sec><sec id="s3-3"><title>Experiment Settings</title><p>In this study, we used the PyTorch framework and Python for all experiments. Data processing and model training were performed on an NVIDIA GeForce RTX 3090 (24G) GPU and an Intel(R) Core(TM) i9-10900K CPU @3.70GHz. Experimental parameters are detailed in <xref ref-type="table" rid="table2">Table 2</xref>. To mitigate overfitting, a dropout rate of 0.5 was used, randomly deactivating 50% of neuron connections during training. We conducted training over 30 epochs using either the Adam or BertAdam optimization algorithms, with a learning rate set at 5e-5. Training batches comprised 64 samples each, and the RNN featured a hidden size of 256. The CNN component used filter sizes of (2, 3, 4), with a total of 256 filters. Input sequence length was capped at 64 tokens, and the feature dimension was maintained at 768. 
Pretrained models used in the experiments included BERT (bert-base-chinese), ERNIE (ernie-3.0-base-zh), and SBERT (sbert-chinese-general-v2), which were critical to the study&#x2019;s execution and result analysis.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Settings for training models.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Parameter</td><td align="left" valign="bottom">Setting</td></tr></thead><tbody><tr><td align="left" valign="top">Dropout</td><td align="left" valign="top">0.5</td></tr><tr><td align="left" valign="top">Epoch number</td><td align="left" valign="top">30</td></tr><tr><td align="left" valign="top">Optimizer</td><td align="left" valign="top">Adam/BertAdam</td></tr><tr><td align="left" valign="top">Learning rate</td><td align="left" valign="top"><inline-formula><mml:math id="ieqn28"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mn>5</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>5</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mstyle></mml:math></inline-formula></td></tr><tr><td align="left" valign="top">Batch size</td><td align="left" valign="top">64</td></tr><tr><td align="left" valign="top">RNN<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> hidden size</td><td align="left" valign="top">256</td></tr><tr><td align="left" valign="top">CNN<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> filter number</td><td align="left" valign="top">256</td></tr><tr><td align="left" valign="top">Max length</td><td align="left" valign="top">64</td></tr><tr><td align="left" valign="top">Feature dimension</td><td align="left" valign="top">768</td></tr><tr><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">Bert-base-chinese</td></tr><tr><td align="left" valign="top">ERNIE</td><td align="left" 
valign="top">Ernie-3.0-base-zh</td></tr><tr><td align="left" valign="top">SBERT</td><td align="left" valign="top">Sbert-chinese-general-v2</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>RNN: recurrent neural network.</p></fn><fn id="table2fn2"><p><sup>b</sup>CNN: convolutional neural network.</p></fn><fn id="table2fn3"><p><sup>c</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Evaluation Metrics</title><p>The experiments used accuracy and <italic>F</italic><sub>1</sub>-score to evaluate the model performance. The formulas for each index are as follows:</p><disp-formula id="equWL6"><mml:math id="eqn6"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mi mathvariant="normal">A</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">y</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">N</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">N</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">P</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">N</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where TP (true positive) corresponds to the number of instances correctly predicted as positive; TN (true negative) corresponds to the number of instances correctly predicted as 
negative; FP (false positive) corresponds to the number of instances incorrectly predicted as positive; FN (false negative) corresponds to the number of instances incorrectly predicted as negative.</p><disp-formula id="equWL7"><mml:math id="eqn7"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="equWL8"><mml:math id="eqn8"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">l</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">P</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">N</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="equWL9"><mml:math id="eqn9"><mml:mstyle displaystyle="true" 
scriptlevel="0"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mtext>-</mml:mtext><mml:mrow><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">l</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">l</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><p>In the above formulations, the <inline-formula><mml:math id="ieqn29"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mi mathvariant="normal">P</mml:mi><mml:mi 
mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi></mml:mrow></mml:mrow></mml:mstyle></mml:math></inline-formula> measures the proportion of correctly predicted positive instances out of the total instances predicted as positive and <inline-formula><mml:math id="ieqn30"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">l</mml:mi></mml:mrow></mml:mrow></mml:mstyle></mml:math></inline-formula> measures the proportion of correctly predicted positive instances out of the total actual positive instances. <inline-formula><mml:math id="ieqn31"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mtext>-</mml:mtext><mml:mrow><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi></mml:mrow></mml:mrow></mml:mstyle></mml:math></inline-formula> is the harmonic mean of precision and recall, providing a single value that considers both metrics. This technique is highly effective for imbalanced datasets, where disparities in class sizes can skew model performance. It helps ensure fair representation of all classes, enhancing model reliability.</p></sec><sec id="s3-5"><title>Model Performance</title><p><xref ref-type="table" rid="table3">Table 3</xref> shows the performance of various algorithms across different training datasets when evaluated on the test dataset. 
Notably, when trained on dataset #1, the SBERT-DocSCAN algorithm emerges as the leading performer, securing an accuracy and <italic>F</italic><sub>1</sub>-score of 0.8985 on the test dataset. This standout performance highlights the algorithm&#x2019;s capability to classify medical domain data with high precision. Additionally, the SBERT-MEC algorithm also displays comparable performance on the same dataset, with an accuracy and <italic>F</italic><sub>1</sub>-score of 0.8938, making it the second most effective algorithm in our evaluation. For dataset #2, the SBERT-DocSCAN and SBERT-MEC algorithms again demonstrate superior performance. SBERT-DocSCAN leads with exceptional accuracy and <italic>F</italic><sub>1</sub>-score of 0.8983, reinforcing its effectiveness in managing generated data. Meanwhile, SBERT-MEC remains a strong contender with an accuracy and <italic>F</italic><sub>1</sub>-score of 0.8673 and 0.8651, respectively, marking it as the second most proficient algorithm for this dataset. Furthermore, considering dataset #3, SBERT-DocSCAN and SBERT-MEC algorithms consistently demonstrate outstanding performance. SBERT-DocSCAN achieves the highest accuracy and <italic>F</italic><sub>1</sub>-score of 0.9130, affirming its effectiveness in handling paraphrased data. Similarly, the SBERT-MEC algorithm achieves the second-highest accuracy and <italic>F</italic><sub>1</sub>-score of 0.9039 and 0.9035, respectively, highlighting its competence in dealing with paraphrased data. Additionally, <xref ref-type="table" rid="table4">Table 4</xref> illustrates the confusion matrices of the SBERT-DocSCAN model on the test dataset using different training datasets.</p><p>In summary, the comprehensive analysis of the table elucidates that the SBERT-DocSCAN and SBERT-MEC algorithms consistently outperform other algorithms across multiple datasets. 
The superior performance of these algorithms in terms of accuracy and F1 scores underscores their significance and efficacy in the domain of text classification.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Comparative performance of various models on the test dataset<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Method</td><td align="left" valign="bottom" colspan="2">Training on dataset #1</td><td align="left" valign="bottom" colspan="2">Training on dataset #2</td><td align="left" valign="bottom" colspan="2">Training on dataset #3</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">TextCNN</td><td align="char" char="." valign="top">0.6122</td><td align="char" char="." valign="top">0.6074</td><td align="char" char="." valign="top">0.7843</td><td align="char" char="." valign="top">0.7795</td><td align="char" char="." valign="top">0.6789</td><td align="char" char="." valign="top">0.6755</td></tr><tr><td align="left" valign="top">TextRNN</td><td align="char" char="." valign="top">0.5045</td><td align="char" char="." valign="top">0.5413</td><td align="char" char="." valign="top">0.7315</td><td align="char" char="." valign="top">0.7321</td><td align="char" char="." valign="top">0.7295</td><td align="char" char="." valign="top">0.7239</td></tr><tr><td align="left" valign="top">TextRNN-Att</td><td align="char" char="." valign="top">0.6439</td><td align="char" char="." valign="top">0.6421</td><td align="char" char="." 
valign="top">0.7114</td><td align="char" char="." valign="top">0.6955</td><td align="char" char="." valign="top">0.7521</td><td align="char" char="." valign="top">0.7521</td></tr><tr><td align="left" valign="top">TextRCNN</td><td align="char" char="." valign="top">0.6562</td><td align="char" char="." valign="top">0.6543</td><td align="char" char="." valign="top">0.7642</td><td align="char" char="." valign="top">0.7643</td><td align="char" char="." valign="top">0.7439</td><td align="char" char="." valign="top">0.7392</td></tr><tr><td align="left" valign="top">DPCNN</td><td align="char" char="." valign="top">0.6953</td><td align="char" char="." valign="top">0.6967</td><td align="char" char="." valign="top">0.6947</td><td align="char" char="." valign="top">0.6988</td><td align="char" char="." valign="top">0.6758</td><td align="char" char="." valign="top">0.6759</td></tr><tr><td align="left" valign="top">FastText</td><td align="char" char="." valign="top">0.6562</td><td align="char" char="." valign="top">0.6543</td><td align="char" char="." valign="top">0.7208</td><td align="char" char="." valign="top">0.7101</td><td align="char" char="." valign="top">0.7391</td><td align="char" char="." valign="top">0.7371</td></tr><tr><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="char" char="." valign="top">0.7842</td><td align="char" char="." valign="top">0.7857</td><td align="char" char="." valign="top">0.8426</td><td align="char" char="." valign="top">0.8400</td><td align="char" char="." valign="top">0.7348</td><td align="char" char="." valign="top">0.7083</td></tr><tr><td align="left" valign="top">BERT-CNN</td><td align="char" char="." valign="top">0.7338</td><td align="char" char="." valign="top">0.7340</td><td align="char" char="." valign="top">0.8242</td><td align="char" char="." valign="top">0.8213</td><td align="char" char="." valign="top">0.7181</td><td align="char" char="." 
valign="top">0.7136</td></tr><tr><td align="left" valign="top">BERT-RNN</td><td align="char" char="." valign="top">0.8548</td><td align="char" char="." valign="top">0.8546</td><td align="char" char="." valign="top">0.8536</td><td align="char" char="." valign="top">0.8532</td><td align="char" char="." valign="top">0.7955</td><td align="char" char="." valign="top">0.7882</td></tr><tr><td align="left" valign="top">BERT-RCNN</td><td align="char" char="." valign="top">0.8245</td><td align="char" char="." valign="top">0.8253</td><td align="char" char="." valign="top">0.8313</td><td align="char" char="." valign="top">0.8252</td><td align="char" char="." valign="top">0.8364</td><td align="char" char="." valign="top">0.8332</td></tr><tr><td align="left" valign="top">BERT-DPCNN</td><td align="char" char="." valign="top">0.7839</td><td align="char" char="." valign="top">0.7850</td><td align="char" char="." valign="top">0.8186</td><td align="char" char="." valign="top">0.8186</td><td align="char" char="." valign="top">0.8301</td><td align="char" char="." valign="top">0.8304</td></tr><tr><td align="left" valign="top">ERNIE</td><td align="char" char="." valign="top">0.8801</td><td align="char" char="." valign="top">0.8808</td><td align="char" char="." valign="top">0.8681</td><td align="char" char="." valign="top">0.8675</td><td align="char" char="." valign="top">0.8895</td><td align="char" char="." valign="top">0.8882</td></tr><tr><td align="left" valign="top">SBERT-Kmeans</td><td align="char" char="." valign="top">0.8875</td><td align="char" char="." valign="top">0.8875</td><td align="char" char="." valign="top">0.8709</td><td align="char" char="." valign="top">0.8709</td><td align="char" char="." valign="top">0.8788</td><td align="char" char="." valign="top">0.8788</td></tr><tr><td align="left" valign="top">SBERT-DocSCAN</td><td align="char" char="." valign="top">0.8985</td><td align="char" char="." valign="top">0.8985</td><td align="char" char="." 
valign="top">0.8983</td><td align="char" char="." valign="top">0.8983</td><td align="char" char="." valign="top">0.9130</td><td align="char" char="." valign="top">0.9130</td></tr><tr><td align="left" valign="top">SBERT-MEC</td><td align="char" char="." valign="top">0.8938</td><td align="char" char="." valign="top">0.8938</td><td align="char" char="." valign="top">0.8673</td><td align="char" char="." valign="top">0.8651</td><td align="char" char="." valign="top">0.9039</td><td align="char" char="." valign="top">0.9035</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>It includes traditional models, BERT-based improved models, and the model proposed in this paper. These models were trained on 3 distinct datasets and subsequently evaluated on the test dataset in terms of accuracy and <italic>F</italic><sub>1</sub>-score.</p></fn><fn id="table3fn2"><p><sup>b</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Confusion matrices for the SBERT-DocSCAN model on the test dataset<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Label or cluster</td><td align="left" valign="top">0</td><td align="left" valign="top">1</td><td align="left" valign="top">2</td><td align="left" valign="top">3</td><td align="left" valign="top">Total</td><td align="left" valign="top">Recall</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="7">Model training on dataset #1</td></tr><tr><td align="left" valign="top">&#x2003;Objective</td><td align="left" valign="top">20,302</td><td align="left" valign="top">1304</td><td align="left" valign="top">25</td><td align="left" valign="top">490</td><td align="left" valign="top">22,051</td><td align="left" valign="top">0.9175</td></tr><tr><td align="left" 
valign="top">&#x2003;Methods</td><td align="left" valign="top">1533</td><td align="left" valign="top">19,825</td><td align="left" valign="top">448</td><td align="left" valign="top">137</td><td align="left" valign="top">21,943</td><td align="left" valign="top">0.9035</td></tr><tr><td align="left" valign="top">&#x2003;Results</td><td align="left" valign="top">179</td><td align="left" valign="top">901</td><td align="left" valign="top">18,676</td><td align="left" valign="top">2075</td><td align="left" valign="top">21,831</td><td align="left" valign="top">0.8555</td></tr><tr><td align="left" valign="top">&#x2003;Conclusions</td><td align="left" valign="top">649</td><td align="left" valign="top">280</td><td align="left" valign="top">861</td><td align="left" valign="top">19,914</td><td align="left" valign="top">21,704</td><td align="left" valign="top">0.9175</td></tr><tr><td align="left" valign="top">&#x2003;Total</td><td align="left" valign="top">22,593</td><td align="left" valign="top">22,310</td><td align="left" valign="top">20,010</td><td align="left" valign="top">22,616</td><td align="left" valign="top">78,647<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">0.8985</td></tr><tr><td align="left" valign="top">&#x2003;Precision</td><td align="left" valign="top">0.8595</td><td align="left" valign="top">0.8886</td><td align="left" valign="top">0.9333</td><td align="left" valign="top">0.8805</td><td align="left" valign="top">0.8985</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td></tr><tr><td align="left" valign="top" colspan="7">Model training on dataset #2</td></tr><tr><td align="left" valign="top">&#x2003;Objective</td><td align="left" valign="top">20,250</td><td align="left" valign="top">1247</td><td align="left" valign="top">24</td><td align="left" valign="top">530</td><td align="left" valign="top">22,051</td><td align="left" valign="top">0.9183</td></tr><tr><td 
align="left" valign="top">&#x2003;Methods</td><td align="left" valign="top">705</td><td align="left" valign="top">20,496</td><td align="left" valign="top">397</td><td align="left" valign="top">345</td><td align="left" valign="top">21,943</td><td align="left" valign="top">0.934</td></tr><tr><td align="left" valign="top">&#x2003;Results</td><td align="left" valign="top">210</td><td align="left" valign="top">626</td><td align="left" valign="top">18,005</td><td align="left" valign="top">2990</td><td align="left" valign="top">21,831</td><td align="left" valign="top">0.8247</td></tr><tr><td align="left" valign="top">&#x2003;Conclusions</td><td align="left" valign="top">102</td><td align="left" valign="top">141</td><td align="left" valign="top">1585</td><td align="left" valign="top">19,876</td><td align="left" valign="top">21,704</td><td align="left" valign="top">0.8983</td></tr><tr><td align="left" valign="top">&#x2003;Total</td><td align="left" valign="top">21,267</td><td align="left" valign="top">22,510</td><td align="left" valign="top">20,011</td><td align="left" valign="top">23,741</td><td align="left" valign="top">78,647<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">0.8983</td></tr><tr><td align="left" valign="top">&#x2003;Precision</td><td align="left" valign="top">0.9522</td><td align="left" valign="top">0.9105</td><td align="left" valign="top">0.8998</td><td align="left" valign="top">0.8372</td><td align="left" valign="top">0.8983</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="7">Model training on dataset #3</td></tr><tr><td align="left" valign="top">&#x2003;Objective</td><td align="left" valign="top">20,401</td><td align="left" valign="top">251</td><td align="left" valign="top">129</td><td align="left" valign="top">1447</td><td align="left" valign="top">22,051</td><td align="left" valign="top">0.9178</td></tr><tr><td align="left" valign="top">&#x2003;Methods</td><td 
align="left" valign="top">505</td><td align="left" valign="top">19,673</td><td align="left" valign="top">1325</td><td align="left" valign="top">29</td><td align="left" valign="top">21,943</td><td align="left" valign="top">0.9137</td></tr><tr><td align="left" valign="top">&#x2003;Results</td><td align="left" valign="top">288</td><td align="left" valign="top">1655</td><td align="left" valign="top">19,864</td><td align="left" valign="top">593</td><td align="left" valign="top">21,831</td><td align="left" valign="top">0.8868</td></tr><tr><td align="left" valign="top">&#x2003;Conclusions</td><td align="left" valign="top">749</td><td align="left" valign="top">252</td><td align="left" valign="top">386</td><td align="left" valign="top">19,982</td><td align="left" valign="top">21,704</td><td align="left" valign="top">0.9131</td></tr><tr><td align="left" valign="top">&#x2003;Total</td><td align="left" valign="top">21,943</td><td align="left" valign="top">21,831</td><td align="left" valign="top">21,704</td><td align="left" valign="top">22,051</td><td align="left" valign="top">78,647<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">0.913</td></tr><tr><td align="left" valign="top">&#x2003;Precision</td><td align="left" valign="top">0.9297</td><td align="left" valign="top">0.9012</td><td align="left" valign="top">0.9152</td><td align="left" valign="top">0.9062</td><td align="left" valign="top">0.913</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>This table displays the confusion matrices for the SBERT-DocSCAN model, which was trained separately on 3 distinct training datasets. 
The matrices detail the accuracy and recall for each category when evaluated on the test dataset.</p></fn><fn id="table4fn2"><p><sup>b</sup>Out of 87,529.</p></fn><fn id="table4fn3"><p><sup>c</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-6"><title>Ablation Study</title><p>To assess the impact of various submodules, we conducted ablation studies. We began by training 2 clustering models with identical structures (SBERT-DocSCAN). The key distinction between them was that 1 model was fine-tuned on our datasets, while the other was not. In addition, we trained 3 supervised models on different datasets and evaluated their performance on the test dataset. Notably, 1 of these models was the SBERT-MEC, which lacked the proposed dynamic convolution (DC) module. The results, presented in <xref ref-type="table" rid="table5">Table 5</xref>, clearly demonstrate that the SBERT-DocSCAN method, when fine-tuned with SBERT, outperforms others in terms of efficiency. Furthermore, within the supervised learning category, the SBERT-MEC model equipped with the DC module surpassed those lacking this module, underlining the value of the DC module in enhancing model performance.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Performance of ablation model on test dataset<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup>.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Method</td><td align="left" valign="bottom" colspan="2">Training on dataset #1</td><td align="left" valign="bottom" colspan="2">Training on dataset #2</td><td align="left" valign="bottom" colspan="2">Training on dataset #3</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Accuracy</td><td align="left" 
valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">SBERT-DocSCAN (without finetune)</td><td align="char" char="." valign="top">0.3891</td><td align="char" char="." valign="top">0.3895</td><td align="char" char="." valign="top">0.3891</td><td align="char" char="." valign="top">0.3895</td><td align="char" char="." valign="top">0.3891</td><td align="char" char="." valign="top">0.3895</td></tr><tr><td align="left" valign="top">SBERT-DocSCAN (with finetune)</td><td align="char" char="." valign="top">0.8985</td><td align="char" char="." valign="top">0.8985</td><td align="char" char="." valign="top">0.8983</td><td align="char" char="." valign="top">0.8983</td><td align="char" char="." valign="top">0.9130</td><td align="char" char="." valign="top">0.9130</td></tr><tr><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="char" char="." valign="top">0.8330</td><td align="char" char="." valign="top">0.8324</td><td align="char" char="." valign="top">0.8424</td><td align="char" char="." valign="top">0.8421</td><td align="char" char="." valign="top">0.8406</td><td align="char" char="." valign="top">0.8397</td></tr><tr><td align="left" valign="top">SBERT-MEC<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup> (without DC)<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="char" char="." valign="top">0.8714</td><td align="char" char="." valign="top">0.8717</td><td align="char" char="." valign="top">0.8390</td><td align="char" char="." valign="top">0.8332</td><td align="char" char="." valign="top">0.8736</td><td align="char" char="." valign="top">0.8735</td></tr><tr><td align="left" valign="top">SBERT-MEC (with DC)</td><td align="char" char="." valign="top">0.8938</td><td align="char" char="." 
valign="top">0.8938</td><td align="char" char="." valign="top">0.8673</td><td align="char" char="." valign="top">0.8651</td><td align="char" char="." valign="top">0.9039</td><td align="char" char="." valign="top">0.9035</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>This table compares the performance of 2 SBERT-DocSCAN clustering models&#x2014;one fine-tuned on our datasets and the other not&#x2014;and 3 supervised models, including an SBERT-MEC model without the dynamic convolution module. All models were evaluated on the test dataset.</p></fn><fn id="table5fn2"><p><sup>b</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table5fn3"><p><sup>c</sup>MEC: multiencoder cascade.</p></fn><fn id="table5fn4"><p><sup>d</sup>DC: dynamic convolution.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study addresses the critical lack of real-world datasets for Chinese medical abstract classification by leveraging GPT-3.5 to generate synthetic datasets and developing models tailored for this task. Our findings confirm that synthetic datasets, when carefully designed, can match or surpass the performance of manually labeled datasets in sentence-level classification tasks. The SBERT-DocSCAN and SBERT-MEC models developed in this study demonstrate the potential of clustering and supervised approaches to effectively classify Chinese medical abstracts, illustrating the profound impact of synthetic datasets on enhancing NLP tasks.</p><p>Classical non-BERT models, such as TextCNN [<xref ref-type="bibr" rid="ref7">7</xref>], TextRNN [<xref ref-type="bibr" rid="ref32">32</xref>], and TextRCNN [<xref ref-type="bibr" rid="ref34">34</xref>], exhibit strong performance in capturing local or sequential features. However, these models lack the deep contextual understanding provided by transformer-based models. 
While FastText [<xref ref-type="bibr" rid="ref36">36</xref>] offers a lightweight and efficient alternative, its bag-of-words approach limits its ability to capture complex semantic relationships. These limitations underscore the advantages of leveraging pretrained language models in tasks that demand rich contextual understanding.</p><p>BERT-based models significantly improve performance by providing deep contextual representations. The success of models like BERT-CNN [<xref ref-type="bibr" rid="ref37">37</xref>] and BERT-RCNN [<xref ref-type="bibr" rid="ref38">38</xref>] aligns with prior studies, which highlight the effectiveness of combining BERT embeddings with convolutional and recurrent structures for enhanced local and sequential feature extraction. However, our proposed SBERT-DocSCAN and SBERT-MEC models outperform these baselines, indicating the added value of advanced clustering methods and DC layers. Specifically, DocSCAN [<xref ref-type="bibr" rid="ref22">22</xref>], with its graph-based clustering approach, demonstrates superior clustering quality compared to k-means.</p><p>The ablation study further emphasizes the contributions of individual components in our models. The removal of the dynamic convolutional layer in the MEC module resulted in a notable decline in performance, highlighting its role in refining language representations. Similarly, the omission of the 3 encoders within MEC led to a significant reduction in accuracy, underscoring the importance of multiencoder architecture in capturing diverse linguistic features. 
These findings align with the broader NLP literature [<xref ref-type="bibr" rid="ref39">39</xref>], which emphasizes the benefits of combining multiple feature extraction techniques for enhanced model performance.</p><p>Our exploration of ERNIE [<xref ref-type="bibr" rid="ref40">40</xref>], a BERT-based model optimized for Chinese contexts, further validates the importance of leveraging models tailored to specific languages and domains. While ERNIE offers improvements over standard BERT in Chinese text classification, it does not surpass the performance of our SBERT-based approaches. This suggests that task-specific architectural innovations, such as the integration of clustering methods and DC modules, can provide greater benefits than domain-specific pretraining alone.</p><p>Overall, these comparisons and ablation studies highlight the robustness and versatility of our proposed methods, providing valuable insights into the design of models for sentence-level text classification tasks. By demonstrating the effectiveness of integrating advanced clustering methods, DC layers, and multiencoder architectures, this study contributes to the growing body of research focused on optimizing transformer-based models for real-world applications.</p></sec><sec id="s4-2"><title>Limitations</title><p>While promising, our proposed methods have 2 main limitations. First, the quality of synthetic data heavily relies on the GPT-3.5 model, with performance contingent on effective prompt design. Refining prompt engineering strategies will be essential for future improvements. Second, the 2-stage approach used in both models results in an increased parameter size, raising concerns about computational resource efficiency. 
Further optimization is necessary to address these scalability challenges, especially in real-world applications where resources may be limited.</p></sec><sec id="s4-3"><title>Conclusions</title><p>This study demonstrates the significant potential of synthetic datasets, generated using GPT-3.5, in addressing the scarcity of labeled datasets for Chinese medical abstract classification. Our findings reveal that compact synthetic datasets can achieve performance comparable to, and in some cases surpass, that of manually labeled datasets. The proposed SBERT-DocSCAN and SBERT-MEC models further highlight the benefits of combining advanced clustering techniques, multiencoder architectures, and DC modules, showcasing their ability to enhance sentence-level classification tasks in specialized domains. These contributions provide valuable insights for leveraging generative artificial intelligence in NLP applications.</p><p>Beyond this specific task, this work underscores the transformative potential of synthetic datasets in reducing reliance on costly manual labeling, enabling broader adoption of NLP technologies in resource-limited fields. Future research can expand upon these findings by exploring more sophisticated data generation strategies, optimizing model architectures for efficiency, and fostering interdisciplinary collaborations to develop tailored solutions for complex, real-world challenges. 
By bridging gaps in data availability, this study provides a foundation for advancing NLP capabilities in medical and other specialized domains.</p></sec></sec></body><back><ack><p>This research was supported by Shanghai Science and Technology Commission Research Project 24YF2731300, 23692107300, and 202121511101200, Postdoctoral Fellowship Program of CPSF under grant GZC20231604, the Xiangfu Lab Youth Program XF052024B0100, and Shanghai Municipal Commission of Economy and Information Technology (SHEITC) Project 2024-GZL-RGZN-01020.</p></ack><notes><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BERT</term><def><p>Bidirectional Encoder Representations from Transformers</p></def></def-item><def-item><term id="abb2">CNN</term><def><p>convolutional neural network</p></def></def-item><def-item><term id="abb3">DC</term><def><p>dynamic convolution</p></def></def-item><def-item><term id="abb4">GAN</term><def><p>generative adversarial network</p></def></def-item><def-item><term id="abb5">MEC</term><def><p>multiencoder cascade</p></def></def-item><def-item><term id="abb6">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb7">RNN</term><def><p>recurrent neural network</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Vaswani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shazeer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Parmar</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Attention is all 
you need</article-title><conf-name>31st International Conference on Neural Information Processing Systems (NeurIPS)</conf-name><conf-date>Dec 4-9, 2017</conf-date><conf-loc>Long Beach, CA, United States</conf-loc><pub-id pub-id-type="doi">10.5555/3295222.3295349</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><conf-name>2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name><conf-date>Jun 2-7, 2019</conf-date><conf-loc>Minneapolis, MN, United States</conf-loc><pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Narasimhan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Salimans</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name></person-group><article-title>Improving language understanding by generative pre-training</article-title><source>OpenAI</source><year>2020</year><access-date>2020-09-25</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf">https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Dai</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Carbonell</surname><given-names>J</given-names> </name><name name-style="western"><surname>Le</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Salakhutdinov</surname><given-names>R</given-names> </name></person-group><article-title>Transformer-XL: attentive language models beyond a fixed-length context</article-title><conf-name>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>Jul 28 to Aug 2, 2019</conf-date><conf-loc>Florence, Italy</conf-loc><pub-id pub-id-type="doi">10.18653/v1/P19-1285</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Child</surname><given-names>R</given-names> </name><name name-style="western"><surname>Luan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Amodei</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name></person-group><article-title>Language models are unsupervised multitask 
learners</article-title><source>OpenAI</source><access-date>2025-03-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf">https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf</ext-link></comment></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zellers</surname><given-names>R</given-names> </name><name name-style="western"><surname>Holtzman</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rashkin</surname><given-names>H</given-names> </name><name name-style="western"><surname>Bisk</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Farhadi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Roesner</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Defending against neural fake news</article-title><conf-name>33rd International Conference on Neural Information Processing Systems (NeurIPS)</conf-name><conf-date>Dec 8-14, 2019</conf-date><conf-loc>Vancouver, BC, Canada</conf-loc><pub-id pub-id-type="doi">10.5555/3454287.3455099</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name></person-group><article-title>Convolutional neural networks for sentence classification</article-title><conf-name>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name><conf-date>Oct 25-29, 2014</conf-date><conf-loc>Doha, Qatar</conf-loc><pub-id pub-id-type="doi">10.3115/v1/D14-1181</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation 
citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Dyer</surname><given-names>C</given-names> </name><name name-style="western"><surname>He</surname><given-names>X</given-names> </name><name name-style="western"><surname>Smola</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hovy</surname><given-names>E</given-names> </name></person-group><article-title>Hierarchical attention networks for document classification</article-title><conf-name>Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics</conf-name><conf-date>Jun 12-17, 2016</conf-date><conf-loc>San Diego, CA, United States</conf-loc><pub-id pub-id-type="doi">10.18653/v1/N16-1174</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Conneau</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kiela</surname><given-names>D</given-names> </name><name name-style="western"><surname>Schwenk</surname><given-names>H</given-names> </name><name name-style="western"><surname>Barrault</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bordes</surname><given-names>A</given-names> </name></person-group><article-title>Supervised learning of universal sentence representations from natural language inference data</article-title><conf-date>Sep 9-11, 2017</conf-date><conf-loc>Copenhagen, Denmark</conf-loc><pub-id pub-id-type="doi">10.18653/v1/D17-1070</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name 
name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ott</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>N</given-names> </name><name name-style="western"><surname>Du</surname><given-names>J</given-names> </name><name name-style="western"><surname>Joshi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>D</given-names> </name><etal/></person-group><article-title>RoBERTa: a robustly optimized BERT pretraining approach</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 26, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1907.11692</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Dai</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Carbonell</surname><given-names>J</given-names> </name><name name-style="western"><surname>Salakhutdinov</surname><given-names>RR</given-names> </name><name name-style="western"><surname>Le</surname><given-names>QV</given-names> </name></person-group><article-title>XLNet: generalized autoregressive pretraining for language understanding</article-title><conf-name>33rd International Conference on Neural Information Processing Systems (NeurIPS)</conf-name><conf-date>Dec 8-14, 2019</conf-date><conf-loc>Vancouver, BC, Canada</conf-loc><pub-id pub-id-type="doi">10.5555/3454287.3454804</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Lan</surname><given-names>Z</given-names> 
</name><name name-style="western"><surname>Chen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goodman</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gimpel</surname><given-names>K</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>P</given-names> </name><name name-style="western"><surname>Soricut</surname><given-names>R</given-names> </name></person-group><article-title>Albert: a lite BERT for self-supervised learning of language representations</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 26, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1909.11942</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Peters</surname><given-names>M</given-names> </name><name name-style="western"><surname>Neumann</surname><given-names>M</given-names> </name><name name-style="western"><surname>Iyyer</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Deep contextualized word representations</article-title><conf-name>2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name><conf-date>Jun 1-6, 2018</conf-date><conf-loc>New Orleans, LA, United States</conf-loc><pub-id pub-id-type="doi">10.18653/v1/N18-1202</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>Ward Jr JH</collab></person-group><article-title>Hierarchical grouping to optimize an objective function</article-title><source>J Am Stat Assoc</source><year>1963</year><month>03</month><volume>58</volume><issue>301</issue><fpage>236</fpage><lpage>244</lpage><pub-id pub-id-type="doi">10.1080/01621459.1963.10500845</pub-id></nlm-citation></ref><ref 
id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>MacQueen</surname><given-names>J</given-names> </name></person-group><article-title>Some methods for classification and analysis of multivariate observations</article-title><conf-name>5th Berkeley Symposium on Mathematical Statistics and Probability</conf-name><conf-date>Dec 27, 1965 to Jan 7, 1966</conf-date><conf-loc>Oakland, CA, United States</conf-loc></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>DD</given-names> </name><name name-style="western"><surname>Seung</surname><given-names>HS</given-names> </name></person-group><article-title>Learning the parts of objects by non-negative matrix factorization</article-title><source>Nature</source><year>1999</year><month>10</month><day>21</day><volume>401</volume><issue>6755</issue><fpage>788</fpage><lpage>791</lpage><pub-id pub-id-type="doi">10.1038/44565</pub-id><pub-id pub-id-type="medline">10548103</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blei</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>AY</given-names> </name><name name-style="western"><surname>Jordan</surname><given-names>MI</given-names> </name></person-group><article-title>Latent Dirichlet allocation</article-title><source>J Mach Learn Res</source><year>2003</year><month>03</month><day>1</day><volume>3</volume><fpage>993</fpage><lpage>1022</lpage><pub-id pub-id-type="doi">10.5555/944919.944937</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Le</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Mikolov</surname><given-names>T</given-names> </name></person-group><article-title>Distributed representations of sentences and documents</article-title><conf-name>31st International Conference on Machine Learning (ICML)</conf-name><conf-date>Jun 21-25, 2014</conf-date><conf-loc>Beijing, China</conf-loc><pub-id pub-id-type="doi">10.5555/3044805.3045025</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Cer</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kong</surname><given-names>S yi</given-names> </name><etal/></person-group><article-title>Universal sentence encoder for English</article-title><conf-name>Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</conf-name><conf-date>Oct 31 to Nov 4, 2018</conf-date><conf-loc>Brussels, Belgium</conf-loc><pub-id pub-id-type="doi">10.18653/v1/D18-2029</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>H</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>Y</given-names> </name></person-group><article-title>BERT-EMD: many-to-many layer mapping for BERT compression with 
earth mover&#x2019;s distance</article-title><conf-name>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name><conf-date>Nov 16-20, 2020</conf-date><pub-id pub-id-type="doi">10.18653/v1/2020.emnlp-main.242</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Reimers</surname><given-names>N</given-names> </name><name name-style="western"><surname>Gurevych</surname><given-names>I</given-names> </name></person-group><article-title>Sentence-bert: sentence embeddings using siamese bert-networks</article-title><conf-name>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)</conf-name><conf-date>Nov 3-7, 2019</conf-date><conf-loc>Hong Kong, China</conf-loc><pub-id pub-id-type="doi">10.18653/v1/D19-1410</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Stammbach</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ash</surname><given-names>E</given-names> </name></person-group><article-title>DocSCAN: unsupervised text classification via learning from neighbors</article-title><access-date>2025-03-03</access-date><conf-name>Proceedings of the 18th Conference on Natural Language Processing (KONVENS 2022)</conf-name><conf-date>Sep 12-15, 2022</conf-date><conf-loc>Potsdam, Germany</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2022.konvens-1.4">https://aclanthology.org/2022.konvens-1.4</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Xu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>AttnGAN: fine-grained text to image generation with attentional generative adversarial networks</article-title><conf-name>2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jun 18-23, 2018</conf-date><conf-loc>Salt Lake City, UT, United States</conf-loc><pub-id pub-id-type="doi">10.1109/CVPR.2018.00143</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Wan</surname><given-names>X</given-names> </name></person-group><article-title>SentiGAN: generating sentimental texts via mixture adversarial networks</article-title><conf-name>27th International Joint Conference on Artificial Intelligence (IJCAI)</conf-name><conf-date>Jul 13-19, 2018</conf-date><conf-loc>Stockholm, Sweden</conf-loc><pub-id pub-id-type="doi">10.5555/3304222.3304388</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>J</given-names> </name><name name-style="western"><surname>LeCun</surname><given-names>Y</given-names> </name></person-group><article-title>Character-level convolutional networks for text classification</article-title><conf-name>28th International Conference on Neural Information Processing Systems (NeurlPS)</conf-name><conf-date>Dec 8-13, 2014</conf-date><conf-loc>Montreal, QC, Canada</conf-loc><pub-id 
pub-id-type="doi">10.5555/2969239.2969312</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>R</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>He</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bing</surname><given-names>L</given-names> </name><name name-style="western"><surname>Cambria</surname><given-names>E</given-names> </name><name name-style="western"><surname>Si</surname><given-names>L</given-names> </name><etal/></person-group><article-title>MELM: data augmentation with masked entity language modeling for low-resource NER</article-title><conf-name>The 60th Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>May 22-27, 2022</conf-date><conf-loc>Dublin, Ireland</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2022.acl-long.160</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><source>PubMed</source><access-date>2023-10-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://pubmed.ncbi.nlm.nih.gov">https://pubmed.ncbi.nlm.nih.gov</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><source>DeepL</source><access-date>2023-10-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.deepl.com/translator">https://www.deepl.com/translator</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yudong</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yuqing</surname><given-names>Z</given-names> </name><name 
name-style="western"><surname>Zhe</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Linlin</surname><given-names>S</given-names> </name><name name-style="western"><surname>Weijie</surname><given-names>L</given-names> </name><name name-style="western"><surname>Weiquan</surname><given-names>M</given-names> </name><etal/></person-group><article-title>CSL: a large-scale chinese scientific literature dataset</article-title><access-date>2025-03-03</access-date><conf-name>29th International Conference on Computational Linguistics</conf-name><conf-date>Oct 12-17, 2022</conf-date><conf-loc>Gyeongju, Republic of Korea</conf-loc></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><article-title>CLUEbenchmark/SimCLUE</article-title><source>GitHub</source><access-date>2024-01-08</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/CLUEbenchmark/SimCLUE">https://github.com/CLUEbenchmark/SimCLUE</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Aghaebrahimian</surname><given-names>A</given-names> </name></person-group><article-title>Quora question answer dataset</article-title><conf-name>20th International Conference on Text, Speech, and Dialogue</conf-name><conf-date>Aug 27-31, 2017</conf-date><conf-loc>Prague, Czech Republic</conf-loc><pub-id pub-id-type="doi">10.1007/978-3-319-64206-2_8</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Pengfei</surname><given-names>L</given-names> </name><name name-style="western"><surname>Xipeng</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Xuanjing</surname><given-names>H</given-names> </name></person-group><article-title>Recurrent neural network 
for text classification with multi-task learning</article-title><conf-name>25th International Joint Conference on Artificial Intelligence (IJCAI)</conf-name><conf-date>Jul 9-15, 2016</conf-date><conf-loc>New York City, NY, United States</conf-loc><pub-id pub-id-type="doi">10.5555/3060832.3061023</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>P</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>W</given-names> </name><name name-style="western"><surname>Tian</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Attention-based bidirectional long short-term memory networks for relation classification</article-title><conf-name>The 54th Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>Aug 7-12, 2016</conf-date><conf-loc>Berlin, Germany</conf-loc><pub-id pub-id-type="doi">10.18653/v1/P16-2034</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Cao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name></person-group><article-title>Convolutional recurrent neural networks for text classification</article-title><conf-name>2019 International Joint Conference on Neural Networks (IJCNN)</conf-name><conf-date>Jul 14-19, 2019</conf-date><conf-loc>Budapest, Hungary</conf-loc><pub-id pub-id-type="doi">10.1109/IJCNN.2019.8852406</pub-id></nlm-citation></ref><ref 
id="ref35"><label>35</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>R</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>T</given-names> </name></person-group><article-title>Deep pyramid convolutional neural networks for text categorization</article-title><conf-name>55th Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>Jul 30 to Aug 4, 2017</conf-date><conf-loc>Vancouver, Canada</conf-loc><pub-id pub-id-type="doi">10.18653/v1/P17-1052</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Joulin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Grave</surname><given-names>E</given-names> </name><name name-style="western"><surname>Bojanowski</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mikolov</surname><given-names>T</given-names> </name></person-group><article-title>Bag of tricks for efficient text classification</article-title><conf-name>15th Conference of the European Chapter of the Association for Computational Linguistics</conf-name><conf-date>Apr 3-7, 2017</conf-date><conf-loc>Valencia, Spain</conf-loc><pub-id pub-id-type="doi">10.18653/v1/E17-2068</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kaur</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kaur</surname><given-names>P</given-names> </name></person-group><article-title>BERT-CNN: improving BERT for requirements classification using CNN</article-title><source>Procedia Comput Sci</source><year>2023</year><volume>218</volume><fpage>2604</fpage><lpage>2611</lpage><pub-id 
pub-id-type="doi">10.1016/j.procs.2023.01.234</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Kaur</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kaur</surname><given-names>P</given-names> </name></person-group><article-title>BERT-RCNN: an automatic classification of app reviews using transfer learning based RCNN deep model</article-title><source>Research Square</source><comment>Preprint posted online on  Jan 24, 2023</comment><pub-id pub-id-type="doi">10.21203/rs.3.rs-2503700/v1</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Yanjun</surname><given-names>L</given-names> </name><name name-style="western"><surname>Haijun</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Weimin</surname><given-names>P</given-names> </name><name name-style="western"><surname>Rujia</surname><given-names>F</given-names> </name></person-group><article-title>Microblog rumor detection based on bert-DPCNN</article-title><source>Artificial Intelligence in China</source><year>2021</year><fpage>524</fpage><lpage>530</lpage><pub-id pub-id-type="doi">10.1007/978-981-15-8599-9_60</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Han</surname><given-names>X</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Liu</surname><given-names>Q</given-names> </name></person-group><article-title>ERNIE: enhanced language representation with informative entities</article-title><conf-name>57th Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>Jul 28 to Aug 2, 2019</conf-date><conf-loc>Florence, Italy</conf-loc><pub-id pub-id-type="doi">10.18653/v1/P19-1139</pub-id></nlm-citation></ref></ref-list></back></article>