<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e78622</article-id><article-id pub-id-type="doi">10.2196/78622</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Evaluation of the Accuracy of Probabilistic Record Linkage Across Sociodemographic Categories in 4 Databases: Exploratory Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Barboi</surname><given-names>Cristina</given-names></name><degrees>MS, MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Ouyang</surname><given-names>Fangqian</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lembcke</surname><given-names>Lauren</given-names></name><degrees>MS</degrees><xref 
ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Martin</surname><given-names>Andrew</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Griffith</surname><given-names>Ashley</given-names></name><degrees>MHA</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Allen</surname><given-names>Katie S</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Li</surname><given-names>Xiaochun</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Xu</surname><given-names>Huiping</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Grannis</surname><given-names>Shaun J</given-names></name><degrees>MS, MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff6">6</xref></contrib></contrib-group><aff id="aff1"><institution>Center for Biomedical Informatics, Regenstrief Institute</institution><addr-line>101 W 10th Street</addr-line><addr-line>Indianapolis</addr-line><addr-line>IN</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Anesthesiology, Indiana University School of Medicine</institution><addr-line>Indianapolis</addr-line><addr-line>IN</addr-line><country>United States</country></aff><aff id="aff3"><institution>Department of Biostatistics and Health Data Science, School of Medicine, Indiana University</institution><addr-line>Indianapolis</addr-line><addr-line>IN</addr-line><country>United States</country></aff><aff id="aff4"><institution>Regenstrief Data Services, 
Regenstrief Institute</institution><addr-line>Indianapolis</addr-line><addr-line>IN</addr-line><country>United States</country></aff><aff id="aff5"><institution>Department of Health Policy and Management, Indiana University</institution><addr-line>Indianapolis</addr-line><addr-line>IN</addr-line><country>United States</country></aff><aff id="aff6"><institution>Department of Family Medicine, Indiana University School of Medicine</institution><addr-line>Indianapolis</addr-line><addr-line>IN</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Schwartz</surname><given-names>Amy</given-names></name></contrib><contrib contrib-type="editor"><name name-style="western"><surname>Balcarras</surname><given-names>Matthew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Adegoke</surname><given-names>Kola</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Cristina Barboi, MS, MD, Center for Biomedical Informatics, Regenstrief Institute, 101 W 10th Street, Indianapolis, IN, 46202, United States, 1 262 8538872, 1 317-274-0275; <email>cbarboi@iu.edu</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>26</day><month>2</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e78622</elocation-id><history><date date-type="received"><day>05</day><month>06</month><year>2025</year></date><date date-type="rev-recd"><day>12</day><month>01</month><year>2026</year></date><date date-type="accepted"><day>12</day><month>01</month><year>2026</year></date></history><copyright-statement>&#x00A9; Cristina Barboi, Fangqian Ouyang, Lauren Lembcke, Andrew Martin, Ashley Griffith, Katie S Allen, Xiaochun Li, Huiping Xu, Shaun J 
Grannis. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 26.2.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e78622"/><abstract><sec><title>Background</title><p>Accurate patient record linkage is essential for clinical care, health information exchange, research, and public health surveillance. 
However, linkage accuracy may vary across demographic groups due to differences in data completeness, quality, and the structural factors underlying how demographic information is captured.</p></sec><sec><title>Objective</title><p>This study aimed to explore whether probabilistic patient matching accuracy varies by age, sex, race, and ethnicity and to identify potential sources of bias that may influence matching performance.</p></sec><sec sec-type="methods"><title>Methods</title><p>We used 4 Indiana data sources&#x2014;the Indiana Network for Patient Care, Newborn Screening, Social Security Administration Death Master File, and Marion County Public Health Department&#x2014;and applied a modified Fellegi-Sunter probabilistic linkage algorithm accommodating missing data under a missing at random assumption. Gold standard match status was established through dual manual review with adjudication. For each dataset, matching sensitivity, positive predictive value, and <italic>F</italic><sub>1</sub>-scores were estimated and stratified by age, sex, race, and ethnicity. Data completeness, distinct value ratio, and Shannon entropy were assessed to characterize data quality. Ninety-five percent bootstrap CIs were used to assess significance.</p></sec><sec sec-type="results"><title>Results</title><p>The algorithm-matching <italic>F</italic><sub>1</sub>-score was greater than 0.82 for all age strata, ranging from 0.88 to 0.97 for sex, 0.85 to 0.99 for race, and 0.88 to 0.99 for ethnicity. Sensitivity ranged from 0.70 to 0.97 across age strata, 0.76 to 0.97 across sex, 0.85 to 0.99 across race, and 0.85 to 0.989 across ethnicity. Lower sensitivity and <italic>F</italic><sub>1</sub>-scores were consistently observed in strata with greater missingness or discordance, particularly in Newborn Screening and Social Security Administration Death Master File. 
Race and ethnicity exhibited the highest missingness and lowest informational diversity, coinciding with the largest declines in accuracy. Shannon entropy and distinct value ratio varied across demographic groups and were strongly associated with performance, indicating that both low and excessively high informational diversity can impair matching.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Probabilistic patient matching accuracy is not uniform across demographics and is strongly influenced by data quality and completeness. Although overall matching performance, as assessed by the <italic>F</italic><sub>1</sub>-score, remained above 0.8, it varied across datasets when stratified by sociodemographic characteristics. Sociodemographic data missingness is associated with lower matching accuracy, raising equity and ethical concerns for clinical, research, and public health applications. Routine demographic-stratified evaluations of matching accuracy, improved standardization of sociodemographic data, and fairness-aware linkage methods are essential to prevent the amplification of structural inequities in linked health datasets.</p></sec></abstract><kwd-group><kwd>record linkage</kwd><kwd>data quality</kwd><kwd>sociodemographic characteristics</kwd><kwd>matching algorithms</kwd><kwd>matching accuracy</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Clinical Significance</title><p>Patients receive care in many settings along their health care journey; however, their information is often stored in separate, organization-specific electronic health records. Each encounter&#x2014;whether in a hospital, outpatient setting, laboratory, or public health department&#x2014;generates new patient data that may not be accurately linked across health systems. 
Errors in patient identification can lead to duplicate or incomplete health records, jeopardizing patient safety and wasting health care time and resources.</p><p>Duplicate records occur in approximately 5% to 10% of hospital electronic health records, with up to 92% resulting from human error during registration [<xref ref-type="bibr" rid="ref1">1</xref>]. These errors increase health care costs&#x2014;averaging US $1950 per inpatient stay and more than US $800 per emergency department visit [<xref ref-type="bibr" rid="ref2">2</xref>]. Overwriting one patient&#x2019;s record with another&#x2019;s data introduces additional safety risk and can compromise clinical care. Identification inaccuracies also contribute to financial inefficiencies&#x2014;approximately 33% of all denied insurance claims are linked to patient identification errors, costing the US health care system over US $6 billion annually [<xref ref-type="bibr" rid="ref3">3</xref>]. Collectively, record linkage errors affect patients, providers, and payers and have both clinical and economic consequences.</p></sec><sec id="s1-2"><title>Background</title><p>Accurate linkage of individual patient records is essential for integrating information across encounters and creating reliable, longitudinal data resources. Record linkage supports basic intrasystem tasks&#x2014;such as matching laboratory results to the correct patient&#x2014;as well as more complex information exchange across health care organizations. Several methodologies exist for matching health care records, including (1) deterministic rule&#x2013;based approaches, (2) probabilistic methods that assign match weights or scores, (3) machine learning&#x2013;based models, and (4) hybrid approaches combining multiple techniques [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. 
Thus far, no single approach has emerged as universally superior [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Matching algorithms rely on identifiers such as name, date of birth, social security number, and address, and their performance depends on how complete, accurate, and consistently formatted these fields are [<xref ref-type="bibr" rid="ref6">6</xref>]. Consequently, effective patient linkage depends on both data quality [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>] and the robustness of the matching method [<xref ref-type="bibr" rid="ref9">9</xref>]. In this context, data quality reflects completeness (a field is populated rather than missing) and correctness (the concordance of field values across datasets). Data quality is well established as foundational to reliable analytics and decision-making [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. Structural and organizational factors can introduce systematic biases, and linkage errors arise when information is missing, incorrect, or inconsistent, leading to (1) missed matches (records belonging to the same individual are not linked) or (2) false matches (records from different individuals are erroneously linked) [<xref ref-type="bibr" rid="ref13">13</xref>]. Linkage errors do not occur at equal rates across populations. Patients from ethnic minorities, particularly those with naming conventions not aligned with Western standards, experience higher rates of missed or incorrect matches [<xref ref-type="bibr" rid="ref14">14</xref>].</p><p>Additionally, missing demographic data are more prevalent among specific populations due to social, linguistic, or contextual factors [<xref ref-type="bibr" rid="ref15">15</xref>]. 
Sociodemographic characteristics, such as age, ethnicity, race, social vulnerabilities, geographical setting, and health status, have been associated with lower data completeness and higher error rates [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. However, the degree to which matching performance itself varies across these factors has not been systematically examined. Given the documented disparities, identifying and understanding sociodemographic patterns associated with linkage errors is essential for recognizing and mitigating bias in patient record linkage.</p></sec><sec id="s1-3"><title>Study Objective</title><p>Unlike deterministic record-matching approaches that require exact agreement across fields, probabilistic methods assign probabilities based on weighted combinations of multiple attributes. This design makes them more sensitive to variations in data quality and demographic characteristics and therefore relevant for studying potential algorithmic bias. The Fellegi-Sunter (FS) framework is one of the most widely used and well-characterized probabilistic methods in operational US health care environments. In this study, we selected the FS framework to evaluate a core linkage approach representative of real-world practice. Although many operational systems incorporate additional deterministic or hybrid rules, such configurations are highly customized and not easily generalizable. Focusing on the probabilistic foundation most commonly deployed allows us to assess how demographic factors may influence match accuracy. 
Accordingly, the objective of this study was to explore whether probabilistic patient matching accuracy varies by age, sex, race, or ethnicity and to identify potential data-related sources of bias that may influence matching performance.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Patient Demographic Data</title><p>This study used 4 Indiana-based patient demographic data sources. First, the Indiana Network for Patient Care (INPC), a statewide health information exchange containing over 47 million registration records across more than 100 clinical data sources, including emergency department visits, hospital admissions, and large outpatient health care clinics from across the state [<xref ref-type="bibr" rid="ref17">17</xref>]. Second, Newborn screening (NBS) data, consisting of demographic data derived from health information exchange-wide Health Level 7 messaging for children aged under 12 months. This dataset includes limited newborn screening information collected within 5 days of birth, mandated by Indiana law. Third, Social Security Administration Death Master File (SSA), containing records of individuals issued a Social Security number whose deaths were reported to the SSA. Fourth, Marion County Public Health Department (MCPHD), the county&#x2019;s infectious disease reporting database. Marion County is Indiana&#x2019;s largest county, with a population of just over 966,000.</p></sec><sec id="s2-2"><title>Generation of Patient Record Pairs</title><p>This study leveraged analytic datasets from previous studies on patient matching and record linkage [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. The FS model was chosen for this project due to its widespread use and flexible maximum-likelihood framework. 
To accommodate missing data in real-world settings, we applied an established modification of the FS model under the missing at random (MAR) assumption [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. Under the MAR assumption, missing fields are handled using the full-information likelihood approach, which incorporates all available data while excluding only the specific missing fields, assuming conditional independence across variables. Using the modified FS probabilistic algorithm [<xref ref-type="bibr" rid="ref21">21</xref>], we identified matches and nonmatches across 4 dataset pairs: (1) INPC to INPC (labeled as INPC), (2) NBS to NBS (labeled as NBS), (3) INPC to SSA (labeled as SSA), and (4) INPC to MCPHD (labeled as MCPHD). Matching variables included medical record number, first and last name, middle initial, sex, telephone number, address, ZIP code, social security number, and components of the date of birth.</p></sec><sec id="s2-3"><title>Manual Review of Record Pairs</title><p>To construct a gold standard reference, we randomly sampled record pairs from each dataset for manual review. For each sampled pair, complete reference data from both records were provided to the reviewers to support accurate adjudication [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. Using a balanced, incomplete block design, 2 independent reviewers assessed each pair&#x2019;s match status [<xref ref-type="bibr" rid="ref20">20</xref>], while a third reviewer resolved any discrepancies. A total of 62,000 pairs were manually reviewed: 15,000 INPC, 15,000 NBS, 16,500 SSA, and 15,500 MCPHD. Because the 2 reviewers assessing a given pair were not the same individuals across all record pairs, Cohen &#x03BA; was not applicable; therefore, the overall agreement rate was calculated instead. 
The detailed methodology and outcomes of this manual review process were described previously [<xref ref-type="bibr" rid="ref22">22</xref>]. For the current analysis, all gold standard referential pairs were stratified by race, ethnicity, sex, and age.</p></sec><sec id="s2-4"><title>Analysis</title><p>We conducted an exploratory analysis of sociodemographic variables&#x2014;age, sex, race, and ethnicity&#x2014;evaluating each variable independently for each record pair. Record pairs were assigned to a given stratum (eg, <italic>Asian</italic>) only when both records contained nonmissing, concordant values for that variable. <italic>Missing</italic> data (one or both fields absent) were distinguished from <italic>discordant</italic> data (both fields present but disagreeing). Pairs with discordant values for a specific variable were excluded from the performance calculations for that variable&#x2019;s stratum. However, they could contribute to analyses of other sociodemographic strata where their values were concordant. As a single record could appear in multiple pairings, individuals may be represented more than once. Missing data were addressed using the MAR-modified FS model, which uses all available information without defaulting to disagreement for missing values. Previous research shows that this modification improves linkage accuracy and <italic>F</italic><sub>1</sub>-scores compared with traditional missing data handling [<xref ref-type="bibr" rid="ref18">18</xref>]. The linkage algorithm was applied separately within each sociodemographic stratum, supporting the MAR assumption within these more homogeneous groups. While MAR is reasonable in many operational contexts, data can be missing not at random (MNAR). 
Evaluating MNAR scenarios would require alternative missingness models and sensitivity analyses, which we identified as an area of future work.</p></sec><sec id="s2-5"><title>Accuracy of Probabilistic Record Linkage</title><p>For each stratum, we compared the algorithm&#x2019;s declared match status with the gold standard referential reviewer classification. We estimated sensitivity, positive predictive value (PPV), and <italic>F</italic><sub>1</sub>-score, along with 95% CIs derived from 1000 bootstrap samples, using the percentile method. CIs were reported to indicate the precision of estimated performance measures rather than to support formal hypothesis testing or draw inferential conclusions. Given the exploratory nature of this analysis, no corrections for multiple comparisons were applied.</p></sec><sec id="s2-6"><title>Data Quality Assessment</title><p>To evaluate data quality across sociodemographic strata, we calculated (1) missing data ratio (MDR), which measures <italic>data completeness</italic> as the percentage of records in which a field was missing; (2) distinct value ratio (DVR), which measures <italic>data uniqueness</italic> as the number of unique values for a field divided by the number of nonmissing records; and (3) Shannon entropy (SE), a measure of <italic>information content</italic> that increases when values are numerous and evenly distributed [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref23">23</xref>].</p></sec><sec id="s2-7"><title>Ethical Considerations</title><p>This research was approved by the Indiana University Institutional Review Board (IRB 170375536) under category 5 (research involving materials collected for nonresearch purposes), which granted a waiver for informed consent. The study used retrospective data collected during routine care, with no contact with individuals and minimal risk to privacy or welfare. 
Although datasets contained identifiable information, data were stored on Health Insurance Portability and Accountability Act&#x2013;compliant remote servers and accessed only by authorized personnel via encrypted connections. All project-related data were managed according to the university security protocols and institutional storage infrastructure. As confidentiality protections were deemed adequate, no ongoing institutional review board monitoring beyond the approval determination was required. All accessed data were governed through data sharing agreements with each contributing entity.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Reviewers&#x2019; Agreement on Match Status</title><p>During the creation of the referential gold standard&#x2013;linked datasets, reviewers agreed on the match status for 96.1% of record pairs; the remaining 3.9% required adjudication by a third reviewer due to disagreement.</p></sec><sec id="s3-2"><title>Data Quality Assessment</title><p><xref ref-type="table" rid="table1">Table 1</xref> summarizes the sociodemographic characteristics of the record pairs included in the referential gold standard datasets. For each variable, the table reports the number of record pairs (N) with nonmissing values in the INPC, NBS, SSA, and MCPHD datasets; variables not collected in a dataset are marked &#x201C;N/A.&#x201D; Notably, race and ethnicity are absent in the SSA and MCPHD datasets; therefore, they were excluded from the analysis. 
In the NBS dataset, age was not assessed because all newborns were within 12 months of age.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Sociodemographic characteristics of the record pairs included in the referential gold standard datasets.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Sociodemographic variables</td><td align="left" valign="bottom">INPC<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> record pairs (n=15,000)</td><td align="left" valign="bottom">NBS<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> record pairs (n=15,000)</td><td align="left" valign="bottom">SSA<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> record pairs (n=16,500)</td><td align="left" valign="bottom">MCPHD<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup> record pairs (n=15,500)</td></tr></thead><tbody><tr><td align="left" valign="top">Age (years)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;&#x003C;18</td><td align="left" valign="top">1746</td><td align="left" valign="top">N/A<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup></td><td align="left" valign="top">22</td><td align="left" valign="top">4002</td></tr><tr><td align="left" valign="top">&#x2003;18&#x2010;65</td><td align="left" valign="top">10,178</td><td align="left" valign="top">N/A</td><td align="left" valign="top">2894</td><td align="left" valign="top">10,683</td></tr><tr><td align="left" valign="top">&#x2003;&#x003E;65</td><td align="left" valign="top">3035</td><td align="left" valign="top">N/A</td><td align="left" valign="top">9610</td><td align="left" valign="top">735</td></tr><tr><td align="left" valign="top">Sex</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" 
valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Male</td><td align="left" valign="top">4867</td><td align="left" valign="top">6280</td><td align="left" valign="top">N/A</td><td align="left" valign="top">4759</td></tr><tr><td align="left" valign="top">&#x2003;Female</td><td align="left" valign="top">7368</td><td align="left" valign="top">5503</td><td align="left" valign="top">N/A</td><td align="left" valign="top">6256</td></tr><tr><td align="left" valign="top">Race</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;White</td><td align="left" valign="top">8551</td><td align="left" valign="top">8712</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top">&#x2003;Black</td><td align="left" valign="top">1323</td><td align="left" valign="top">2397</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top">&#x2003;Asian</td><td align="left" valign="top">413</td><td align="left" valign="top">492</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top">&#x2003;Pacific Islander</td><td align="left" valign="top">332</td><td align="left" valign="top">334</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top">&#x2003;Native American</td><td align="left" valign="top">30</td><td align="left" valign="top">47</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top">Ethnicity</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Hispanic</td><td align="left" valign="top">275</td><td align="left" valign="top">1214</td><td 
align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td></tr><tr><td align="left" valign="top">&#x2003;Not Hispanic</td><td align="left" valign="top">7483</td><td align="left" valign="top">9309</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>INPC: Indiana Network for Patient Care.</p></fn><fn id="table1fn2"><p><sup>b</sup>NBS: newborn screening.</p></fn><fn id="table1fn3"><p><sup>c</sup>SSA: Social Security Administration.</p></fn><fn id="table1fn4"><p><sup>d</sup>MCPHD: Marion County Public Health Department.</p></fn><fn id="table1fn5"><p><sup>e</sup>N/A: not available.</p></fn></table-wrap-foot></table-wrap><p><xref ref-type="table" rid="table2">Table 2</xref> presents the degree of missingness and disagreement across demographic variables in the referential gold standard datasets. In the INPC dataset, ethnicity shows the highest level of missingness, followed by race. In the SSA dataset, missing values in the ethnicity, race, and sex fields are noteworthy. 
Within the NBS dataset, more record pairs have missing ethnicity and race values than discordant values, whereas the sex field shows more discordance than missingness.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Missingness and disagreement in demographic characteristics in the referential gold standard&#x2013;linked datasets.<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Sociodemographic variable and record missingness or disagreement</td><td align="left" valign="bottom">INPC<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> dataset (n=15,000 pairs; 30,000 records)</td><td align="left" valign="bottom">NBS<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> dataset (n=15,000 pairs; 30,000 records)</td><td align="left" valign="bottom">SSA<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup> dataset (n=16,500 pairs; 33,000 records)</td><td align="left" valign="bottom">MCPHD<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup> dataset (n=15,500 pairs; 31,000 records)</td></tr></thead><tbody><tr><td align="left" valign="top">Ethnicity</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Missing</td><td align="left" valign="top">14,266</td><td align="left" valign="top">8102</td><td align="left" valign="top">33,000</td><td align="left" valign="top">31,000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Discordant</td><td align="left" valign="top">422</td><td align="left" valign="top">1398</td><td align="left" valign="top">N/A<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td><td align="left" 
valign="top">N/A</td></tr><tr><td align="left" valign="top">Sex</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Missing</td><td align="left" valign="top">196</td><td align="left" valign="top">320</td><td align="left" valign="top">33,000</td><td align="left" valign="top">540</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Discordant</td><td align="left" valign="top">5334</td><td align="left" valign="top">6112</td><td align="left" valign="top">N/A</td><td align="left" valign="top">8430</td></tr><tr><td align="left" valign="top">Age</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Missing</td><td align="left" valign="top">16</td><td align="left" valign="top">N/A</td><td align="left" valign="top">52</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Discordant</td><td align="left" valign="top">66</td><td align="left" valign="top">N/A</td><td align="left" valign="top">7896</td><td align="left" valign="top">248</td></tr><tr><td align="left" valign="top">Race</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Missing</td><td align="left" valign="top">8504</td><td align="left" valign="top">5062</td><td align="left" valign="top">33,000</td><td align="left" 
valign="top">31,000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Discordant</td><td align="left" valign="top">2060</td><td align="left" valign="top">2674</td><td align="left" valign="top">N/A</td><td align="left" valign="top">N/A</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>The data reflect the raw number of individual records within a pair with missing or discordant demographic values.</p></fn><fn id="table2fn2"><p><sup>b</sup>INPC: Indiana Network for Patient Care. </p></fn><fn id="table2fn3"><p><sup>c</sup>NBS: newborn screening. </p></fn><fn id="table2fn4"><p><sup>d</sup>SSA: Social Security Administration.</p></fn><fn id="table2fn5"><p><sup>e</sup>MCPHD: Marion County Public Health Department.</p></fn><fn id="table2fn6"><p><sup>f</sup>N/A: not available.</p></fn></table-wrap-foot></table-wrap><p>The relative proportion of missing sex, race, and ethnicity values varies considerably across datasets, as presented in <xref ref-type="table" rid="table3">Table 3</xref>. Across datasets, the MDR ranges from 0.20 to 0.65 for race, 0.40 to 0.84 for ethnicity, and 0.003 to 0.5 for sex. As sex, race, and ethnicity each contain only a few valid categories, the DVR provides limited additional insight into data quality. SE, which generally measures the information diversity in the data, can be falsely elevated by artifacts, errors, or inconsistent encoding. 
SE values for sex were similar across datasets, ranging from 1.01 to 1.49, whereas the broader range observed for race and ethnicity probably reflected data quality issues rather than true variability (<xref ref-type="table" rid="table3">Table 3</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Measures of data quality: missing data ratio (MDR), distinct values ratio (DVR), and Shannon entropy (SE) measured in bits.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset and sociodemographic variable</td><td align="left" valign="bottom">MDR</td><td align="left" valign="bottom">DVR</td><td align="left" valign="bottom">SE (bits)</td></tr></thead><tbody><tr><td align="left" valign="top">INPC<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sex</td><td align="char" char="." valign="top">0.0037</td><td align="char" char="." valign="top">0.0001</td><td align="char" char="." valign="top">1.0117</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Race</td><td align="char" char="." valign="top">0.6171</td><td align="char" char="." valign="top">0.0002</td><td align="char" char="." valign="top">1.2863</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ethnicity</td><td align="char" char="." valign="top">0.8416</td><td align="char" char="." valign="top">0.0001</td><td align="char" char="." 
valign="top">0.6863</td></tr><tr><td align="left" valign="top">SSA<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sex</td><td align="char" char="." valign="top">0.5012</td><td align="char" char="." valign="top">0.00009</td><td align="char" char="." valign="top">1.4946</td></tr><tr><td align="left" valign="top">MCHD<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sex</td><td align="char" char="." valign="top">0.0089</td><td align="char" char="." valign="top">0.0001</td><td align="char" char="." valign="top">1.0623</td></tr><tr><td align="left" valign="top">NBS<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sex</td><td align="char" char="." valign="top">0.0058</td><td align="char" char="." valign="top">0.0001</td><td align="char" char="." valign="top">1.0455</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Race</td><td align="char" char="." valign="top">0.2067</td><td align="char" char="." valign="top">0.0002</td><td align="char" char="." valign="top">1.6530</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ethnicity</td><td align="char" char="." valign="top">0.4002</td><td align="char" char="." 
valign="top">0.0001</td><td align="char" char="." valign="top">1.3313</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>INPC: Indiana Network for Patient Care.</p></fn><fn id="table3fn2"><p><sup>b</sup>SSA: Social Security Administration.</p></fn><fn id="table3fn3"><p><sup>c</sup>MCHD: Marion County Health Department.</p></fn><fn id="table3fn4"><p><sup>d</sup>NBS: newborn screening.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Algorithm Matching Accuracy</title><p><xref ref-type="fig" rid="figure1">Figure 1</xref> and Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> summarize the matching accuracy and corresponding CIs across all sociodemographic strata. Across age groups, matching sensitivity ranged from 0.7 to 0.97, the PPV exceeded 0.88, and the <italic>F</italic><sub>1</sub>-scores were greater than 0.82. When stratified by sex, sensitivity ranged from 0.76 to 0.98, PPV ranged from 0.80 to 0.98, and <italic>F</italic><sub>1</sub>-score ranged from 0.84 to 0.97. Across race groups, the algorithm&#x2019;s matching sensitivity ranged from 0.58 to 0.99, PPV ranged from 0.85 to 0.98, and <italic>F</italic><sub>1</sub>-score ranged from 0.85 to 0.99. For ethnicity, the matching sensitivity ranged from 0.85 to 0.989, the PPV ranged between 0.88 and 0.99, and the <italic>F</italic><sub>1</sub>-score ranged from 0.88 to 0.99. Compared with INPC and MCPHD, the SSA dataset showed significantly lower sensitivity and <italic>F</italic><sub>1</sub>-scores across all age strata (&#x003C;18 y, 18-65 y, and &#x003E;65 y), as indicated by nonoverlapping 95% CIs. Similarly, for both males and females, matching sensitivity and <italic>F</italic><sub>1</sub>-scores in SSA and NBS were significantly lower than those observed in INPC and MCPHD. 
In the NBS dataset, the matching sensitivity, <italic>F</italic><sub>1</sub>-score, and PPV for Asian, Black, and White racial groups were lower than those in the INPC dataset, with statistically significant differences indicated by nonoverlapping 95% CIs. The same pattern was observed across Hispanic and non-Hispanic ethnicities, with all 3 performance metrics in NBS markedly lower than in INPC.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>(A) Sensitivity, (B) positive predictive value (PPV), and (C) <italic>F</italic><sub>1</sub>-score for probabilistic patient matching performance across demographic groups. Points indicate the estimated metric, and horizontal bars represent 95% CIs. Results are shown for 4 data sources. INPC: Indiana Network for Patient Care; MCHD: Marion County Public Health Department; NBS: newborn screening; SSA: Social Security Administration.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e78622_fig01.png"/></fig><p>Across all datasets, the largest variability in matching performance was observed among pairs with missing or discordant field values. Data quality&#x2013;performance plots (<italic>F</italic><sub>1</sub>-score vs DVR; <italic>F</italic><sub>1</sub>-score vs SE) show that the relationship between data quality and linkage accuracy varies across datasets, particularly for race and ethnicity in NBS and INPC (<xref ref-type="fig" rid="figure2">Figures 2</xref> and <xref ref-type="fig" rid="figure3">3</xref>). For age, performance differences among datasets appear to stem more from differences in date of birth documentation formats than from true demographic variability. In INPC, the algorithm maintained consistently high performance despite variation in data quality. 
In contrast, the NBS dataset showed relatively similar performance across race, ethnicity, and sex groups, despite substantial differences in data quality.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>DVR quality versus <italic>F</italic><sub>1</sub>-score plots. Mean refers to the DVR quality indicator (unitless ratio between 0 and 1), while MAR refers to the <italic>F</italic><sub>1</sub>-score using the missing at random approach (dimensionless value between 0 and 1). DVR quality indicator<inline-formula><mml:math id="ieqn1"><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>q</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mfenced separators="|"><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>m</mml:mi><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:mfenced><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>q</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>m</mml:mi><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo><mml:mo>+</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>q</mml:mi></mml:mrow><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>m</mml:mi><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:mfrac></mml:math></inline-formula>; in which DVR=q<sub>i</sub>; missing percentage=ms<sub>i</sub>. 
DVR: distinct value ratio; INPC: Indiana Network for Patient Care; MAR: missing at random; MCHD: Marion County Public Health Department; NBS: newborn screening; SSA: Social Security Administration.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e78622_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Shannon entropy quality versus <italic>F</italic><sub>1</sub>-score plots. Mean refers to the Shannon entropy quality indicator (measured in bits), while MAR refers to the <italic>F</italic><sub>1</sub>-score using the missing at random approach (dimensionless value between 0 and 1). Entropy quality indicator<italic>=</italic><inline-formula><mml:math id="ieqn2"><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mfenced separators="|"><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>m</mml:mi><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:mfenced><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>m</mml:mi><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo><mml:mo>+</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>m</mml:mi><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:mfrac></mml:math></inline-formula><italic>;</italic> in which Shannon entropy=e<sub>i</sub>; missing percentage=ms<sub>i</sub>. 
INPC: Indiana Network for Patient Care; MAR: missing at random; MCHD: Marion County Public Health Department; NBS: newborn screening; SSA: Social Security Administration.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e78622_fig03.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Summary of Key Findings</title><p>This study examined whether probabilistic patient matching accuracy varies by age, sex, race, and ethnicity and sought to identify data-related sources of bias that may influence matching performance. Across 4 large, operationally relevant datasets, matching sensitivity, PPV, and <italic>F</italic><sub>1</sub>-score varied by demographic strata, with several statistically significant differences indicated by nonoverlapping CIs. These findings show that probabilistic linkage accuracy is not demographically uniform and is influenced not only by algorithmic design but also by underlying data quality and completeness&#x2014;both of which can bias matching performance.</p></sec><sec id="s4-2"><title>Variation in Data Completeness (MDR) and Its Impact on Performance</title><p>Data completeness varied across datasets. Race and ethnicity exhibited the highest levels of missingness, consistent with prior work documenting gaps in demographic data capture [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. Age was generally complete except in the SSA dataset, which contained substantial discordance across linked records. 
These completeness patterns were directly reflected in performance: the most significant declines in sensitivity and <italic>F</italic><sub>1</sub>-score occurred in strata with missing or inconsistent fields, and the lowest overall performance was observed in the NBS and SSA datasets, where missingness was most pronounced.</p></sec><sec id="s4-3"><title>Variation in Data Quality (DVR and SE) and Its Impact on Performance</title><p>Data quality varied across demographic fields. While the DVR showed only modest variability, SE differed substantially, particularly for sex, race, and ethnicity, indicating increased variation in the information content of these fields. Fields with lower SE or DVR values, such as those with more missing data, standardized naming conventions, or limited variability, provide less discriminative information for probabilistic algorithms, increasing the risk of false matches or missed linkages. These patterns may disproportionately affect populations with less consistent data capture, such as individuals with non-English names, hyphenated surnames, or variable address formats. Fields with more distinct values or more evenly distributed unique values have higher DVR and SE. Such fields provide more discriminative information for the linkage algorithms, improving the algorithm accuracy. The theoretical maximum entropy reflects the upper limit of information a variable could contain if all its possible values were equally likely. Comparing a variable&#x2019;s observed entropy to this maximum indicates how fully the variable uses its potential informational range [<xref ref-type="bibr" rid="ref8">8</xref>]. The plots of data quality indicators&#x2014;SE and DVR&#x2014;versus <italic>F</italic><sub>1</sub>-score (<xref ref-type="fig" rid="figure2">Figures 2</xref> and <xref ref-type="fig" rid="figure3">3</xref>) demonstrate a correlation between the 2 variables and warrant further exploration in future studies. 
Previous studies have identified similar relationships between data quality and record linkage performance [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>Systematic assessment of data completeness and data quality using metrics such as the MDR, DVR, and SE extends and strengthens traditional probabilistic record linkage approaches by explicitly quantifying the informational value of matching fields and linking these properties to performance disparities. MDR captures the degree of missingness that directly reduces the discriminative power of identifiers, while DVR and SE characterize the uniqueness and information content of variables that are central to assigning match weights in probabilistic models. SE, in particular, has been used extensively to assess how effectively a variable differentiates individuals within a population, while distinct value&#x2013;based metrics have been shown to correlate with linkage performance by reflecting the degree of identifier uniqueness [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. Previous studies have demonstrated that linkage accuracy is highly sensitive to both missingness and variability in identifying fields, and entropy-based measures have been widely used in record linkage and data integration to evaluate information content, guide field selection, and optimize matching rules [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. Similarly, distinct value&#x2013;based metrics have been shown to correlate with linkage performance by reflecting the degree to which identifiers distinguish true matches from coincidental agreements [<xref ref-type="bibr" rid="ref6">6</xref>]. 
Compared with deterministic linkage methods, which rely on exact or rule-based agreement and have been shown to perform poorly in settings with heterogeneous data quality or incomplete identifiers [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>], probabilistic approaches derived from the FS framework remain the most widely used in health care and administrative data linkage due to their flexibility, interpretability, and ability to accommodate partial agreement and missing data [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. However, previous evaluations of probabilistic linkage have largely focused on aggregate accuracy metrics, often obscuring subgroup-specific errors and masking the role of data quality in generating disparate performance [<xref ref-type="bibr" rid="ref29">29</xref>].</p><p>By integrating these metrics into a probabilistic framework derived from the FS model, one of the most widely used and interpretable approaches for health care and administrative data linkage due to its ability to accommodate partial agreement and missing data [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref33">33</xref>], this study advances beyond deterministic and hybrid methods, which often perform poorly in settings with heterogeneous data quality or high missingness [<xref ref-type="bibr" rid="ref34">34</xref>-<xref ref-type="bibr" rid="ref36">36</xref>].</p><p>The framework deployed in this study improves upon these approaches by integrating a modified FS model under an MAR assumption with explicit, field-level data quality diagnostics and demographic-stratified performance assessment. 
This combination enables detection of prealgorithmic bias, directly links observed performance disparities to measurable data characteristics, and supports targeted mitigation strategies such as adaptive weighting, selective data cleaning, or alternative blocking schemes [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. Compared with hybrid or black-box machine learning&#x2013;based linkage methods, which may offer gains in accuracy but often lack transparency and equity assessment capabilities [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>], this framework balances operational feasibility, interpretability, and fairness, aligning with best practices in modern record linkage and algorithmic fairness research and supporting its applicability in large-scale health information exchange environments. This integration improves transparency, interpretability, and equity assessment relative to black box or rule-based approaches, while remaining compatible with large-scale operational health information exchange systems [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>].</p></sec><sec id="s4-4"><title>Potential Data-Related Sources of Bias</title><p>Multiple structural and administrative factors contribute to missing or inconsistent demographic fields, creating sources of prealgorithmic bias&#x2014;bias embedded in the data before any matching occurs. Examples include self-reported or manually entered race and ethnicity, inconsistent naming conventions, and institutional practices such as assigning newborn race based on the birthing parent or the SSA&#x2019;s merging of race and ethnicity into a single field [<xref ref-type="bibr" rid="ref41">41</xref>]. 
These practices disproportionately affect racial and ethnic minorities and produce several forms of data-related bias. Inconsistent capture of race and ethnicity introduces measurement bias, while failure to record key demographic fields results in omitted-variable bias [<xref ref-type="bibr" rid="ref42">42</xref>]. Biased or incomplete data can cause algorithms to reproduce and amplify inequities. When models are trained on historical datasets that reflect past discrimination, they learn these patterns and unfairly penalize underrepresented groups. Missing or sparse data for marginalized populations leads to poorer performance for those groups.</p><p>Underrepresentation bias can arise when certain groups are sparsely represented in datasets used to tune or validate linkage models [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref43">43</xref>-<xref ref-type="bibr" rid="ref45">45</xref>].</p><p>Reviewer bias may further influence the construction of referential standards, particularly when adjudicators encounter unfamiliar names or naming conventions [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]. Finally, population-specific characteristics, such as the newborn population in the NBS dataset, whose records are newly created and collected under mandated screening workflows, can introduce additional variability in data completeness and quality.</p></sec><sec id="s4-5"><title>Clinical, Ethical, and Research Implications of Disparities in Patient Matching Accuracy</title><p>Differences in linkage accuracy across demographic groups have implications for clinical, ethical, and research practice. Disparities in patient matching accuracy can have cascading effects on health care analytics, policy, and clinical practice. 
When linkage accuracy is lower for underrepresented populations, their health data may be fragmented or misclassified, leading to biased reporting of outcomes and inequitable policy or clinical decisions. Record linkage errors have direct consequences for patient safety and care quality. Missed matches fragment longitudinal histories, obscuring essential information such as allergies, prior diagnoses, or medications, while false matches may merge records from different individuals, leading to inappropriate treatment, misdiagnosis, or exposure of sensitive information. These errors disproportionately affect populations whose demographic data are inconsistently captured&#x2014;including racial and ethnic minorities, immigrants, individuals with unstable housing, and very young or older patients&#x2014;who often experience higher missingness and discordance in key identifiers [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref47">47</xref>]. As a result, linkage inaccuracies can impair clinical decision-making, compromise continuity of care, and exacerbate existing health care disparities.</p><p>The ethical implications of linkage errors extend beyond clinical harm to include threats to privacy, autonomy, and fairness. Erroneous merges can reveal sensitive information and increase risks of reidentification, identity theft, or unauthorized disclosure. At the same time, missed matches can lead individuals to lose control over how their information is used across systems. These concerns are amplified for groups already affected by structural inequities, who face greater risks of linkage failure due to inconsistent or incomplete demographic data [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. 
Such patterns mirror findings in the algorithmic fairness literature [<xref ref-type="bibr" rid="ref48">48</xref>], which emphasizes that data completeness, representativeness, and informational balance in preprocessing must be monitored to prevent the reinforcement of structural inequities [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>]. Biased model outputs can also create reinforcing feedback loops, where discriminatory decisions generate new biased data that further entrench inequity [<xref ref-type="bibr" rid="ref51">51</xref>]. Ultimately, these mechanisms produce systematic performance disparities&#x2014;algorithms perform least well for underrepresented or inaccurately recorded populations, even when no discriminatory intent exists [<xref ref-type="bibr" rid="ref51">51</xref>]. Persistent errors can erode trust in health care organizations and public institutions, discouraging individuals from sharing information or seeking care, particularly among historically marginalized communities.</p><p>Record linkage errors also compromise the validity of research and public health surveillance. False matches introduce noise and bias estimates toward the null, whereas missed matches reduce sample size, weaken statistical power, and may undercount exposures or outcomes. These errors distort estimates of disease prevalence, treatment effectiveness, and health disparities, particularly when linkage success varies systematically across demographic groups. Unequal linkage accuracy can produce unrepresentative analytic datasets, misinform policy decisions, and result in inequitable resource allocation. 
As linkage errors are not randomly distributed&#x2014;disproportionately affecting racial/ethnic minorities, immigrants, and individuals with unstable housing&#x2014;these disparities can amplify existing inequities in population-level research and public health practice [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref47">47</xref>]. Accurate and consistent capture of sociodemographic information is therefore essential for evaluating algorithmic equity and ensuring fair and reliable downstream analyses [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. Accurate capture of race and ethnicity is therefore essential for evaluating and ensuring equitable algorithmic performance.</p></sec><sec id="s4-6"><title>Implications for Future Work</title><p>This study&#x2019;s strengths include the use of 4 large operational datasets, manually adjudicated gold standard pairs, and the integration of data quality metrics such as DVR and SE to contextualize performance variation. The findings underscore the importance of assessing linkage accuracy within demographic subgroups rather than relying on aggregate performance, which can obscure disparities.</p><p>Future work should evaluate deterministic and hybrid linkage approaches to determine whether demographic disparities persist across algorithm types. Fairness-aware linkage methods and improved demographic data capture&#x2014;especially standardized race and ethnicity fields&#x2014;represent promising mitigation strategies. Simulation-based analyses of MNAR mechanisms may provide insight into bias introduced by unobserved missingness. Further research should quantify the relationship between measures of data quality and the performance of the matching algorithm. 
Finally, research should explore the downstream consequences of linkage disparities for surveillance metrics, risk prediction models, and health equity analyses.</p></sec><sec id="s4-7"><title>Limitations</title><p>This study has several limitations. First, missing demographic data for race and ethnicity prevented comparisons of linkage performance across all race categories. Although current reporting guidance includes categories such as Native American and Pacific Islander, the small number of records in these groups precluded subgroup analyses. Sex-stratified analyses were similarly constrained by limited availability in some datasets.</p><p>Second, all data were drawn from a single US state, which may limit generalizability. Nonetheless, the inclusion of heterogeneous sources&#x2014;particularly a statewide health information exchange aggregating data from more than 100 clinical organizations&#x2014;supports broader applicability across diverse health system environments. Although these datasets originate in Indiana, the observed disparities in probabilistic linkage accuracy likely reflect fundamental issues of data completeness, standardization, and attribute weighting that are common across health systems. Replicating this work in multistate or international settings will be important to confirm broader relevance and to guide equity-focused improvements in patient matching methods.</p><p>Third, missing data were addressed using a modified FS model under an MAR assumption. We did not evaluate MNAR mechanisms, which may introduce additional forms of linkage bias. 
Future work will examine MNAR processes to better characterize and mitigate these effects.</p><p>Finally, jurisdiction-specific workflows (eg, newborn screening processes in the NBS dataset) may influence data availability and structure, potentially limiting comparability with datasets from other health systems.</p></sec><sec id="s4-8"><title>Conclusions</title><p>This study found statistically significant differences in probabilistic patient matching accuracy across age, sex, race, and ethnicity. These disparities were closely aligned with variation in data completeness, uniqueness, and informational richness across datasets, demonstrating that matching performance is shaped as much by data quality as by algorithm design. Datasets with greater missingness, discordance, or low information diversity, particularly in race and ethnicity fields, consistently showed lower sensitivity and <italic>F</italic><sub>1</sub>-scores. This underscores the critical role of data quality in linkage performance. Such deficits in data quality and completeness introduce prealgorithmic bias, including underrepresentation bias, which can directly affect the accuracy of probabilistic matching.</p><p>As linkage errors disproportionately affect groups with poorer or less standardized demographic data, these disparities raise important clinical, public health, and ethical concerns. Fragmented records, misclassified cases, and biased population estimates can perpetuate inequities in care delivery, surveillance, and policy. Routine demographic-stratified evaluations of matching accuracy are therefore essential to detect and mitigate algorithmic bias. 
Future work will focus on improving the capture and standardization of sociodemographic data, developing fairness-aware linkage methods, and assessing bias from records with missing or discordant fields.</p><p>When implementing patient matching systems, performance should be monitored across subpopulations to identify prealgorithmic bias, characterize which groups are at risk, and assess the potential severity of downstream harms. Policymakers should advocate for consistent frameworks to assess matching algorithms, establish accountability mechanisms, and standardize the collection of race, ethnicity, and other important demographic factors. Improving the quality of demographic data and monitoring equity in linkage performance are critical to ensure that linked health data support fair, reliable, and inclusive clinical, research, and public health decision-making.</p></sec></sec></body><back><ack><p>Generative artificial intelligence was not used in any part of the manuscript creation.</p></ack><notes><sec><title>Funding</title><p>This work was supported by grants from the Agency for Healthcare Research and Quality and the Patient-Centered Outcomes Research Institute.</p></sec><sec><title>Data Availability</title><p>The data used for this study are subject to the Health Insurance Portability and Accountability Act and cannot be made publicly available. Access to these data for reproducibility should be negotiated through Indiana University and the Regenstrief Institute.</p></sec></notes><fn-group><fn fn-type="con"><p>SJG and CB designed the study in consultation with LL, AM, XL, and HX. LL, FO, XL, and HX provided data analysis. CB, XL, and HX provided interpretation of the study results. CB and KSA wrote the manuscript with input from SJG, XL, and HX. All authors read, reviewed, and contributed critical revisions to the manuscript. 
AG contributed supervision and oversight.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">DVR</term><def><p>distinct value ratio</p></def></def-item><def-item><term id="abb2">FS</term><def><p>Fellegi-Sunter</p></def></def-item><def-item><term id="abb3">INPC</term><def><p>Indiana Network for Patient Care</p></def></def-item><def-item><term id="abb4">MAR</term><def><p>missing at random</p></def></def-item><def-item><term id="abb5">MCPHD</term><def><p>Marion County Public Health Department</p></def></def-item><def-item><term id="abb6">MDR</term><def><p>missing data ratio</p></def></def-item><def-item><term id="abb7">MNAR</term><def><p>missing not at random</p></def></def-item><def-item><term id="abb8">PPV</term><def><p>positive predictive value</p></def></def-item><def-item><term id="abb9">SE</term><def><p>Shannon entropy</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McClellan</surname><given-names>MA</given-names> </name></person-group><article-title>Duplicate medical records: a survey of Twin Cities healthcare organizations</article-title><source>AMIA Annu Symp Proc</source><year>2009</year><month>11</month><day>14</day><volume>2009</volume><fpage>421</fpage><lpage>425</lpage><pub-id pub-id-type="medline">20351892</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Grannis</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Williams</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Kasthuri</surname><given-names>S</given-names> </name><name name-style="western"><surname>Murray</surname><given-names>M</given-names> 
</name><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name></person-group><article-title>Evaluation of real-world referential and probabilistic patient matching to advance patient identification strategy</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>07</month><day>12</day><volume>29</volume><issue>8</issue><fpage>1409</fpage><lpage>1415</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocac068</pub-id><pub-id pub-id-type="medline">35568993</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Crowley</surname><given-names>R</given-names> </name><name name-style="western"><surname>Daniel</surname><given-names>H</given-names> </name><name name-style="western"><surname>Cooney</surname><given-names>TG</given-names> </name><name name-style="western"><surname>Engel</surname><given-names>LS</given-names> </name><collab>Health and Public Policy Committee of the American College of Physicians</collab></person-group><article-title>Envisioning a better U.S. 
health care system for all: coverage and cost of care</article-title><source>Ann Intern Med</source><year>2020</year><month>01</month><day>21</day><volume>172</volume><issue>2 Suppl</issue><fpage>S7</fpage><lpage>S32</lpage><pub-id pub-id-type="doi">10.7326/M19-2415</pub-id><pub-id pub-id-type="medline">31958805</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Gill</surname><given-names>L</given-names> </name></person-group><source>Methods for Automatic Record Matching and Linkage and Their Use in National Statistics</source><year>2001</year><access-date>2026-02-05</access-date><publisher-name>Her Majesty's Stationery Office (HMSO)</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://books.google.co.in/books/about/Methods_for_Automatic_Record_Matching_an.html?id=nRWFAAAAIAAJ&#x0026;redir_esc=y">https://books.google.co.in/books/about/Methods_for_Automatic_Record_Matching_an.html?id=nRWFAAAAIAAJ&#x0026;redir_esc=y</ext-link></comment><pub-id pub-id-type="other">9781857744200</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gomatam</surname><given-names>S</given-names> </name><name name-style="western"><surname>Carter</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ariet</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mitchell</surname><given-names>G</given-names> </name></person-group><article-title>An empirical comparison of record linkage procedures</article-title><source>Stat Med</source><year>2002</year><month>05</month><day>30</day><volume>21</volume><issue>10</issue><fpage>1485</fpage><lpage>1496</lpage><pub-id pub-id-type="doi">10.1002/sim.1147</pub-id><pub-id pub-id-type="medline">12185898</pub-id></nlm-citation></ref><ref 
id="ref6"><label>6</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Winkler</surname><given-names>WE</given-names> </name></person-group><article-title>Overview of record linkage and current research directions</article-title><year>2006</year><access-date>2026-02-05</access-date><publisher-name>U.S. Census Bureau</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.census.gov/content/dam/Census/library/working-papers/2006/adrm/rrs2006-02.pdf">https://www.census.gov/content/dam/Census/library/working-papers/2006/adrm/rrs2006-02.pdf</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Culbertson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Goel</surname><given-names>S</given-names> </name><name name-style="western"><surname>Madden</surname><given-names>MB</given-names> </name><etal/></person-group><article-title>The building blocks of interoperability. 
A multisite analysis of patient demographic attributes available for matching</article-title><source>Appl Clin Inform</source><year>2017</year><month>04</month><day>5</day><volume>8</volume><issue>2</issue><fpage>322</fpage><lpage>336</lpage><pub-id pub-id-type="doi">10.4338/ACI-2016-11-RA-0196</pub-id><pub-id pub-id-type="medline">28378025</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ong</surname><given-names>TC</given-names> </name><name name-style="western"><surname>Hill</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kahn</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Lembcke</surname><given-names>LR</given-names> </name><name name-style="western"><surname>Schilling</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Grannis</surname><given-names>SJ</given-names> </name></person-group><article-title>Linkability measures to assess the data characteristics for record linkage</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>11</month><day>1</day><volume>31</volume><issue>11</issue><fpage>2651</fpage><lpage>2659</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae248</pub-id><pub-id pub-id-type="medline">39301630</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ash</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Ip-Lin</surname><given-names>K</given-names> </name></person-group><article-title>Embracing the sparse, noisy, and interrelated aspects of patient demographics for use in clinical medical record linkage</article-title><source>AMIA Jt Summits Transl Sci Proc</source><year>2015</year><volume>2015</volume><fpage>425</fpage><lpage>429</lpage><pub-id 
pub-id-type="medline">26306279</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Houston</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Probst</surname><given-names>Y</given-names> </name></person-group><article-title>Heterogeneity in clinical research data quality monitoring: a national survey</article-title><source>J Biomed Inform</source><year>2020</year><month>08</month><volume>108</volume><fpage>103491</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2020.103491</pub-id><pub-id pub-id-type="medline">32574794</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kahn</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Callahan</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Barnard</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A harmonized data quality assessment terminology and framework for the secondary use of electronic health record data</article-title><source>EGEMS (Wash DC)</source><year>2016</year><volume>4</volume><issue>1</issue><fpage>1244</fpage><pub-id pub-id-type="doi">10.13063/2327-9214.1244</pub-id><pub-id pub-id-type="medline">27713905</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weiskopf</surname><given-names>NG</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>C</given-names> </name></person-group><article-title>Methods and dimensions 
of electronic health record data quality assessment: enabling reuse for clinical research</article-title><source>J Am Med Inform Assoc</source><year>2013</year><month>01</month><day>1</day><volume>20</volume><issue>1</issue><fpage>144</fpage><lpage>151</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2011-000681</pub-id><pub-id pub-id-type="medline">22733976</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harron</surname><given-names>KL</given-names> </name><name name-style="western"><surname>Doidge</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Knight</surname><given-names>HE</given-names> </name><etal/></person-group><article-title>A guide to evaluating linkage quality for the analysis of linked data</article-title><source>Int J Epidemiol</source><year>2017</year><month>10</month><day>1</day><volume>46</volume><issue>5</issue><fpage>1699</fpage><lpage>1710</lpage><pub-id pub-id-type="doi">10.1093/ije/dyx177</pub-id><pub-id pub-id-type="medline">29025131</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Grath-Lone</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Libuy</surname><given-names>N</given-names> </name><name name-style="western"><surname>Etoori</surname><given-names>D</given-names> </name><name name-style="western"><surname>Blackburn</surname><given-names>R</given-names> </name><name name-style="western"><surname>Gilbert</surname><given-names>R</given-names> </name><name name-style="western"><surname>Harron</surname><given-names>K</given-names> </name></person-group><article-title>Ethnic bias in data linkage</article-title><source>Lancet Digit 
Health</source><year>2021</year><month>06</month><volume>3</volume><issue>6</issue><fpage>e339</fpage><pub-id pub-id-type="doi">10.1016/S2589-7500(21)00081-9</pub-id><pub-id pub-id-type="medline">34045000</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Getzen</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ungar</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mowery</surname><given-names>D</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Long</surname><given-names>Q</given-names> </name></person-group><article-title>Mining for equitable health: assessing the impact of missing data in electronic health records</article-title><source>J Biomed Inform</source><year>2023</year><month>03</month><volume>139</volume><fpage>104269</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2022.104269</pub-id><pub-id pub-id-type="medline">36621750</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bohensky</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Jolley</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sundararajan</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Data linkage: a powerful research tool with potential problems</article-title><source>BMC Health Serv Res</source><year>2010</year><month>12</month><day>22</day><volume>10</volume><issue>1</issue><fpage>346</fpage><pub-id pub-id-type="doi">10.1186/1472-6963-10-346</pub-id><pub-id pub-id-type="medline">21176171</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McDonald</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Overhage</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Barnes</surname><given-names>M</given-names> </name><etal/></person-group><article-title>The Indiana network for patient care: a working local health information infrastructure. An example of a working infrastructure collaboration that links data from five health systems and hundreds of millions of entries</article-title><source>Health Aff (Millwood)</source><year>2005</year><volume>24</volume><issue>5</issue><fpage>1214</fpage><lpage>1220</lpage><pub-id pub-id-type="doi">10.1377/hlthaff.24.5.1214</pub-id><pub-id pub-id-type="medline">16162565</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Grannis</surname><given-names>S</given-names> </name></person-group><article-title>The data-adaptive Fellegi-Sunter model for probabilistic record linkage: algorithm development and validation for incorporating missing data and field selection</article-title><source>J Med Internet Res</source><year>2022</year><month>09</month><day>29</day><volume>24</volume><issue>9</issue><fpage>e33775</fpage><pub-id pub-id-type="doi">10.2196/33775</pub-id><pub-id pub-id-type="medline">36173664</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name 
name-style="western"><surname>Grannis</surname><given-names>S</given-names> </name></person-group><article-title>A simple two-step procedure using the Fellegi-Sunter model for frequency-based record linkage</article-title><source>J Appl Stat</source><year>2022</year><volume>49</volume><issue>11</issue><fpage>2789</fpage><lpage>2804</lpage><pub-id pub-id-type="doi">10.1080/02664763.2021.1922615</pub-id><pub-id pub-id-type="medline">35909667</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>DuVall</surname><given-names>SL</given-names> </name><name name-style="western"><surname>Kerber</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>A</given-names> </name></person-group><article-title>Extending the Fellegi-Sunter probabilistic record linkage method for approximate field comparators</article-title><source>J Biomed Inform</source><year>2010</year><month>02</month><volume>43</volume><issue>1</issue><fpage>24</fpage><lpage>30</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2009.08.004</pub-id><pub-id pub-id-type="medline">19683070</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fellegi</surname><given-names>IP</given-names> </name><name name-style="western"><surname>Sunter</surname><given-names>AB</given-names> </name></person-group><article-title>A theory for record linkage</article-title><source>J Am Stat Assoc</source><year>1969</year><month>12</month><volume>64</volume><issue>328</issue><fpage>1183</fpage><lpage>1210</lpage><pub-id pub-id-type="doi">10.1080/01621459.1969.10501049</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Gupta</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Kasthurirathne</surname><given-names>SN</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name><etal/></person-group><article-title>A framework for a consistent and reproducible evaluation of manual review for patient matching algorithms</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>11</month><day>14</day><volume>29</volume><issue>12</issue><fpage>2105</fpage><lpage>2109</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocac175</pub-id><pub-id pub-id-type="medline">36305781</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shannon</surname><given-names>CE</given-names> </name></person-group><article-title>A mathematical theory of communication</article-title><source>Bell Sys Tech J</source><year>1948</year><month>07</month><volume>27</volume><issue>3</issue><fpage>379</fpage><lpage>423</lpage><pub-id pub-id-type="doi">10.1002/j.1538-7305.1948.tb01338.x</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khunti</surname><given-names>K</given-names> </name><name name-style="western"><surname>Routen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pareek</surname><given-names>M</given-names> </name></person-group><article-title>The need for improved collection and coding of ethnicity in health research</article-title><source>J Public Health (Oxf)</source><year>2021</year><month>06</month><day>7</day><volume>43</volume><issue>2</issue><fpage>e270</fpage><lpage>e272</lpage><pub-id 
pub-id-type="doi">10.1093/pubmed/fdaa198</pub-id><pub-id pub-id-type="medline">33283239</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Baker</surname><given-names>DW</given-names> </name><name name-style="western"><surname>Cameron</surname><given-names>KA</given-names> </name><name name-style="western"><surname>Feinglass</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Patients&#x2019; attitudes toward health care providers collecting information about their race and ethnicity</article-title><source>J Gen Intern Med</source><year>2005</year><month>10</month><volume>20</volume><issue>10</issue><fpage>895</fpage><lpage>900</lpage><pub-id pub-id-type="doi">10.1111/j.1525-1497.2005.0195.x</pub-id><pub-id pub-id-type="medline">16191134</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hansotte</surname><given-names>E</given-names> </name><name name-style="western"><surname>Bowman</surname><given-names>E</given-names> </name><name name-style="western"><surname>Gibson</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Dixon</surname><given-names>BE</given-names> </name><name name-style="western"><surname>Madden</surname><given-names>VR</given-names> </name><name name-style="western"><surname>Caine</surname><given-names>VA</given-names> </name></person-group><article-title>Supporting health equity through data-driven decision-making: a local health department response to COVID-19</article-title><source>Am J Public Health</source><year>2021</year><month>10</month><volume>111</volume><issue>S3</issue><fpage>S197</fpage><lpage>S200</lpage><pub-id pub-id-type="doi">10.2105/AJPH.2021.306421</pub-id><pub-id 
pub-id-type="medline">34709872</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Randall</surname><given-names>S</given-names> </name><name name-style="western"><surname>Brown</surname><given-names>A</given-names> </name><name name-style="western"><surname>Boyd</surname><given-names>J</given-names> </name><name name-style="western"><surname>Schnell</surname><given-names>R</given-names> </name><name name-style="western"><surname>Borgs</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ferrante</surname><given-names>A</given-names> </name></person-group><article-title>Sociodemographic differences in linkage error: an examination of four large-scale datasets</article-title><source>BMC Health Serv Res</source><year>2018</year><month>09</month><day>3</day><volume>18</volume><issue>1</issue><fpage>678</fpage><pub-id pub-id-type="doi">10.1186/s12913-018-3495-x</pub-id><pub-id pub-id-type="medline">30176856</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Christen</surname><given-names>P</given-names> </name></person-group><article-title>Data linkage: the big picture</article-title><source>Harv Data Sci Rev</source><year>2019</year><volume>1</volume><issue>2</issue><pub-id pub-id-type="doi">10.1162/99608f92.84deb5c4</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Herzog</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Scheuren</surname><given-names>F</given-names> </name><name name-style="western"><surname>Winkler</surname><given-names>WE</given-names> </name></person-group><article-title>Record linkage</article-title><source>WIREs 
Comput Stat</source><year>2010</year><month>09</month><volume>2</volume><issue>5</issue><fpage>535</fpage><lpage>543</lpage><pub-id pub-id-type="doi">10.1002/wics.108</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tromp</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ravelli</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Bonsel</surname><given-names>GJ</given-names> </name><name name-style="western"><surname>Hasman</surname><given-names>A</given-names> </name><name name-style="western"><surname>Reitsma</surname><given-names>JB</given-names> </name></person-group><article-title>Results from simulated data sets: probabilistic record linkage outperforms deterministic record linkage</article-title><source>J Clin Epidemiol</source><year>2011</year><month>05</month><volume>64</volume><issue>5</issue><fpage>565</fpage><lpage>572</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2010.05.008</pub-id><pub-id pub-id-type="medline">20952162</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harron</surname><given-names>K</given-names> </name><name name-style="western"><surname>Dibben</surname><given-names>C</given-names> </name><name name-style="western"><surname>Boyd</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Challenges in administrative data linkage for research</article-title><source>Big Data Soc</source><year>2017</year><month>12</month><day>5</day><volume>4</volume><issue>2</issue><fpage>2053951717745678</fpage><pub-id pub-id-type="doi">10.1177/2053951717745678</pub-id><pub-id pub-id-type="medline">30381794</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation 
citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Dusetzina</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Tyree</surname><given-names>S</given-names> </name><name name-style="western"><surname>Meyer</surname><given-names>AM</given-names> </name><etal/></person-group><source>Linking Data for Health Services Research: A Framework and Instructional Guide</source><year>2014</year><publisher-name>Agency for Healthcare Research and Quality (US)</publisher-name><pub-id pub-id-type="medline">25392892</pub-id><pub-id pub-id-type="other">9781505859430</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jaro</surname><given-names>MA</given-names> </name></person-group><article-title>Probabilistic linkage of large public health data files</article-title><source>Stat Med</source><year>1995</year><volume>14</volume><issue>5-7</issue><fpage>491</fpage><lpage>498</lpage><pub-id pub-id-type="doi">10.1002/sim.4780140510</pub-id><pub-id pub-id-type="medline">7792443</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Joffe</surname><given-names>E</given-names> </name><name name-style="western"><surname>Byrne</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Reeder</surname><given-names>P</given-names> </name><etal/></person-group><article-title>A benchmark comparison of deterministic and probabilistic methods for defining manual review datasets in duplicate records reconciliation</article-title><source>J Am Med Inform Assoc</source><year>2014</year><volume>21</volume><issue>1</issue><fpage>97</fpage><lpage>104</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2013-001744</pub-id><pub-id 
pub-id-type="medline">23703827</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sayers</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ben-Shlomo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Blom</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Steele</surname><given-names>F</given-names> </name></person-group><article-title>Probabilistic record linkage</article-title><source>Int J Epidemiol</source><year>2016</year><month>06</month><volume>45</volume><issue>3</issue><fpage>954</fpage><lpage>964</lpage><pub-id pub-id-type="doi">10.1093/ije/dyv322</pub-id><pub-id pub-id-type="medline">26686842</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Doidge</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Harron</surname><given-names>K</given-names> </name></person-group><article-title>Demystifying probabilistic linkage: common myths and misconceptions</article-title><source>Int J Popul Data Sci</source><year>2018</year><month>01</month><day>10</day><volume>3</volume><issue>1</issue><fpage>410</fpage><pub-id pub-id-type="doi">10.23889/ijpds.v3i1.410</pub-id><pub-id pub-id-type="medline">30533534</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Giang</surname><given-names>PH</given-names> </name></person-group><article-title>A machine learning approach to create blocking criteria for record linkage</article-title><source>Health Care Manag Sci</source><year>2015</year><month>03</month><volume>18</volume><issue>1</issue><fpage>93</fpage><lpage>105</lpage><pub-id 
pub-id-type="doi">10.1007/s10729-014-9276-0</pub-id><pub-id pub-id-type="medline">24777833</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>R&#x00F6;chner</surname><given-names>P</given-names> </name><name name-style="western"><surname>Rothlauf</surname><given-names>F</given-names> </name></person-group><article-title>Using machine learning to link electronic health records in cancer registries: on the tradeoff between linkage quality and manual effort</article-title><source>Int J Med Inform</source><year>2024</year><month>05</month><volume>185</volume><fpage>105387</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2024.105387</pub-id><pub-id pub-id-type="medline">38428200</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roos</surname><given-names>LL</given-names> </name><name name-style="western"><surname>Wall-Wieler</surname><given-names>E</given-names> </name><name name-style="western"><surname>Burchill</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hamm</surname><given-names>NC</given-names> </name><name name-style="western"><surname>Hamad</surname><given-names>AF</given-names> </name><name name-style="western"><surname>Lix</surname><given-names>LM</given-names> </name></person-group><article-title>Record linkage and big data&#x2014;enhancing information and improving design</article-title><source>J Clin Epidemiol</source><year>2022</year><month>10</month><volume>150</volume><fpage>18</fpage><lpage>24</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2022.06.006</pub-id><pub-id pub-id-type="medline">35760238</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Jaro</surname><given-names>MA</given-names> </name></person-group><article-title>Advances in record-linkage methodology as applied to matching the 1985 Census of Tampa, Florida</article-title><source>J Am Stat Assoc</source><year>1989</year><month>06</month><volume>84</volume><issue>406</issue><fpage>414</fpage><lpage>420</lpage><pub-id pub-id-type="doi">10.1080/01621459.1989.10478785</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sondik</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Lucas</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Madans</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>SS</given-names> </name></person-group><article-title>Race/ethnicity and the 2000 Census: implications for public health</article-title><source>Am J Public Health</source><year>2000</year><month>11</month><volume>90</volume><issue>11</issue><fpage>1709</fpage><lpage>1713</lpage><pub-id pub-id-type="doi">10.2105/ajph.90.11.1709</pub-id><pub-id pub-id-type="medline">11076236</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shaw</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Harron</surname><given-names>KL</given-names> </name><name name-style="western"><surname>Pescarini</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>Biases arising from linked administrative data for epidemiological research: a conceptual framework from registration to analyses</article-title><source>Eur J 
Epidemiol</source><year>2022</year><month>12</month><volume>37</volume><issue>12</issue><fpage>1215</fpage><lpage>1224</lpage><pub-id pub-id-type="doi">10.1007/s10654-022-00934-w</pub-id><pub-id pub-id-type="medline">36333542</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>NT</given-names> </name><name name-style="western"><surname>Barton</surname><given-names>G</given-names> </name><name name-style="western"><surname>Resnick</surname><given-names>P</given-names> </name></person-group><article-title>Algorithmic bias detection and mitigation: best practices and policies to reduce consumer harms</article-title><source>Brookings</source><year>2019</year><access-date>2026-02-05</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.brookings.edu/articles/algorithmic-bias-detection-and-mitigation-best-practices-and-policies-to-reduce-consumer-harms/">https://www.brookings.edu/articles/algorithmic-bias-detection-and-mitigation-best-practices-and-policies-to-reduce-consumer-harms/</ext-link></comment></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Obermeyer</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Powers</surname><given-names>B</given-names> </name><name name-style="western"><surname>Vogeli</surname><given-names>C</given-names> </name><name name-style="western"><surname>Mullainathan</surname><given-names>S</given-names> </name></person-group><article-title>Dissecting racial bias in an algorithm used to manage the health of populations</article-title><source>Science</source><year>2019</year><month>10</month><day>25</day><volume>366</volume><issue>6464</issue><fpage>447</fpage><lpage>453</lpage><pub-id 
pub-id-type="doi">10.1126/science.aax2342</pub-id><pub-id pub-id-type="medline">31649194</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>WH</given-names> </name><etal/></person-group><article-title>Algorithmic fairness in computational medicine</article-title><source>EBioMedicine</source><year>2022</year><month>10</month><volume>84</volume><fpage>104250</fpage><pub-id pub-id-type="doi">10.1016/j.ebiom.2022.104250</pub-id><pub-id pub-id-type="medline">36084616</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lopez</surname><given-names>P</given-names> </name></person-group><article-title>Bias does not equal bias: a socio-technical typology of bias in data-based algorithmic systems</article-title><source>Internet Policy Rev</source><year>2021</year><month>12</month><day>7</day><volume>10</volume><issue>4</issue><pub-id pub-id-type="doi">10.14763/2021.4.1598</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goldstein</surname><given-names>ND</given-names> </name><name name-style="western"><surname>Kahal</surname><given-names>D</given-names> </name><name name-style="western"><surname>Testa</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gracely</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Burstyn</surname><given-names>I</given-names> </name></person-group><article-title>Data quality in electronic health record research: an 
approach for validation and quantitative bias analysis for imperfectly ascertained health outcomes via diagnostic codes</article-title><source>Harv Data Sci Rev</source><year>2022</year><volume>4</volume><issue>2</issue><fpage>2</fpage><pub-id pub-id-type="doi">10.1162/99608f92.cbe67e91</pub-id><pub-id pub-id-type="medline">36324333</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Anderson</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Visweswaran</surname><given-names>S</given-names> </name></person-group><article-title>Algorithmic individual fairness and healthcare: a scoping review</article-title><source>JAMIA Open</source><year>2025</year><month>02</month><volume>8</volume><issue>1</issue><fpage>ooae149</fpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooae149</pub-id><pub-id pub-id-type="medline">39737346</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vyas</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Eisenstein</surname><given-names>LG</given-names> </name><name name-style="western"><surname>Jones</surname><given-names>DS</given-names> </name></person-group><article-title>Hidden in plain sight&#x2014;reconsidering the use of race correction in clinical algorithms</article-title><source>N Engl J Med</source><year>2020</year><month>08</month><day>27</day><volume>383</volume><issue>9</issue><fpage>874</fpage><lpage>882</lpage><pub-id pub-id-type="doi">10.1056/NEJMms2004740</pub-id><pub-id pub-id-type="medline">32853499</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Flanagin</surname><given-names>A</given-names> 
</name><name name-style="western"><surname>Frey</surname><given-names>T</given-names> </name><name name-style="western"><surname>Christiansen</surname><given-names>SL</given-names> </name><collab>AMA Manual of Style Committee</collab></person-group><article-title>Updated guidance on the reporting of race and ethnicity in medical and science journals</article-title><source>JAMA</source><year>2021</year><month>08</month><day>17</day><volume>326</volume><issue>7</issue><fpage>621</fpage><lpage>627</lpage><pub-id pub-id-type="doi">10.1001/jama.2021.13304</pub-id><pub-id pub-id-type="medline">34402850</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Belenguer</surname><given-names>L</given-names> </name></person-group><article-title>AI bias: exploring discriminatory algorithmic decision-making models and the application of possible machine-centric solutions adapted from the pharmaceutical industry</article-title><source>AI Ethics</source><year>2022</year><volume>2</volume><issue>4</issue><fpage>771</fpage><lpage>787</lpage><pub-id pub-id-type="doi">10.1007/s43681-022-00138-8</pub-id><pub-id pub-id-type="medline">35194591</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Matching performance parameters, stratified by sociodemographic groups across all datasets.</p><media xlink:href="formative_v10i1e78622_app1.docx" xlink:title="DOCX File, 39 KB"/></supplementary-material></app-group></back></article>