<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v7i1e50027</article-id>
      <article-id pub-id-type="pmid">38060305</article-id>
      <article-id pub-id-type="doi">10.2196/50027</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Traceable Research Data Sharing in a German Medical Data Integration Center With FAIR (Findability, Accessibility, Interoperability, and Reusability)-Geared Provenance Implementation: Proof-of-Concept Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Miksa</surname>
            <given-names>Tomasz</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Gierend</surname>
            <given-names>Kerstin</given-names>
          </name>
          <degrees>Dipl Inf (FH)</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Biomedical Informatics at the Center for Preventive Medicine and Digital Health</institution>
            <institution>Medical Faculty Mannheim, Heidelberg University</institution>
            <addr-line>Theodor-Kutzer-Ufer 1-3</addr-line>
            <addr-line>Mannheim, 68167</addr-line>
            <country>Germany</country>
            <phone>49 621383 ext 8087</phone>
            <email>kerstin.gierend@medma.uni-heidelberg.de</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0417-3454</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Waltemath</surname>
            <given-names>Dagmar</given-names>
          </name>
          <degrees>Dr -Ing</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5886-5563</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Ganslandt</surname>
            <given-names>Thomas</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6864-8936</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Siegel</surname>
            <given-names>Fabian</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9673-5030</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Biomedical Informatics at the Center for Preventive Medicine and Digital Health</institution>
        <institution>Medical Faculty Mannheim, Heidelberg University</institution>
        <addr-line>Mannheim</addr-line>
        <country>Germany</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Core Unit Data Integration Center and Medical Informatics Laboratory</institution>
        <institution>University Medicine Greifswald</institution>
        <addr-line>Greifswald</addr-line>
        <country>Germany</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Chair of Medical Informatics</institution>
        <institution>Friedrich-Alexander-Universität Erlangen-Nürnberg</institution>
        <addr-line>Erlangen</addr-line>
        <country>Germany</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Kerstin Gierend <email>kerstin.gierend@medma.uni-heidelberg.de</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>7</day>
        <month>12</month>
        <year>2023</year>
      </pub-date>
      <volume>7</volume>
      <elocation-id>e50027</elocation-id>
      <history>
        <date date-type="received">
          <day>16</day>
          <month>6</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>5</day>
          <month>10</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>25</day>
          <month>10</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>1</day>
          <month>11</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Kerstin Gierend, Dagmar Waltemath, Thomas Ganslandt, Fabian Siegel. Originally published in JMIR Formative Research (https://formative.jmir.org), 07.12.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2023/1/e50027" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Secondary investigations into digital health records, including electronic patient data from German medical data integration centers (DICs), pave the way for enhanced future patient care. However, only limited information is captured regarding the integrity, traceability, and quality of the (sensitive) data elements. This lack of detail diminishes trust in the validity of the collected data. From a technical standpoint, adhering to the widely accepted FAIR (Findability, Accessibility, Interoperability, and Reusability) principles for data stewardship necessitates enriching data with provenance-related metadata. Provenance offers insights into the readiness for the reuse of a data element and serves as a supplier of data governance.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The primary goal of this study is to augment the reusability of clinical routine data within a medical DIC for secondary utilization in clinical research. Our aim is to establish provenance traces that underpin the status of data integrity, reliability, and consequently, trust in electronic health records, thereby enhancing the accountability of the medical DIC. We present the implementation of a proof-of-concept provenance library integrating international standards as an initial step.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We adhered to a customized road map for a provenance framework, and examined the data integration steps across the ETL (extract, transform, and load) phases. Following a maturity model, we derived requirements for a provenance library. Using this research approach, we formulated a provenance model with associated metadata and implemented a proof-of-concept provenance class. Furthermore, we seamlessly incorporated the internationally recognized World Wide Web Consortium (W3C) provenance standard, aligned the resultant provenance records with the interoperable health care standard Fast Healthcare Interoperability Resources, and presented them in various representation formats. Ultimately, we conducted a thorough assessment of provenance trace measurements.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>This study marks the inaugural implementation of integrated provenance traces at the data element level within a German medical DIC. We devised and executed a practical method that synergizes the robustness of quality- and health standard–guided (meta)data management practices. Our measurements indicate commendable pipeline execution times, attaining notable levels of accuracy and reliability in processing clinical routine data, thereby ensuring accountability in the medical DIC. These findings should inspire the development of additional tools aimed at providing evidence-based and reliable electronic health record services for secondary use.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The research method outlined for the proof-of-concept provenance class has been crafted to promote effective and reliable core data management practices. It aims to enhance biomedical data by imbuing it with meaningful provenance, thereby bolstering the benefits for both research and society. Additionally, it facilitates the streamlined reuse of biomedical data. As a result, the system mitigates risks, as data analysis without knowledge of the origin and quality of all data elements is rendered futile. While the approach was initially developed for the medical DIC use case, these principles can be universally applied throughout the scientific domain.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>provenance</kwd>
        <kwd>traceability</kwd>
        <kwd>data management</kwd>
        <kwd>metadata</kwd>
        <kwd>data integrity</kwd>
        <kwd>data integration center</kwd>
        <kwd>medical informatics</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Provenance—a piece of metadata—is considered information that is fundamental in the data life cycle because it expresses the traceability of the processed data and facilitates the reproducibility of the results [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. The availability of provenance throughout the data life cycle is deemed a crucial factor for maintaining trust in the data at all stages [<xref ref-type="bibr" rid="ref3">3</xref>]. The data life cycle encompasses data generation, processing, validation, analysis, reporting, and application for decision-making in any context, culminating in storage within a specified retention period [<xref ref-type="bibr" rid="ref4">4</xref>]. Medical data integration centers (DICs), particularly those established within the German Medical Informatics Initiative, must enhance accountability for their activities. This is particularly crucial for the methods used in extracting, transforming, and loading sensitive patient data from heterogeneous clinical routine systems into (standardized) research data repositories for subsequent secondary use [<xref ref-type="bibr" rid="ref5">5</xref>]. In this given context, it is necessary to understand the limitations of the provided data [<xref ref-type="bibr" rid="ref6">6</xref>]. Collecting comprehensive and pertinent contextual provenance information along these processing pipelines is one approach to enhance the accountability of the medical DIC (<xref ref-type="boxed-text" rid="box1">Textbox 1</xref>). Provenance and integrity must be systematically evaluated and documented in routinely collected data sets to facilitate their reuse in clinical trials [<xref ref-type="bibr" rid="ref7">7</xref>].</p>
      <boxed-text id="box1" position="float">
        <title>Accountability in a German medical data integration center.</title>
        <p>Accountability means accepting responsibility for activities and in this context entails all procedures and processes for data managing pipelines [<xref ref-type="bibr" rid="ref8">8</xref>]. This includes keeping the movement of data elements transparent and traceable. Provenance traces enable documentation of this movement and hence generate trust in the data integrity and reliability of the provided data for secondary use.</p>
      </boxed-text>
      <p>To achieve reproducibility [<xref ref-type="bibr" rid="ref9">9</xref>] and integrity when exchanging data between academia and industry, researchers must adhere to essential research principles, particularly following good practice guidelines (eg, good clinical practice, good research/scientific practice, commonly referred to as GxP) [<xref ref-type="bibr" rid="ref10">10</xref>]. Ensuring and evaluating data integrity and data provenance are anticipated to be prerequisites for clinical trial data [<xref ref-type="bibr" rid="ref11">11</xref>]. For instance, the clinical research data quality standard ALCOA+ (Attributable, Legible, Contemporaneous, Original, and Accurate+) articulates enhanced data integrity properties and fundamentally contributes to provenance information [<xref ref-type="bibr" rid="ref12">12</xref>]. These properties pertain to attributable, legible, contemporaneous, original, accurate, complete, consistent, enduring, and available data characteristics [<xref ref-type="bibr" rid="ref10">10</xref>].</p>
      <p>In addition to adhering to good scientific practice [<xref ref-type="bibr" rid="ref13">13</xref>], heightened legal requirements such as compliance with the General Data Protection Regulation (GDPR) in the European Union, or contractual obligations, mandate evidence-based data processing for both deidentification and reidentification of data, encompassing the life cycle of the patient’s consent [<xref ref-type="bibr" rid="ref14">14</xref>].</p>
      <p>A crucial factor in advancing these objectives is the metadata acquired from the data transformation and integration process throughout the data life cycle. The field of biological research has already acknowledged the significance of metadata, as outlined in ISO norms such as ISO/CD 20961 [<xref ref-type="bibr" rid="ref15">15</xref>] and ISO/TC 276/WG5 on data processing and integration [<xref ref-type="bibr" rid="ref16">16</xref>]. ISO 20961, for example, specifies requirements for the consistent formatting and documentation of data and metadata.</p>
      <p>Furthermore, the FAIR (Findability, Accessibility, Interoperability, and Reusability) guiding principles for data management and data stewardship emphasize the overall relevance of metadata for the data itself, including those used in infrastructures and services [<xref ref-type="bibr" rid="ref17">17</xref>]. Aspects of the FAIR recommendations explicitly address provenance capture. As such, the “R1.2” FAIR principle demands machine-accessible and readable metadata, which include provenance information about the data creation or generation. Related metadata accumulate not only during the data transformation itself but also within the software used [<xref ref-type="bibr" rid="ref18">18</xref>]. The principle “R1.3” expects metadata to adhere to domain-relevant community standards such as the HL7 Fast Healthcare Interoperability Resources (FHIR) or Dublin Core [<xref ref-type="bibr" rid="ref1">1</xref>]. FHIR is an internationally recognized standard that supports the exchange of data between different software systems within the health care sector [<xref ref-type="bibr" rid="ref19">19</xref>]. In this vein, the FHIR resource “provenance” records entities and processes involved in creating a specific resource. From a technical point of view, the FHIR Provenance resource is founded on the framework of the open W3C standard PROV-Data Model definition and ontology [<xref ref-type="bibr" rid="ref20">20</xref>], the successor to the Open Provenance Model [<xref ref-type="bibr" rid="ref21">21</xref>]. Here, the concepts of linked entities, activities, and agent resources enable the establishment of a provenance model. Such resources can be described with the W3C Resource Description Framework (RDF) method [<xref ref-type="bibr" rid="ref22">22</xref>]. RDF is a data model, which is commonly stored in formats such as RDF/XML (.rdf) or JSON-LD (.json). All formats represent a knowledge graph.</p>
      <p>As of now, the capture of provenance in health care is not adequately or uniformly implemented in German medical DICs, as revealed in a recent study on their data management status [<xref ref-type="bibr" rid="ref23">23</xref>]. The results demonstrated that provenance is indeed a factor strongly influenced by the maturity level of data management practices. Following complex transformations in the data integration process, the provenance of data elements is often lost, making it difficult to impossible to assess the (measurement) quality of a data element. This reduction in traceability diminishes trust in the validity of the collected data.</p>
      <p>The primary objective of this study is to improve the reusability of clinical routine data within a medical DIC for its secondary application in clinical research. Our goal is to enhance processed clinical routine data by incorporating appropriate semantic metadata, a key requirement guided by the FAIR principles [<xref ref-type="bibr" rid="ref17">17</xref>]. Furthermore, our intention is to bolster the accountability of our DIC by mitigating the risks associated with the reuse of compromised data in clinical research.</p>
      <p>To our knowledge, this is the first demonstration of provenance integration within a medical DIC.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Materials</title>
        <p>We used test data to develop and test our provenance class. Test data elements were chosen to reflect the composition of a typical data integration repository. We created exemplary dummy data element definitions with comprehensive annotation (<xref ref-type="boxed-text" rid="box2">Textbox 2</xref>). We defined 7 data element types and generated 100,000 data elements for each data element type to generate a total of 700,000 provenance records using a Python (Python Software Foundation) script.</p>
        <boxed-text id="box2" position="float">
          <title>Exemplary dummy text–based data element definition.</title>
          <p>id=’syst_blood_pressure’,</p>
          <p>name=’syst_blood_pressure’,</p>
          <p>description=’Systolic Blood Pressure’,</p>
          <p>source=’stg_sap_vitalis’,</p>
          <p>source_variable=’SysBP’,</p>
          <p>destination=’dwh_vitalis’,</p>
          <p>destination_variable=’SBP’,</p>
          <p>description_of_transformation=’copy’,</p>
          <p>description_of_qualitycheck=’range check 80-160’,</p>
          <p>status_log=’passed date 12.May2022’,</p>
          <p>sop_name=’SOP p’,</p>
          <p>sop_version=’v1.5’,</p>
          <p>sop_status=’approved’,</p>
          <p>steward_name=’no name given’)</p>
        </boxed-text>
      </sec>
      <sec>
        <title>Proof-of-Concept Solution</title>
        <p>Following the tailor-made provenance framework [<xref ref-type="bibr" rid="ref3">3</xref>], we developed a proof-of-concept provenance solution. This framework complements a standard software engineering cycle (requirements, design, coding, testing, and implementation) with insights from a comprehensive literature search and uses established works as a guide to the users of the framework. The expanded requirements analysis is substantiated by the topics identified through the literature search. Details are described in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Overview of the road map steps.</p>
          </caption>
          <graphic xlink:href="formative_v7i1e50027_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Requirements Analysis</title>
        <sec>
          <title>Overview</title>
          <p>An interdisciplinary team of internal stakeholders in the University Medicine Mannheim-DIC (lead, medical experts, computer scientists, technical staff, and process owner of the ETL [extract, transform, and load] process) performed the requirements analysis for the research approach. Initially, we engaged in discussions, documented feedback, and obtained approval for our own data pipeline processes, based on the WH questions (what, when, where, who, why, how, which, whose). This was done to ensure accurate and risk-managed data processing pipelines. Our focus centered on questions related to data governance, annotation, documentation, interoperability, data integrity and accuracy, data sharing, and information technology operations. This emphasis aligns with a prior investigation on data management practices in German DICs [<xref ref-type="bibr" rid="ref23">23</xref>], where these questions were identified as integral to tracing patient data through the DICs.</p>
          <p>Building on the previous steps, we initiated the process by visualizing the scope definition (system border and context) of the planned provenance tracking systems. Using notation according to DeMarco [<xref ref-type="bibr" rid="ref24">24</xref>], we generated a data flow diagram. Following this, we documented the resultant requirements, representing them in free text and as a unified modeling language (UML) class diagram to address various requirements perspectives [<xref ref-type="bibr" rid="ref25">25</xref>].</p>
        </sec>
        <sec>
          <title>System Border and Context</title>
          <p>The context view (<xref rid="figure2" ref-type="fig">Figure 2</xref>) is used to delineate the scope of our system, establishing the boundary between functionalities that are considered in and out of scope. The system to be modeled, known as the Provenance Information System Traces (PISA), is depicted as a circle in the center (outlined by the dotted red line in <xref rid="figure2" ref-type="fig">Figure 2</xref>). At the conceptual level, we established the system border to encompass all aspects within the object scope. We delineated the system context (depicted in green as a freehand drawing) with aspects (A to H) that impact the planned provenance tracking system in our medical DIC. The processes that were modeled had been previously defined by local stakeholders and were influenced by the processes of the medical informatics initiative community [<xref ref-type="bibr" rid="ref5">5</xref>]. The core process, the ETL process (D), includes valid documents (G) (eg, statutes, standard operating procedures, European Union-GDPR) and the involvement of stakeholders within and beyond the organizational unit (H), representing the primary focus of our development efforts. Existing software and hardware systems (A–C), as well as the processes of secondary usage for data request (E) and long-term archiving (F), are outside the scope of this study.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Aspects in the system context and border of the Provenance Information System Traces (PISA). EU: European Union; GDPR: General Data Protection Regulation; SOP: standard operating procedure.</p>
            </caption>
            <graphic xlink:href="formative_v7i1e50027_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Data Flow</title>
          <p>Given the multitude of processes within a DIC, we confined our focus to the requirements related to the data integration process (<xref rid="figure2" ref-type="fig">Figure 2</xref>; ETL, letter D). We scrutinized the data flow and derived a data flow diagram, illustrating the functional requirements perspective (<xref rid="figure3" ref-type="fig">Figure 3</xref>). As part of the Medical Informatics Initiative, all DICs in Germany modeled a comparable, generic data flow. This data flow delineates the movement of data among processes (ETL), storage entities (staging area, data warehouse, FHIR server, and research data repository), and involved actors (staff in DIC, researcher, and trusted third party). Processes encapsulate functions responsible for transforming and processing data. These processes consume input data from diverse systems, manage these data, and convey the results to an output. Storage ensures data persistence, allowing processes to access the storage in read or write modes. Actors actively engage in information exchange with the system.</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Simplified general data flow diagram in the data integration center. The simplified general data flow diagram in the data integration center (DIC) provides information about components participating in data flow: different hospital or laboratory systems donating the data, the independent trust center (trusted third party) enabling the separate processing of identifying data (IDAT) and medical data (MDAT), the data integration center with the different integration phases staging, data warehouse, FHIR and the research data repository (RDR). Individual DICs may deviate from this general data flow. FHIR: Fast Healthcare Interoperability Resources.</p>
            </caption>
            <graphic xlink:href="formative_v7i1e50027_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Requirements Description</title>
          <p>In a previous publication, we conducted interviews with various German medical DICs [<xref ref-type="bibr" rid="ref23">23</xref>]. Through these interviews, we identified the most crucial requirements, emphasizing assessments of data quality, traceability, and information capability. Additionally, transparency in processing steps, workflows, and data sets emerged as a significant consideration. Other identified requirements encompassed aspects such as debugging or performance evaluation. Additionally, there was a focus on compliance with regulations, reproducibility, support of the scientific utilization process, increased confidence in data, and clear regulation of responsible parties [<xref ref-type="bibr" rid="ref23">23</xref>].</p>
          <p>In alignment with this study, we established preconditions and requirements along the data flow for implementing the provenance tracking system. We identified the intended features for the implementation of the PISA and derived the system’s requirements (<xref ref-type="table" rid="table1">Table 1</xref>). In general, PISA should have the capability to trace the complete production history of a data element while incorporating domain-specific characteristics of the data element. These provenance traces for an individual data element must be captured along the presented data flow.</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Requirements for the proof of concept for PISA<sup>a</sup>.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="90"/>
              <col width="470"/>
              <col width="440"/>
              <thead>
                <tr valign="top">
                  <td>Number</td>
                  <td>Requirements (functional and nonfunctional)</td>
                  <td>Explanation</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>1</td>
                  <td>PISA must have the capability to track the complete processing history of a data element, and the provenance information must be stored in a database. This encompasses all derivation steps performed on data elements during their processing steps.</td>
                  <td>It includes all the information (metadata) required for producing a specific data set or a data element while preserving its data integrity status. This encompasses details such as data source, data destination, method, tools, software, and versions used. The benchmark should align with the “entities” and “activities” components of the W3C model.</td>
                </tr>
                <tr valign="top">
                  <td>2</td>
                  <td>PISA must possess the capability to trace organizational responsibilities and the means used.</td>
                  <td>It includes information (metadata) about all the involved agents in producing a data set or data elements, such as staff, standard operating procedures, and guidance. The benchmark should align with the “agent” components of the W3C model.</td>
                </tr>
                <tr valign="top">
                  <td>3</td>
                  <td>PISA must be analyzable by an authorized user and capable of producing diverse representations and export formats for the provenance traces.</td>
                  <td>Detailed provenance traces are accessible and exportable to support evaluation by users, including formats such as log files, FHIR<sup>b</sup> provenance, W3C<sup>c</sup> RDF<sup>d</sup>/XML, and RDF/JSON-LD provenance.</td>
                </tr>
                <tr valign="top">
                  <td>4</td>
                  <td>PISA must be able to track the quality status and assessment of data elements.</td>
                  <td>The provenance information for a data element is expanded to include the quality status of the processed data element.</td>
                </tr>
                <tr valign="top">
                  <td>5</td>
                  <td>PISA must be able to track the status of the script execution.</td>
                  <td>At a minimum, the provenance information should encompass the verification status and time stamp of the processed scripts.</td>
                </tr>
                <tr valign="top">
                  <td>7</td>
                  <td>PISA must provide a high level of ease of use for ETL<sup>e</sup> programmers and should be usable without requiring in-depth knowledge of provenance terms and concepts.</td>
                  <td>PISA should facilitate easy integration into ETL pipelines with transfer interfaces, allowing seamless integration with established technologies. Moreover, it must be easy to install, for example, by supporting widely used and easily set up databases.</td>
                </tr>
                <tr valign="top">
                  <td>8</td>
                  <td>PISA must be time-efficient and capable of ensuring acceptable performance.</td>
                  <td>Time measurements per data element must take place and be evaluated to verify the feasibility of the proof-of-concept approach.</td>
                </tr>
                <tr valign="top">
                  <td>9</td>
                  <td>Verification by unit tests/code coverage &#62;80%</td>
                  <td>Passed testing results.</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table1fn1">
                <p><sup>a</sup>PISA: Provenance Information System Traces.</p>
              </fn>
              <fn id="table1fn2">
                <p><sup>b</sup>FHIR: Fast Healthcare Interoperability Resources.</p>
              </fn>
              <fn id="table1fn3">
                <p><sup>c</sup>W3C: World Wide Web Consortium.</p>
              </fn>
              <fn id="table1fn4">
                <p><sup>d</sup>RDF: Resource Description Framework.</p>
              </fn>
              <fn id="table1fn5">
                <p><sup>e</sup>ETL: extract, transform, and load.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
      </sec>
      <sec>
        <title>Design and Architecture of the Provenance Class</title>
        <sec>
          <title>Development of the Logical Data Model</title>
          <p>Based on the aforementioned requirements (<xref ref-type="table" rid="table1">Table 1</xref>) and the DIC maturity model [<xref ref-type="bibr" rid="ref23">23</xref>], we constructed the logical data model as a UML class diagram, identifying classes and their associations (<xref rid="figure4" ref-type="fig">Figure 4</xref>).</p>
          <fig id="figure4" position="float">
            <label>Figure 4</label>
            <caption>
              <p>The logical data model as UML class diagram (technology-agnostic).</p>
            </caption>
            <graphic xlink:href="formative_v7i1e50027_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Metadata Strategy</title>
          <p>Our metadata strategy centered on characterizing the data elements and their associated artifacts throughout their processing pipeline.</p>
          <p>Aligned with the requirements and the logical model, we extracted the pertinent provenance metadata and aligned this provenance profile with the W3C components entity, agent, and activity. Simultaneously, we diligently enforced documentation efforts and annotation, guided by good documentation practices such as the ALCOA(+) principles for the identified components [<xref ref-type="bibr" rid="ref10">10</xref>]. The annotation process we implemented enhanced the comprehension, increased understanding, and improved the traceability of the processed data elements.</p>
          <p>The FAIR principles R1.2 and R1.3 guided us to enrich (R) data elements with meaningful (provenance) metadata. Consequently, we characterized data elements by collecting content-rich contextual and technical metadata that narrate the story of the entire data processing workflow and link to related artifacts (<xref ref-type="table" rid="table2">Table 2</xref>). During the transformation processes, we documented quality procedures and incorporated coding practices and versioning information.</p>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Levels of contextual and technical metadata and their related FHIR<sup>a</sup> mapping: a mapping example of our metadata to the FHIR Provenance resource. The FHIR Provenance elements are aligned with the W3C<sup>b</sup> PROV model elements.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="140"/>
              <col width="340"/>
              <col width="220"/>
              <col width="300"/>
              <thead>
                <tr valign="top">
                  <td>Level<sup>c</sup></td>
                  <td>Description<sup>d</sup></td>
                  <td>Possible mapping<sup>e</sup></td>
                  <td>Exemplified output<sup>f</sup></td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Data Governance<sup>g</sup></td>
                  <td>Name and version of the standard operating procedures or regulation (eg, “DIC_ETL-ST.pdf, v1, approved”)</td>
                  <td>.policy<break/>.agent.type</td>
                  <td>“policy” : [“<ext-link ext-link-type="uri" xlink:href="http://example.org/policy/1234" xlink:type="simple">http://example.org/policy/1234</ext-link>”],<break/>“location”: {<break/>“reference”: “DIC”<break/>},</td>
                </tr>
                <tr valign="top">
                  <td>Data Owner</td>
                  <td>Name of the (hospital) department and the responsible person owning the patient data (eg, physician or stakeholder name)</td>
                  <td>.authorization<break/>.agent<break/>.agent.type<break/>.agent.role<break/>.agent.who<break/>.agent.onBehalfOf</td>
                  <td>“authorization”: {<break/>“coding”: [<break/>{<break/>“system”: “http://terminology.hl7.org/CodeSystem/v3-ActReason”,<break/>“code”: “TRANSRCH”<break/>}<break/>]<break/>},</td>
                </tr>
                <tr valign="top">
                  <td>Data Steward</td>
                  <td>Name of the responsible data steward (eg, person who takes care of data management)</td>
                  <td>.location<break/>.agent<break/>.agent.type<break/>.agent.role<break/>.agent.who<break/>.agent.onBehalfOf</td>
                  <td>“agent”: {<break/>“who”: {<break/>“display”: “Hr. Koch”<break/>}<break/>}</td>
                </tr>
                <tr valign="top">
                  <td>Data Store</td>
                  <td>Used input or created output data file as part of the processing pipeline (eg, name original source system and name target system)</td>
                  <td>.entity<break/>.entity.role<break/>.entity.what<break/>.target (as mapping from entity)</td>
                  <td>“entity”: {<break/>“what”: {<break/>“identifier”: [<break/>{<break/>“system”: “urn:ietf:rfc:3986”,<break/>“value”: “243c773b-8936-407e-9c23-270d0ea49cc4”,<break/>“display”: “”<break/>}<break/>]<break/>}<break/>}</td>
                </tr>
                <tr valign="top">
                  <td>Data Script</td>
                  <td>Scripts or programs developed to process the data with a description of script version and name and creator (eg, etl_st.py v1 MZ)</td>
                  <td>.activity<break/>.basedOn<break/>.agent.type</td>
                  <td>“activity”: {<break/>“coding”: [<break/>{<break/>“system”: “http://terminology.hl7.org/CodeSystem/iso-21089-lifecycle”,<break/>“code”: “averaging”,<break/>“display”: “Transform”<break/>}<break/>]<break/>}<break/>“basedOn”: [<break/>{<break/>“reference” : “ServiceRequest”<break/>}<break/>]</td>
                </tr>
                <tr valign="top">
                  <td>Data Element</td>
                  <td>Individual characteristics per data element during a processing step such as ID, name, description, source and destination information from Data Store Level, description of the transformation approach, description of quality check (testing and validation approach), privacy and security status, and information from Script Level</td>
                  <td>.entity<break/>.entity.role<break/>.entity.what<break/>.entity.agent</td>
                  <td>Schema as in Data Store Level</td>
                </tr>
                <tr valign="top">
                  <td>Data Provenance</td>
                  <td>References to all other mentioned levels and testimony for quality (eg, “25, 3, 5, good, 2023-02-03 06:01:34”)</td>
                  <td>.id<break/>.occurredDateTime<break/>.recorded<break/>.patient<break/>.encounter<break/>.target</td>
                  <td>“id” : “id”,<break/>“occurredDateTime”: “timestamp”,<break/>“recorded”: “timestamp”</td>
                </tr>
                <tr valign="top">
                  <td>Data Infrastructure<sup>g</sup></td>
                  <td>Used hardware and software conditions during data processing</td>
                  <td>N/A<sup>h</sup></td>
                  <td>N/A</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table2fn1">
                <p><sup>a</sup>FHIR: Fast Healthcare Interoperability Resources.</p>
              </fn>
              <fn id="table2fn2">
                <p><sup>b</sup>W3C: World Wide Web Consortium.</p>
              </fn>
              <fn id="table2fn3">
                <p><sup>c</sup>Level corresponds to the maturity level of the data integration center.</p>
              </fn>
              <fn id="table2fn4">
                <p><sup>d</sup>Description of the possible content or annotation.</p>
              </fn>
              <fn id="table2fn5">
                <p><sup>e</sup>Possible mapping to the Health Level 7 FHIR resource “Provenance.”</p>
              </fn>
              <fn id="table2fn6">
                <p><sup>f</sup>One possible exemplified output extract as a serialization in FHIR JSON.</p>
              </fn>
              <fn id="table2fn7">
                <p><sup>g</sup>Not yet or only partly implemented.</p>
              </fn>
              <fn id="table2fn8">
                <p><sup>h</sup>N/A: not applicable.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <p>Examples of expanded metadata elements are more detailed descriptions of the transformation, the quality check, and the status of the data element in scope, or the results of the used log files. The metadata gathering for provenance comprises both manual annotation and an automated collection process, representing a hybrid form of provenance [<xref ref-type="bibr" rid="ref26">26</xref>].</p>
        </sec>
        <sec>
          <title>Ontology</title>
          <p>We organized, annotated, and represented information using WebProtégé 4.0.2 (Protege Team in the Biomedical Informatics Research Group at Stanford University), a tool designed for collaboratively creating complex ontologies [<xref ref-type="bibr" rid="ref27">27</xref>]. The W3C PROV ontology and the fundamental relationships between entities, activities, and agents served as a framework for representing the provenance graph [<xref ref-type="bibr" rid="ref20">20</xref>]. More specifically, we mapped processes onto activities, actors onto agents, and input/output data onto entities. The attributes of the provenance data model were aligned with the attributes of the data set. An instantiation of the provenance model, reflecting the W3C PROV vocabulary and layout convention, is illustrated in <xref rid="figure5" ref-type="fig">Figure 5</xref>. Additionally, the W3C PROV supports interoperable interchange of provenance in heterogeneous environments.</p>
          <fig id="figure5" position="float">
            <label>Figure 5</label>
            <caption>
              <p>Exemplary instantiation of the provenance information model. SOP: standard operating procedure.</p>
            </caption>
            <graphic xlink:href="formative_v7i1e50027_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Implementation and Verification Approach</title>
          <p>Finally, building on the preceding steps, we developed an open-source Python class “Data Provenance” with associated methods, and validated our approach in an exemplified data integration pipeline [<xref ref-type="bibr" rid="ref28">28</xref>]. Provenance traces were mapped exemplarily onto the W3C RDF/XML and HL7 FHIR resource “Provenance” in its current maturity level (version R5). We utilized peewee (version 3.15.4), a Python Object-Relational Mapping library that supports the binding of objects to relational databases such as SQLite, MySQL, or PostgreSQL [<xref ref-type="bibr" rid="ref29">29</xref>]. To visualize the provenance traces, we used the Mermaid plotting framework [<xref ref-type="bibr" rid="ref30">30</xref>].</p>
          <p>The verification and validation approach for the developed provenance class involved an independent code review and unit tests to ensure that the code meets the requirements of the design. We assessed efficiency (storage space in kilobytes and computing time) and ensured the maintainability of the program (code structure, modularity, comments in code, currency, and comprehensibility of documentation).</p>
          <p>While creating provenance records, we conducted a runtime experiment to measure the performance of our developed class. We recorded the time that the program took to run for proper execution. The runtime environment comprised the operating system Ubuntu 22.04.2 LTS (Canonical Ltd.), 32 GB memory, and an 8-core Intel Xeon Platinum 8276 CPU @ 2.20-GHz computer.</p>
          <p>As a runtime environment, we used a virtual machine running on top of the machine. The runtime period was defined as the duration when the program was actively running.</p>
          <p>We conducted measurements per data element and per provenance record on 9 virtual machines, each utilizing different data element block sizes (starting with 1, 10, 100, 1000, 10,000, and 100,000 up to 9, 90, 900, 9000, 90,000, and 900,000 data elements). For the analysis of runtime measurements, we used R version 4.2.0 (2022; R Foundation for Statistical Computing), and figures were generated using the ggplot2 package [<xref ref-type="bibr" rid="ref31">31</xref>].</p>
          <p>The code is available in a git repository under the Massachusetts Institute of Technology (MIT) license [<xref ref-type="bibr" rid="ref32">32</xref>].</p>
        </sec>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>Given the nature of the proof-of-concept study relying on dummy test data, ethics approval, informed consent, and deidentification were not applicable.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Provenance Traces Representation</title>
        <p>All the gathered provenance information is in a machine-readable format. Additionally, FHIR health care standards were used [<xref ref-type="bibr" rid="ref33">33</xref>].</p>
        <p>We developed an FHIR profile based on the “provenance” resource, resulting in a record that delineates the entities and processes involved in producing, delivering, or otherwise influencing that resource. This was accomplished by mapping the contextual and technical metadata to the corresponding resource provenance elements (<xref ref-type="table" rid="table2">Table 2</xref>).</p>
        <p>Through the integration of all metadata levels, we facilitated the traceability of each data element. We illustrated the traceability using a data flow diagram and presented it in a human-readable text form. Additionally, the provenance information was exported into various formats such as FHIR-JSON, W3C-RDF/XML, W3C-RDF/JSON-LD, or a text-based log file. This approach aligns with data obtained in other studies [<xref ref-type="bibr" rid="ref34">34</xref>].</p>
      </sec>
      <sec>
        <title>Measurement of Provenance Traces</title>
        <p>As anticipated, the specified provenance class successfully generated the database and the metadata tables according to the UML class diagram (illustrated in <xref ref-type="table" rid="table2">Table 2</xref>). Provenance records were automatically appended to the provenance table throughout the execution of the exemplified data integration pipeline. We recorded runtime measurements of the algorithm, displayed separately for the storage duration of a data element and for a record, as well as the corresponding increase in the database (<xref rid="figure6" ref-type="fig">Figure 6</xref>). As evident, the runtime complexity of the algorithm per data element indicates a nearly linear relationship with the size of the input data.</p>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Provenance-Runtime-Experiment presenting storage duration per element and per record.</p>
          </caption>
          <graphic xlink:href="formative_v7i1e50027_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>We observed an acceptable runtime duration ranging from 0.0039 to 0.02601 seconds per data element. However, when measuring the runtime for a provenance record, we encountered an increasing duration, ranging from 0.0271 to 0.1882 seconds. Given that our approach incorporates novel aspects, we were unable to find comparable studies for this measurement. Nevertheless, the data obtained here suggest that using this approach to establish provenance traces can yield accurate and timely information.</p>
      </sec>
      <sec>
        <title>Verification and Validation</title>
        <p>The validation status for our proof-of-concept provenance class is outlined in <xref ref-type="table" rid="table3">Table 3</xref>. We anticipate that our results can be readily adopted for additional metadata components and seamlessly transferred to decision-making applications.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Validation status of requirements.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="240"/>
            <col width="760"/>
            <thead>
              <tr valign="top">
                <td>Requirement number</td>
                <td>Validation result</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>Introduction of metadata for data elements and their processing collected automatically during ETL<sup>a</sup> job running in data flow. Relevant tables (DataProvenance, DataElement, and associated tables) in the provenance database were created and continuously updated during processing.</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>Organizational topics (DataGovernance, DataSteward, and DataOwner) were recorded in the provenance database and continuously updated during processing.</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>Provenance traces were created in different formats. Detailed provenance traces are accessible and exportable to support evaluation by users (eg, FHIR<sup>b</sup> provenance, W3C<sup>c</sup> RDF<sup>d</sup>/XML, and RDF/JSON-LD provenance).</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>The quality status of a processed data element is tracked and currently presented with a placeholder value in the DataProvenance table (see the “Future Work” section).</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>The verification status of used scripts and time stamps were recorded in the table DataElement.<break/>More specific content-related provenance information needs to be added in the second step. This comprises detailed annotation about the performed transactions and can be used for handling inconsistencies and rules for conflict resolution (see the “Future Work” section).</td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td>Easy integration into the ETL pipeline: the setup requires only 3 lines of code, and the setup per data element requires 1 line (see the “Future Work” section).</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td>Time measurements confirmed satisfying results.</td>
              </tr>
              <tr valign="top">
                <td>9</td>
                <td>We achieved a code coverage of &#62;90%, confirming that the code is comprehensively verified (quality aspect for software). We successfully verified the provenance with unit tests and validated all results against the defined requirements.</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>ETL: extract, transform, and load.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>FHIR: Fast Healthcare Interoperability Resources.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>W3C: World Wide Web Consortium.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>RDF: Resource Description Framework.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Our study introduces the first ready-to-use library designed to record provenance information from clinical data processing pipelines in a German medical DIC. This current research extends previous work in provenance by using an approach that systematically combines detailed insights from medical, data management, and information technology operational experts. This method aims to facilitate the reuse of enriched patient data with precision and rigor. We demonstrated that our research approach successfully facilitates the implementation of traceability in the processing of data elements. This, in turn, contributes to the promotion of good data management and documentation practices, ultimately ensuring sufficient provenance quality. Furthermore, these good practices pave the way for the (automated) generation of annotations [<xref ref-type="bibr" rid="ref23">23</xref>] and prevent poor data integrity, thereby enhancing data quality [<xref ref-type="bibr" rid="ref35">35</xref>]. Through this, we hypothesize that our work could contribute to the reliability and safety of quality-assured patient data for secondary use. Simultaneously, we mitigate the risks associated with the reuse of weak data in clinical research.</p>
        <p>We fulfilled the requirement for FAIR (Findability, Accessibility, Interoperability, and Reusability) provenance information by adhering to standards for syntactic and semantic interoperability, including JSON, W3C PROV, and FHIR mapping. Compared with the FHIR resource Provenance, we noted that our metadata recording offers significantly more detailed contextual information for each data element. We suggest that improvements to the FHIR Provenance resource, particularly for data within medical DICs, be deliberated and harmonized with existing FHIR resources such as “AuditEvent” or the “FiveWs Pattern” [<xref ref-type="bibr" rid="ref19">19</xref>].</p>
        <p>The strengths of this study are (1) the provision of provenance information for data elements with export options to interchange standard formats such as FHIR-JSON or W3C RDF/XML; (2) the simplicity of integrating this provenance class into ETL and other data pipelines; and (3) the extensibility of metadata components along with acceptable runtime measurements.</p>
      </sec>
      <sec>
        <title>Related Work</title>
        <p>In general, research on provenance and related management has progressed significantly in recent years. Numerous studies have been conducted, both domain specific and domain independent, focusing on provenance. Recently submitted scoping review results on provenance tracking have yielded valuable insights and provided an extensive summary of current approaches and criteria [<xref ref-type="bibr" rid="ref3">3</xref>]. The scoping review revealed technical, implementation, and knowledge gaps, with a specific emphasis on modeling and metadata frameworks for (sensitive) scientific biomedical data. Moreover, the primary focus of the research was centered on workflow provenance. This involved the utilization of models such as the Open Provenance Model or the W3C PROV data model across various semantic levels and tools in scientific workflows or experiments, as demonstrated in frameworks such as BioWorkbench or the OpenPREDICT use case [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. Additionally, other work has delved into different yet more general approaches for metadata usage and harvesting [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. A systematic literature analysis on functional requirements for medical data integration outlined general requirements for data traceability and metadata management [<xref ref-type="bibr" rid="ref40">40</xref>].</p>
        <p>While these prior efforts are crucial, they still lack the specific requirements and considerations tailored for a DIC use case. By contrast, our approach is finely tuned to the unique needs of a DIC, providing a comprehensive exploration of provenance that imparts medical meaning and understanding to the data elements, thereby enhancing their reusability.</p>
      </sec>
      <sec>
        <title>Lessons Learned</title>
        <p>We discovered that interdisciplinary competence profiles; fostering communication between medical experts, data stewards, and information technology developers; and establishing a common language were pivotal factors leading to significant progress in our specific DIC use case. Implementing proper data governance and comprehensive data management documentation, such as data management plans, would be instrumental in mitigating the risk of incorrect use of the data.</p>
        <p>The lessons learned from our description could serve as motivation for other researchers aiming to establish FAIR-oriented provenance. This would not only advance the reuse of their research data and results but also underscore the importance of maintaining overall responsibility for the data, even after project funding concludes.</p>
      </sec>
      <sec>
        <title>Future Work</title>
        <p>Future work should also prioritize the development of a strategy for assessing data privacy, data integrity, and related quality of a data element. Integrating this information into the framework would enhance the expressiveness of the provenance information and enable the derivation of quality dimensions. For this reason, data elements may need to be accompanied by additional properties (refer to <xref ref-type="table" rid="table2">Table 2</xref>) that are significant for interpretability, helping determine limitations or detect duplications for use in similar research studies. Addressing the adequacy and relevance of the data element for upcoming research questions aids in supporting interpretation and, consequently, the reuse of a data element, as already highlighted in a draft Food and Drug Administration guidance [<xref ref-type="bibr" rid="ref41">41</xref>]. To facilitate easy integration with other programming languages, we will provide an application programming interface.</p>
        <p>Future studies should also explore ways to enhance the script for generating the provenance class in alignment with the FAIR for Research Software Principles [<xref ref-type="bibr" rid="ref42">42</xref>]. Determining appropriate software metadata that accurately describe the specific characteristics of the software is an essential aspect to be addressed [<xref ref-type="bibr" rid="ref18">18</xref>].</p>
        <p>Before the future implementation and integration of the provenance class into real-world data integration processes, it is advisable to seek recommendations for risk measures. Factors such as the confidentiality level and security of provenance information, storage considerations, performance issues, and scalability should be carefully considered. In addition, it is crucial to consider experiences gained from maintaining metadata management and interoperable technologies, especially from professional data stewards. Ongoing exchanges with stakeholders and conducting usability evaluations are essential aspects that should be taken into account.</p>
        <p>This work also contributes to a broader community project that seeks to establish the “Minimal Requirements for Automated Provenance Information Enrichment” (MIRAPIE) project [<xref ref-type="bibr" rid="ref43">43</xref>].</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>As the library has only been tested with simulated data, the next step—testing in a real environment—is currently in preparation. Despite the straightforward ETL integration approach, we will carefully assess the complexity and associated costs of implementation within the medical DIC. We recognize the need to bolster the overall qualification and validation concept. We believe it is crucial to expand the current provenance class to one that is inspection- or audit-ready, although accreditation demands additional measures and efforts. Additionally, further scalability analysis should be incorporated into the research approach.</p>
        <p>Trust involves more than just the provenance of data elements; it also implies correctness and security against malicious users. This challenge can only be addressed through technical access limitations and organizational measures. Nevertheless, automated provenance traces can contribute to building trust in the transformation and movement of data within the DIC. Moreover, it empowers us to confidently assess the quality and validity of the original data points even after undergoing complex transformations within a data warehouse.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>We have designed, developed, and implemented provenance traces at the data element level for a German medical DIC, with the potential for extension at the national level. The described research method for the proof-of-concept provenance class has been crafted to promote effective and reliable core data management practices, enriching biomedical data with meaningful provenance. This, in turn, strengthens the benefits for research and society while simplifying the reuse of biomedical data. While the approach was initially developed for the medical DIC use case, these principles can be applied universally throughout the scientific domain. The implementation and analysis of provenance traces play a crucial role in minimizing risks associated with undetected or unintended data integrity breaches. Hence, provenance traces significantly contribute to building trust in routine clinical data and enhancing the accountability of a medical DIC. We are confident that by adhering to this advanced practice, the existing gaps between industry (pharmaceutical companies), service providers, and academia can be mitigated. Consequently, this can lead to an increase in the secondary use of (sensitive) patient data in clinical investigations.</p>
        <p>The outcomes of our research prompt additional questions, particularly regarding how in-depth exploration of further provenance analysis can predict the quality of data using machine learning methods. The limitations identified in our study indicate the need for further investigations into provenance theory, standards, and practices in the clinical field.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ALCOA</term>
          <def>
            <p>Attributable, Legible, Contemporaneous, Original, and Accurate</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">DIC</term>
          <def>
            <p>data integration center</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">ETL</term>
          <def>
            <p>extract, transform, and load</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">FAIR</term>
          <def>
            <p>Findability, Accessibility, Interoperability, and Reusability</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">FHIR</term>
          <def>
            <p>Fast Healthcare Interoperability Resources</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">GDPR</term>
          <def>
            <p>General Data Protection Regulation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">MIRAPIE</term>
          <def>
            <p>Minimal Requirements for Automated Provenance Information Enrichment</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">MIT</term>
          <def>
            <p>Massachusetts Institute of Technology</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">PISA</term>
          <def>
            <p>Provenance Information System Traces</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">RDF</term>
          <def>
            <p>Resource Description Framework</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">UML</term>
          <def>
            <p>unified modeling language</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">W3C</term>
          <def>
            <p>World Wide Web Consortium</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This research is funded by the “Digitale Forschung” project of Baden-Wuerttemberg, Germany, and by the German Federal Ministry of Education and Research within the German Medical Informatics Initiative with the grant 01ZZ1801E (Medical Informatics in Research and Care in University Medicine). This work was part of the precondition for the first author to obtain the degree Dr. sc. hum. from Heidelberg University. For the publication fee, we acknowledge financial support by Deutsche Forschungsgemeinschaft within the funding program “Open Access Publikationskosten” as well as by Heidelberg University.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The code of the provenance class is provided in a git repository [<xref ref-type="bibr" rid="ref32">32</xref>].</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>KG contributed substantially to the methodology, coding, implementation, testing, validation, analysis, visualization, and interpretation of the data; drafted all sections of the manuscript, performed data curation, coordinated reviewing, incorporated the comments from the co-authors, and submitted the paper. DW contributed to the discussion of the general provenance concept, reviewed, and revised the manuscript. TG reviewed and revised the manuscript. FS contributed to the discussion of the methodology, performed a code review, supported implementation, reviewed, and revised the manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <article-title>2018</article-title>
          <source>Metadata Basics</source>
          <access-date>2023-02-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.dublincore.org/resources/metadata-basics/">https://www.dublincore.org/resources/metadata-basics/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Douthit</surname>
              <given-names>BJ</given-names>
            </name>
            <name name-style="western">
              <surname>Del Fiol</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Staes</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Docherty</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Richesson</surname>
              <given-names>RL</given-names>
            </name>
          </person-group>
          <article-title>A Conceptual Framework of Data Readiness: The Contextual Intersection of Quality, Availability, Interoperability, and Provenance</article-title>
          <source>Appl Clin Inform</source>
          <year>2021</year>
          <month>05</month>
          <day>21</day>
          <volume>12</volume>
          <issue>3</issue>
          <fpage>675</fpage>
          <lpage>685</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.thieme-connect.com/DOI/DOI?10.1055/s-0041-1732423"/>
          </comment>
          <pub-id pub-id-type="doi">10.1055/s-0041-1732423</pub-id>
          <pub-id pub-id-type="medline">34289504</pub-id>
          <pub-id pub-id-type="pmcid">PMC8294946</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gierend</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Krüger</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Genehr</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hartmann</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Siegel</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Waltemath</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ganslandt</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Zeleke</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Capturing provenance information for biomedical data and workflows: A scoping review</article-title>
          <source>Research Square. Preprint posted online on February 09, 2023</source>
          <pub-id pub-id-type="doi">10.21203/rs.3.rs-2408394/v1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Symons</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Agapow</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Teo</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Paxton</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Abdi</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mattie</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Davie</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Torres</surname>
              <given-names>AZ</given-names>
            </name>
            <name name-style="western">
              <surname>Folarin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sood</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Halamka</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Eapen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Budhdeo</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Best practices in the real-world data life cycle</article-title>
          <source>PLOS Digit Health</source>
          <year>2022</year>
          <month>01</month>
          <day>18</day>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>e0000003</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812509"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000003</pub-id>
          <pub-id pub-id-type="medline">36812509</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-21-00033</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931348</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Semler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wissing</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Heyder</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>German Medical Informatics Initiative</article-title>
          <source>Methods Inf Med</source>
          <year>2018</year>
          <month>07</month>
          <day>17</day>
          <volume>57</volume>
          <issue>S 01</issue>
          <fpage>e50</fpage>
          <lpage>e56</lpage>
          <pub-id pub-id-type="doi">10.3414/me18-03-0003</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>EY</given-names>
            </name>
            <name name-style="western">
              <surname>Ochuko</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bhatt</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Howard</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>McGorisk</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Delaney</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Langdon</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Khosravanipour</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nambi</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Grahovec</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Morris</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Castellano</surname>
              <given-names>PZ</given-names>
            </name>
            <name name-style="western">
              <surname>Shaw</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Sperling</surname>
              <given-names>LS</given-names>
            </name>
            <name name-style="western">
              <surname>Goyal</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Errors in Electronic Health Record–Based Data Query of Statin Prescriptions in Patients With Coronary Artery Disease in a Large, Academic, Multispecialty Clinic Practice</article-title>
          <source>JAHA</source>
          <year>2018</year>
          <month>04</month>
          <day>17</day>
          <volume>7</volume>
          <issue>8</issue>
          <fpage>e007762</fpage>
          <pub-id pub-id-type="doi">10.1161/jaha.117.007762</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Murray</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Love</surname>
              <given-names>SB</given-names>
            </name>
            <name name-style="western">
              <surname>Carpenter</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Hartley</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Landray</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Mafham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>MKB</given-names>
            </name>
            <name name-style="western">
              <surname>Pinches</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Sydes</surname>
              <given-names>MR</given-names>
            </name>
          </person-group>
          <article-title>Data provenance and integrity of health-care systems data for clinical trials</article-title>
          <source>The Lancet Digital Health</source>
          <year>2022</year>
          <month>08</month>
          <volume>4</volume>
          <issue>8</issue>
          <fpage>e567</fpage>
          <lpage>e568</lpage>
          <pub-id pub-id-type="doi">10.1016/s2589-7500(22)00122-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Emanuel</surname>
              <given-names>EJ</given-names>
            </name>
            <name name-style="western">
              <surname>Emanuel</surname>
              <given-names>LL</given-names>
            </name>
          </person-group>
          <article-title>What is accountability in health care?</article-title>
          <source>Ann Intern Med</source>
          <year>1996</year>
          <month>01</month>
          <day>15</day>
          <volume>124</volume>
          <issue>2</issue>
          <fpage>229</fpage>
          <lpage>39</lpage>
          <pub-id pub-id-type="doi">10.7326/0003-4819-124-2-199601150-00007</pub-id>
          <pub-id pub-id-type="medline">8533999</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Curcin</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Embedding data provenance into the Learning Health System to facilitate reproducible research</article-title>
          <source>Learn Health Syst</source>
          <year>2017</year>
          <month>04</month>
          <day>27</day>
          <volume>1</volume>
          <issue>2</issue>
          <fpage>e10019</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31245557"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/lrh2.10019</pub-id>
          <pub-id pub-id-type="medline">31245557</pub-id>
          <pub-id pub-id-type="pii">LRH210019</pub-id>
          <pub-id pub-id-type="pmcid">PMC6516719</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bongiovanni</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Purdue</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kornienko</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Bernard</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Quality in Non-GxP Research Environment</article-title>
          <source>Handb Exp Pharmacol</source>
          <year>2020</year>
          <publisher-loc>Cham, Switzerland</publisher-loc>
          <publisher-name>Springer International Publishing</publisher-name>
          <fpage>1</fpage>
          <lpage>17</lpage>
          <pub-id pub-id-type="doi">10.1007/164_2019_274</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sahoo</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Valdez</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rueschman</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Scientific Reproducibility in Biomedical Research: Provenance Metadata Ontology for Semantic Annotation of Study Description</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2016</year>
          <volume>2016</volume>
          <fpage>1070</fpage>
          <lpage>1079</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/28269904"/>
          </comment>
          <pub-id pub-id-type="medline">28269904</pub-id>
          <pub-id pub-id-type="pmcid">PMC5333253</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bargaje</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Good documentation practice in clinical research</article-title>
          <source>Perspect Clin Res</source>
          <year>2011</year>
          <month>04</month>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>59</fpage>
          <lpage>63</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.picronline.org/article.asp?issn=2229-3485;year=2011;volume=2;issue=2;spage=59;epage=63;aulast=Bargaje"/>
          </comment>
          <pub-id pub-id-type="doi">10.4103/2229-3485.80368</pub-id>
          <pub-id pub-id-type="medline">21731856</pub-id>
          <pub-id pub-id-type="pii">PCR-2-59</pub-id>
          <pub-id pub-id-type="pmcid">PMC3121265</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="web">
          <source>Guidelines for Safeguarding Good Research Practice</source>
          <access-date>2022-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://wissenschaftliche-integritaet.de/en/code-of-conduct/">https://wissenschaftliche-integritaet.de/en/code-of-conduct/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Debruyne</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Pandit</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>O'Sullivan</surname>
              <given-names>Declan</given-names>
            </name>
          </person-group>
          <article-title>"Just-in-time" generation of datasets by considering structured representations of given consent for GDPR compliance</article-title>
          <source>Knowl Inf Syst</source>
          <year>2020</year>
          <month>04</month>
          <day>15</day>
          <volume>62</volume>
          <issue>9</issue>
          <fpage>3615</fpage>
          <lpage>3640</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32647404"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s10115-020-01468-x</pub-id>
          <pub-id pub-id-type="medline">32647404</pub-id>
          <pub-id pub-id-type="pii">1468</pub-id>
          <pub-id pub-id-type="pmcid">PMC7327958</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>14</collab>
          </person-group>
          <article-title>ISO 20691</article-title>
          <source>ISO 20691:2022, Biotechnology</source>
          <access-date>2022-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.iso.org/cms/render/live/en/sites/isoorg/contents/data/standard/06/88/68848.html">https://www.iso.org/cms/render/live/en/sites/isoorg/contents/data/standard/06/88/68848.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="web">
          <source>Standards by ISO/TC 276</source>
          <access-date>2022-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.iso.org/committee/4514241/x/catalogue/">https://www.iso.org/committee/4514241/x/catalogue/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wilkinson</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Dumontier</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aalbersberg</surname>
              <given-names>IJ</given-names>
            </name>
            <name name-style="western">
              <surname>Appleton</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Axton</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Baak</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Blomberg</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Boiten</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>da Silva Santos</surname>
              <given-names>LB</given-names>
            </name>
            <name name-style="western">
              <surname>Bourne</surname>
              <given-names>PE</given-names>
            </name>
            <name name-style="western">
              <surname>Bouwman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Brookes</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Crosas</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dillo</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Dumon</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Edmunds</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Evelo</surname>
              <given-names>CT</given-names>
            </name>
            <name name-style="western">
              <surname>Finkers</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Beltran</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Groth</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Goble</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Grethe</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Heringa</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>'t Hoen</surname>
              <given-names>Peter A C</given-names>
            </name>
            <name name-style="western">
              <surname>Hooft</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kuhn</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kok</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kok</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lusher</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Martone</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Mons</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Packer</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Persson</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Rocca-Serra</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Roos</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>van Schaik</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sansone</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schultes</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Sengstag</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Slater</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Strawn</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Swertz</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Thompson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>van der Lei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>van Mulligen</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Velterop</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Waagmeester</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wittenburg</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Wolstencroft</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mons</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>The FAIR Guiding Principles for scientific data management and stewardship</article-title>
          <source>Sci Data</source>
          <year>2016</year>
          <month>03</month>
          <day>15</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>160018</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/sdata.2016.18"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/sdata.2016.18</pub-id>
          <pub-id pub-id-type="medline">26978244</pub-id>
          <pub-id pub-id-type="pii">sdata201618</pub-id>
          <pub-id pub-id-type="pmcid">PMC4792175</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lamprecht</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Garcia</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kuzak</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Martinez</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Arcila</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Martin Del Pico</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Dominguez Del Angel</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>van de Sandt</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ison</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Martinez</surname>
              <given-names>PA</given-names>
            </name>
            <name name-style="western">
              <surname>McQuilton</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Valencia</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Harrow</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Psomopoulos</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Gelpi</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Chue Hong</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Goble</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Capella-Gutierrez</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Towards FAIR principles for research software</article-title>
          <source>Data Sci</source>
          <year>2020</year>
          <month>06</month>
          <day>12</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>37</fpage>
          <lpage>59</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.3233/DS-190026"/>
          </comment>
          <pub-id pub-id-type="doi">10.3233/DS-190026</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <source>HL7 FHIR Foundation enabling healthcare interoperability through FHIR</source>
          <access-date>2023-02-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://fhir.org/">https://fhir.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
          <source>W3C PROV Overview</source>
          <access-date>2022-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.w3.org/TR/prov-overview/">https://www.w3.org/TR/prov-overview/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Moreau</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Clifford</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Freire</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Futrelle</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gil</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Groth</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kwasnikowska</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Miles</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Missier</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Myers</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Plale</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Simmhan</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Stephan</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Van den Bussche</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>The Open Provenance Model core specification (v1.1)</article-title>
          <source>Future Generation Computer Systems</source>
          <year>2011</year>
          <month>06</month>
          <volume>27</volume>
          <issue>6</issue>
          <fpage>743</fpage>
          <lpage>756</lpage>
          <pub-id pub-id-type="doi">10.1016/j.future.2010.07.005</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sikos</surname>
              <given-names>LF</given-names>
            </name>
            <name name-style="western">
              <surname>Philp</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Provenance-Aware Knowledge Representation: A Survey of Data Models and Contextualized Knowledge Graphs</article-title>
          <source>Data Sci Eng</source>
          <year>2020</year>
          <month>05</month>
          <day>08</day>
          <volume>5</volume>
          <issue>3</issue>
          <fpage>293</fpage>
          <lpage>316</lpage>
          <pub-id pub-id-type="doi">10.1007/s41019-020-00118-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gierend</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Freiesleben</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kadioglu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Siegel</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Ganslandt</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Waltemath</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>The Status of Data Management Practices Across German Medical Data Integration Centers: Mixed Methods Study</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <month>11</month>
          <day>08</day>
          <volume>25</volume>
          <fpage>e48809</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e48809/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48809</pub-id>
          <pub-id pub-id-type="medline">37938878</pub-id>
          <pub-id pub-id-type="pii">v25i1e48809</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>DeMarco</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Structure Analysis and System Specification</article-title>
          <source>Broy M, Denert E, editors. Pioneers and Their Contributions to Software Engineering</source>
          <year>1979</year>
          <publisher-loc>Berlin, Heidelberg</publisher-loc>
          <publisher-name>Springer Berlin Heidelberg</publisher-name>
          <fpage>255</fpage>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bornberg-Bauer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Paton</surname>
              <given-names>NW</given-names>
            </name>
          </person-group>
          <article-title>Conceptual data modelling for bioinformatics</article-title>
          <source>Brief Bioinform</source>
          <year>2002</year>
          <month>06</month>
          <day>01</day>
          <volume>3</volume>
          <issue>2</issue>
          <fpage>166</fpage>
          <lpage>180</lpage>
          <pub-id pub-id-type="doi">10.1093/bib/3.2.166</pub-id>
          <pub-id pub-id-type="medline">12139436</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chebotko</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fotouhi</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Prospective and Retrospective Provenance Collection in Scientific Workflow Environments</article-title>
          <source>IEEE International Conference on Services Computing</source>
          <year>2010</year>
          <conf-name>IEEE International Conference on Services Computing</conf-name>
          <conf-date>2010</conf-date>
          <conf-loc>Miami, FL, USA</conf-loc>
          <publisher-loc>USA</publisher-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>449</fpage>
          <pub-id pub-id-type="doi">10.1109/SCC.2010.18</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="web">
          <source>Provenance in Data Integration Center, WebProtégé</source>
          <access-date>2023-05-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://webprotege.stanford.edu/#login">https://webprotege.stanford.edu/#login</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
          <source>Python</source>
          <access-date>2022-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.python.org/">https://www.python.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="web">
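          <!-- NOTE(review): a stray <article-title>15</article-title> stood here; it looks like a leftover fragment (possibly of a version number such as "3.15") rather than a real title. Confirm before publication. -->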
          <source>Peewee documentation</source>
          <access-date>2022-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://docs.peewee-orm.com/en/latest/">https://docs.peewee-orm.com/en/latest/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Woodward</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Include diagrams in your Markdown files with Mermaid</article-title>
          <source>The GitHub Blog</source>
          <access-date>2022-12-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.blog/2022-02-14-include-diagrams-markdown-files-mermaid/">https://github.blog/2022-02-14-include-diagrams-markdown-files-mermaid/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="web">
          <source>R: A language and environment for statistical computing</source>
          <access-date>2022-12-02</access-date>
          <publisher-loc>Vienna, Austria</publisher-loc>
          <publisher-name>R Foundation for Statistical Computing</publisher-name>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.R-project.org/">https://www.R-project.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <source>GitHub: kegieKG/Provenance-in-Data-Integration-Center</source>
          <access-date>2023-05-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/kegieKG/Provenance-in-Data-Integration-Center">https://github.com/kegieKG/Provenance-in-Data-Integration-Center</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vorisek</surname>
              <given-names>CN</given-names>
            </name>
            <name name-style="western">
              <surname>Lehne</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Klopfenstein</surname>
              <given-names>SAI</given-names>
            </name>
            <name name-style="western">
              <surname>Mayer</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Bartschke</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Haese</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Thun</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Fast Healthcare Interoperability Resources (FHIR) for Interoperability in Health Research: Systematic Review</article-title>
          <source>JMIR Med Inform</source>
          <year>2022</year>
          <month>07</month>
          <day>19</day>
          <volume>10</volume>
          <issue>7</issue>
          <fpage>e35724</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2022/7/e35724/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/35724</pub-id>
          <pub-id pub-id-type="medline">35852842</pub-id>
          <pub-id pub-id-type="pii">v10i7e35724</pub-id>
          <pub-id pub-id-type="pmcid">PMC9346559</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>de Oliveira</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Braga</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>David</surname>
              <given-names>JMN</given-names>
            </name>
            <name name-style="western">
              <surname>Stroele</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Campos</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Castro</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Visionary: a framework for analysis and visualization of provenance data</article-title>
          <source>Knowl Inf Syst</source>
          <year>2022</year>
          <month>01</month>
          <day>04</day>
          <volume>64</volume>
          <issue>2</issue>
          <fpage>381</fpage>
          <lpage>413</lpage>
          <pub-id pub-id-type="doi">10.1007/s10115-021-01645-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mitchell</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>Lahiff</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cummings</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hollocombe</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boskamp</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Field</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Reddyhoff</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zarebski</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Viola</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Burke</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Archibald</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Bessell</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Blackwell</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Boden</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Brett</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Brett</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dundas</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Enright</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Beltran</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hinder</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>David Hughes</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Knight</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mano</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>McMonagle</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mellor</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mohr</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Marion</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Matthews</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>McKendrick</surname>
              <given-names>IJ</given-names>
            </name>
            <name name-style="western">
              <surname>Mark Pooley</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Porphyre</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Reeves</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Townsend</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Turner</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Walton</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Reeve</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>FAIR data pipeline: provenance-driven data management for traceable scientific workflows</article-title>
          <source>Philos Trans A Math Phys Eng Sci</source>
          <year>2022</year>
          <month>10</month>
          <day>03</day>
          <volume>380</volume>
          <issue>2233</issue>
          <fpage>20210300</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://royalsocietypublishing.org/doi/abs/10.1098/rsta.2021.0300?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1098/rsta.2021.0300</pub-id>
          <pub-id pub-id-type="medline">35965468</pub-id>
          <pub-id pub-id-type="pmcid">PMC9376726</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mondelli</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Magalhães</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Loss</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Wilde</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Foster</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Mattoso</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Katz</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Barbosa</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>de Vasconcelos</surname>
              <given-names>ATR</given-names>
            </name>
            <name name-style="western">
              <surname>Ocaña</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gadelha</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>BioWorkbench: a high-performance framework for managing and analyzing bioinformatics experiments</article-title>
          <source>PeerJ</source>
          <year>2018</year>
          <volume>6</volume>
          <fpage>e5551</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30186700"/>
          </comment>
          <pub-id pub-id-type="doi">10.7717/peerj.5551</pub-id>
          <pub-id pub-id-type="medline">30186700</pub-id>
          <pub-id pub-id-type="pii">5551</pub-id>
          <pub-id pub-id-type="pmcid">PMC6119457</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Celebi</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rebelo Moreira</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hassan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ayyar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ridder</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kuhn</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Dumontier</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Towards FAIR protocols and workflows: the OpenPREDICT use case</article-title>
          <source>PeerJ Comput Sci</source>
          <year>2020</year>
          <volume>6</volume>
          <fpage>e281</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33816932"/>
          </comment>
          <pub-id pub-id-type="doi">10.7717/peerj-cs.281</pub-id>
          <pub-id pub-id-type="medline">33816932</pub-id>
          <pub-id pub-id-type="pii">cs-281</pub-id>
          <pub-id pub-id-type="pmcid">PMC7924452</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gonçalves</surname>
              <given-names>Rafael S</given-names>
            </name>
            <name name-style="western">
              <surname>Musen</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>The variable quality of metadata about biological samples used in biomedical experiments</article-title>
          <source>Sci Data</source>
          <year>2019</year>
          <month>02</month>
          <day>19</day>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>190021</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/sdata.2019.21"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/sdata.2019.21</pub-id>
          <pub-id pub-id-type="medline">30778255</pub-id>
          <pub-id pub-id-type="pii">sdata201921</pub-id>
          <pub-id pub-id-type="pmcid">PMC6380228</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bönisch</surname>
              <given-names>Caroline</given-names>
            </name>
            <name name-style="western">
              <surname>Kesztyüs</surname>
              <given-names>Dorothea</given-names>
            </name>
            <name name-style="western">
              <surname>Kesztyüs</surname>
              <given-names>Tibor</given-names>
            </name>
          </person-group>
          <article-title>Harvesting metadata in clinical care: a crosswalk between FHIR, OMOP, CDISC and openEHR metadata</article-title>
          <source>Sci Data</source>
          <year>2022</year>
          <month>10</month>
          <day>28</day>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>659</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41597-022-01792-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41597-022-01792-7</pub-id>
          <pub-id pub-id-type="medline">36307424</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41597-022-01792-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC9616884</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kinast</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ulrich</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Bergh</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Schreiweis</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Functional Requirements for Medical Data Integration into Knowledge Management Environments: Requirements Elicitation Approach Based on Systematic Literature Analysis</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <month>02</month>
          <day>09</day>
          <volume>25</volume>
          <fpage>e41344</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e41344/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/41344</pub-id>
          <pub-id pub-id-type="medline">36757764</pub-id>
          <pub-id pub-id-type="pii">v25i1e41344</pub-id>
          <pub-id pub-id-type="pmcid">PMC9951079</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Girman</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ritchey</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Lo Re</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Real-world data: Assessing electronic health records and medical claims data to support regulatory decision-making for drug and biological products</article-title>
          <source>Pharmacoepidemiol Drug Saf</source>
          <year>2022</year>
          <month>07</month>
          <day>03</day>
          <volume>31</volume>
          <issue>7</issue>
          <fpage>717</fpage>
          <lpage>720</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35471704"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/pds.5444</pub-id>
          <pub-id pub-id-type="medline">35471704</pub-id>
          <pub-id pub-id-type="pmcid">PMC9320939</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Barker</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chue Hong</surname>
              <given-names>NP</given-names>
            </name>
            <name name-style="western">
              <surname>Katz</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Lamprecht</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Martinez-Ortiz</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Psomopoulos</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Harrow</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Castro</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Gruenpeter</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Martinez</surname>
              <given-names>PA</given-names>
            </name>
            <name name-style="western">
              <surname>Honeyman</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Introducing the FAIR Principles for research software</article-title>
          <source>Sci Data</source>
          <year>2022</year>
          <month>10</month>
          <day>14</day>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>622</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41597-022-01710-x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41597-022-01710-x</pub-id>
          <pub-id pub-id-type="medline">36241754</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41597-022-01710-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC9562067</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gierend</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Wodke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Genehr</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gött</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Henkel</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Krüger</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Mandalka</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Michaelis</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Scheuerlein</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schröder</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zeleke</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Waltemath</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>TAPP: Defining standard provenance information for clinical research data and workflows - Obstacles and opportunities</article-title>
          <year>2023</year>
          <month>04</month>
          <day>30</day>
          <conf-name>Companion Proceedings of the ACM Web Conference 2023 (WWW '23 Companion)</conf-name>
          <conf-date>2023-04-30</conf-date>
          <conf-loc>Austin, TX, USA</conf-loc>
          <publisher-loc>New York, NY, USA</publisher-loc>
          <publisher-name>Association for Computing Machinery</publisher-name>
          <fpage>1551</fpage>
          <lpage>1554</lpage>
          <pub-id pub-id-type="doi">10.1145/3543873.3587562</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
