<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="article-commentary" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i1e59794</article-id>
      <article-id pub-id-type="pmid">39018549</article-id>
      <article-id pub-id-type="doi">10.2196/59794</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Commentary</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Commentary</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Ethics of the Use of Social Media as Training Data for AI Models Used for Digital Phenotyping</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Leung</surname>
            <given-names>Tiffany</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Jaiswal</surname>
            <given-names>Aditi</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1367-818X</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Shah</surname>
            <given-names>Aekta</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0000-0117-0982</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Harjadi</surname>
            <given-names>Christopher</given-names>
          </name>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4084-2208</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Windgassen</surname>
            <given-names>Erik</given-names>
          </name>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0006-1220-5734</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Washington</surname>
            <given-names>Peter</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Information and Computer Sciences</institution>
            <institution>University of Hawaii at Manoa</institution>
            <addr-line>1680 East-West Road</addr-line>
            <addr-line>Honolulu, HI, 96822</addr-line>
            <country>United States</country>
            <phone>1 8088296359</phone>
            <email>pyw@hawaii.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3276-4411</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Information and Computer Sciences</institution>
        <institution>University of Hawaii at Manoa</institution>
        <addr-line>Honolulu, HI</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Salesforce</institution>
        <addr-line>San Francisco, CA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Computer Science</institution>
        <institution>University of California, Berkeley</institution>
        <addr-line>Berkeley, CA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Computer Science and Engineering</institution>
        <institution>University of California, Riverside</institution>
        <addr-line>Riverside, CA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Peter Washington <email>pyw@hawaii.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>17</day>
        <month>7</month>
        <year>2024</year>
      </pub-date>
      <volume>8</volume>
      <elocation-id>e59794</elocation-id>
      <history>
        <date date-type="received">
          <day>22</day>
          <month>4</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>4</day>
          <month>6</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>8</day>
          <month>6</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>10</day>
          <month>6</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Aditi Jaiswal, Aekta Shah, Christopher Harjadi, Erik Windgassen, Peter Washington. Originally published in JMIR Formative Research (https://formative.jmir.org), 17.07.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2024/1/e59794" xlink:type="simple"/>
      <related-article related-article-type="commentary-article" id="v8i1e52660" ext-link-type="doi" xlink:href="10.2196/52660" vol="8" page="e52660" xlink:type="simple">https://formative.jmir.org/2024/1/e52660</related-article>
      <abstract>
        <p>Digital phenotyping, or personal sensing, is a field of research that seeks to quantify traits and characteristics of people using digital technologies, usually for health care purposes. In this commentary, we discuss emerging ethical issues regarding the use of social media as training data for artificial intelligence (AI) models used for digital phenotyping. In particular, we describe the ethical need for explicit consent from social media users, especially in cases where sensitive information, such as labels related to neurodiversity, is scraped. We also advocate for the use of community-based participatory design principles when developing health care AI models using social media data.</p>
      </abstract>
      <kwd-group>
        <kwd>social media analytics</kwd>
        <kwd>machine learning</kwd>
        <kwd>ethics</kwd>
        <kwd>research ethics</kwd>
        <kwd>consent</kwd>
        <kwd>scientific integrity</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <p>Community-based participatory research and human-centered design are central to research that aims to advance health equity [<xref ref-type="bibr" rid="ref1">1</xref>]. While participatory design is a well-known framework that is increasingly, although not yet widely, used for research in areas such as interventions development [<xref ref-type="bibr" rid="ref2">2</xref>] and partnered science, there is a dearth of research that builds artificial intelligence (AI) models for health in a manner that is grounded in community-based principles. The lack of community guidance early in the AI development process may lead, inadvertently, to models that are unethical despite being formally approved by an institutional review board (IRB). In particular, we discuss the topic of consent, which we argue spans at least two parts of the AI development process: (1) consent to build the AI model, which can be determined through participatory design sessions with the community that the AI model is meant to serve; and (2) consent to use an individual’s data within the training process of the model, which can be obtained through explicit consent procedures.</p>
    <p>We discuss these gaps in community-based research for AI, with a particular focus on the development of social media–based screening tools for underserved communities, especially neurodiverse populations. Using social media for the quantification of characteristics or traits of an individual is a form of digital phenotyping, a method that can work with a broad range of data sources [<xref ref-type="bibr" rid="ref3">3</xref>]. While the increasing availability of public data trails on social media can lead to predictive models that are potentially useful for improving health outcomes, the unrestricted use of these data poses the risk of training machine learning models on user-generated content without the explicit consent of the people who generated the data. Furthermore, the release of such models has the potential to lead to unintended consequences and possible harm.</p>
    <p>Social media platforms have emerged as a popular data source for several research domains, including for screening and surveillance broadly in psychiatry and behavioral sciences [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>], sometimes with the help of AI. Government agencies such as the National Institutes of Health (NIH) in the United States encourage research that uses existing data streams, including social media, to provide actionable insights for conditions such as substance use [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref7">7</xref>]. However, several thought leaders are noting that such research must be carefully performed so as to not scrape data from the internet without the consent of the end users [<xref ref-type="bibr" rid="ref8">8</xref>]. Some recent papers in social media analytics have been careful to obtain explicit consent from users participating in the study or to only conduct the analysis on anonymized data feeds. The NIH has started to prioritize funding research that addresses these ethical challenges [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. In late 2023, The White House highlighted the need for ethical AI practices via its list of “Voluntary AI Commitments” created for companies [<xref ref-type="bibr" rid="ref9">9</xref>] that are also highly relevant to noncommercial research, including guidelines such as prioritizing “research on societal risks posed by AI systems” and protecting privacy.</p>
    <p>This conversation intersects strongly with the discourse around the training procedures of large language models, many of which have been trained on web data without user consent. Over the last few years, generative AI has revolutionized the field of AI by demonstrating remarkable capabilities, from generating human-like text to creating art and music. These models require massive amounts of pretraining data collected from various public forums. However, there have been numerous examples of popular language models being trained with web data without explicit user consent or with consent that was hidden away in terms and conditions. For example, Google was famously suspected of training Bard/Gemini using Gmail data without consent from end users, which raised concerns among users, although Google denies these claims. Similarly, OpenAI has trained ChatGPT using data from users’ conversation histories. These cases raise questions about how our social contracts may have changed and what users inadvertently opt in to when signing up on social media. Although OpenAI has provided the option to opt out of data retention, the default opt-in option raises privacy and data concerns.</p>
    <p>The issue of data consent is particularly salient for vulnerable and marginalized groups. There are several well-known instances of the misuse of data for scientific purposes. HeLa cells, named after Henrietta Lacks, are well known in the field of biology and have contributed greatly to progress in science. However, HeLa cells were commercialized, leading to financial gains without compensation or even an acknowledgement of Henrietta Lacks’ contributions. Another notable example is the historical misuse of Indigenous DNA through repeated failures to obtain informed consent from members of Indigenous populations.</p>
    <p>In light of these reflections and the evolving discussions around AI ethics, we have elected to make some significant amendments to our recently published Twitter analysis paper on the use of the #ActuallyAutistic hashtag on Twitter for training a machine learning model that could serve as a screening tool for autism [<xref ref-type="bibr" rid="ref10">10</xref>]. This paper serves as an example of what is possible with AI and social media in today’s tech ecosystem, and we provide a word of caution for creators of such models to think through how such models may be misused and interpreted by the community that they were built to serve. Models meant to help the autistic community should be built in collaboration with the community from the onset of the ideation and development process or should be led by autistic individuals. We hope that our decision to delete our data set and model can serve as a template for other researchers.</p>
    <p>We would like to highlight two important closing thoughts. First, approval by an IRB does not necessarily translate to an ethical study. Some institutions are creating ethical review boards to provide an additional layer of ethical review of studies. Second, while many areas of health-related research are guided by community-based participatory principles, such practices are not as commonplace in research at the intersection of health, social media, and AI. Speaking with impacted communities helps verify assumptions and provides input into methods design and analysis, leading to more robust conclusions for future research.</p>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">IRB</term>
          <def>
            <p>institutional review board</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">NIH</term>
          <def>
            <p>National Institutes of Health</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brewer</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Fortuna</surname>
              <given-names>KL</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Walker</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Hayes</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>Patten</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Cooper</surname>
              <given-names>LA</given-names>
            </name>
          </person-group>
          <article-title>Back to the future: achieving health equity through health informatics and digital health</article-title>
          <source>JMIR Mhealth Uhealth</source>
          <year>2020</year>
          <month>01</month>
          <day>14</day>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>e14512</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mhealth.jmir.org/2020/1/e14512/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/14512</pub-id>
          <pub-id pub-id-type="medline">31934874</pub-id>
          <pub-id pub-id-type="pii">v8i1e14512</pub-id>
          <pub-id pub-id-type="pmcid">PMC6996775</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Dell</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Towards informed practice in HCI for development</article-title>
          <source>Proc ACM Hum Comput Interact</source>
          <year>2018</year>
          <month>11</month>
          <volume>2</volume>
          <issue>CSCW</issue>
          <fpage>1</fpage>
          <lpage>20</lpage>
          <pub-id pub-id-type="doi">10.1145/3274368</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Torous</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bucci</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bell</surname>
              <given-names>IH</given-names>
            </name>
            <name name-style="western">
              <surname>Kessing</surname>
              <given-names>LV</given-names>
            </name>
            <name name-style="western">
              <surname>Faurholt-Jepsen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Whelan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Carvalho</surname>
              <given-names>AF</given-names>
            </name>
            <name name-style="western">
              <surname>Keshavan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Linardon</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Firth</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>The growing field of digital psychiatry: current evidence and the future of apps, social media, chatbots, and virtual reality</article-title>
          <source>World Psychiatry</source>
          <year>2021</year>
          <month>10</month>
          <volume>20</volume>
          <issue>3</issue>
          <fpage>318</fpage>
          <lpage>335</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34505369"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/wps.20883</pub-id>
          <pub-id pub-id-type="medline">34505369</pub-id>
          <pub-id pub-id-type="pmcid">PMC8429349</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="web">
          <article-title>Notice of Special Interest (NOSI): Computational and statistical methods to enhance discovery from health data (NOT-LM-23-001)</article-title>
          <source>National Institutes of Health</source>
          <access-date>2024-04-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://grants.nih.gov/grants/guide/notice-files/NOT-LM-23-001.html">https://grants.nih.gov/grants/guide/notice-files/NOT-LM-23-001.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <article-title>Notice of Special Interest (NOSI): Addressing health disparities in NIMHD research: leveraging health data science (NOT-OD-22-026)</article-title>
          <source>National Institutes of Health</source>
          <access-date>2024-04-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://grants.nih.gov/grants/guide/notice-files/not-od-22-026.html">https://grants.nih.gov/grants/guide/notice-files/not-od-22-026.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="web">
          <article-title>Notice of Special Interest (NOSI): IDEA2Health: Innovative data evaluation and analysis to health (NOT-HL-22-001)</article-title>
          <source>National Institutes of Health</source>
          <access-date>2024-04-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://grants.nih.gov/grants/guide/notice-files/NOT-HL-22-001.html">https://grants.nih.gov/grants/guide/notice-files/NOT-HL-22-001.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="web">
          <article-title>Notice of Special Interest (NOSI): Leveraging data science to bring actionable insights for substance use prevention and treatment (NOT-DA-23-006)</article-title>
          <source>National Institutes of Health</source>
          <access-date>2024-04-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://grants.nih.gov/grants/guide/notice-files/NOT-DA-23-006.html">https://grants.nih.gov/grants/guide/notice-files/NOT-DA-23-006.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ahmed</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Bath</surname>
              <given-names>PA</given-names>
            </name>
            <name name-style="western">
              <surname>Demartini</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Woodfield</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Using Twitter as a data source: an overview of ethical, legal, and methodological challenges</article-title>
          <source>The Ethics of Online Research (Advances in Research Ethics and Integrity, Vol. 2)</source>
          <year>2017</year>
          <publisher-loc>Leeds</publisher-loc>
          <publisher-name>Emerald Publishing Limited</publisher-name>
          <fpage>79</fpage>
          <lpage>107</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="web">
          <article-title>Voluntary AI Commitments</article-title>
          <source>The White House</source>
          <year>2023</year>
          <month>09</month>
          <access-date>2024-04-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.whitehouse.gov/wp-content/uploads/2023/09/Voluntary-AI-Commitments-September-2023.pdf">https://www.whitehouse.gov/wp-content/uploads/2023/09/Voluntary-AI-Commitments-September-2023.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jaiswal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Washington</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Using #ActuallyAutistic on Twitter for precision diagnosis of autism spectrum disorder: machine learning study</article-title>
          <source>JMIR Form Res</source>
          <year>2024</year>
          <month>02</month>
          <day>14</day>
          <volume>8</volume>
          <fpage>e52660</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://formative.jmir.org/2024/1/e52660/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/52660</pub-id>
          <pub-id pub-id-type="medline">38354045</pub-id>
          <pub-id pub-id-type="pii">v8i1e52660</pub-id>
          <pub-id pub-id-type="pmcid">PMC10902768</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
