<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v6i12e23422</article-id>
      <article-id pub-id-type="pmid">36534457</article-id>
      <article-id pub-id-type="doi">10.2196/23422</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Detecting Elevated Air Pollution Levels by Monitoring Web Search Queries: Algorithm Development and Validation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhao</surname>
            <given-names>Chang</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Staffini</surname>
            <given-names>Alessio</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Ceron</surname>
            <given-names>Wilson</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Lin</surname>
            <given-names>Chen</given-names>
          </name>
          <degrees>BSc, MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Computer Science</institution>
            <institution>Emory University</institution>
            <addr-line>201 Dowman Drive</addr-line>
            <addr-line>W302</addr-line>
            <addr-line>Atlanta, GA, 30322</addr-line>
            <country>United States</country>
            <phone>1 404 395 0266</phone>
            <email>chen.lin@emory.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9783-7525</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Yousefi</surname>
            <given-names>Safoora</given-names>
          </name>
          <degrees>DPhil</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7320-2998</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Kahoro</surname>
            <given-names>Elvis</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3467-2449</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Karisani</surname>
            <given-names>Payam</given-names>
          </name>
          <degrees>DPhil</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7756-3608</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Liang</surname>
            <given-names>Donghai</given-names>
          </name>
          <degrees>DPhil</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7311-2298</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Sarnat</surname>
            <given-names>Jeremy</given-names>
          </name>
          <degrees>DPhil</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8733-8749</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Agichtein</surname>
            <given-names>Eugene</given-names>
          </name>
          <degrees>DPhil</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3148-5448</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Computer Science</institution>
        <institution>Emory University</institution>
        <addr-line>Atlanta, GA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Computer Science</institution>
        <institution>Pomona College</institution>
        <addr-line>Claremont, CA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Environmental Health</institution>
        <institution>Emory University</institution>
        <addr-line>Atlanta, GA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Chen Lin <email>chen.lin@emory.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>12</month>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>19</day>
        <month>12</month>
        <year>2022</year>
      </pub-date>
      <volume>6</volume>
      <issue>12</issue>
      <elocation-id>e23422</elocation-id>
      <history>
        <date date-type="received">
          <day>30</day>
          <month>3</month>
          <year>2022</year>
        </date>
        <date date-type="rev-request">
          <day>21</day>
          <month>6</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>6</day>
          <month>10</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>25</day>
          <month>10</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Chen Lin, Safoora Yousefi, Elvis Kahoro, Payam Karisani, Donghai Liang, Jeremy Sarnat, Eugene Agichtein. Originally published in JMIR Formative Research (https://formative.jmir.org), 19.12.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2022/12/e23422" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Real-time air pollution monitoring is a valuable tool for public health and environmental surveillance. In recent years, there has been a dramatic increase in air pollution forecasting and monitoring research using artificial neural networks. Most prior work relied on modeling pollutant concentrations collected from ground-based monitors and meteorological data for long-term forecasting of outdoor ozone (O<sub>3</sub>), oxides of nitrogen, and fine particulate matter (PM<sub>2.5</sub>). Given that traditional, highly sophisticated air quality monitors are expensive and not universally available, these models cannot adequately serve those not living near pollutant monitoring sites. Furthermore, because prior models were built based on physical measurement data collected from sensors, they may not be suitable for predicting the public health effects of pollution exposure.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to develop and validate models to <italic>nowcast</italic> the observed pollution levels using web search data, which are publicly available in near real time from major search engines.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We developed novel machine learning–based models using both traditional supervised classification methods and state-of-the-art deep learning methods to detect elevated air pollution levels at the US city level by using generally available meteorological data and aggregate web-based search volume data derived from Google Trends. We validated the performance of these methods by predicting 3 critical air pollutants (O<sub>3</sub>, nitrogen dioxide, and PM<sub>2.5</sub>) across 10 major US metropolitan statistical areas in 2017 and 2018. We also explore different variations of the long short-term memory model and propose a novel search term dictionary learner-long short-term memory model to learn sequential patterns across multiple search terms for prediction.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The top-performing model was a deep neural sequence model long short-term memory, using meteorological and web search data, and reached an accuracy of 0.82 (<italic>F</italic><sub>1</sub>-score 0.51) for O<sub>3</sub>, 0.74 (<italic>F</italic><sub>1</sub>-score 0.41) for nitrogen dioxide, and 0.85 (<italic>F</italic><sub>1</sub>-score 0.27) for PM<sub>2.5</sub>, when used for detecting elevated pollution levels. Compared with using only meteorological data, the proposed method achieved superior accuracy by incorporating web search data.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The results show that incorporating web search data with meteorological data improves the nowcasting performance for all 3 pollutants and suggest promising novel applications for tracking global physical phenomena using web search data.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>nowcasting of air pollution</kwd>
        <kwd>web-based public health surveillance</kwd>
        <kwd>neural network sequence modeling</kwd>
        <kwd>search engine log analysis</kwd>
        <kwd>air pollution exposure assessment</kwd>
        <kwd>mobile phone</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Web-based crowd surveillance has been used to track emergent risks to public health [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. Most commonly, these efforts involve the collection of web-based search queries to document acute changes in the incidence or symptom occurrence of primary infectious disease agents, such as influenza [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref7">7</xref>], Ebola [<xref ref-type="bibr" rid="ref8">8</xref>], dengue fever [<xref ref-type="bibr" rid="ref9">9</xref>], and COVID-19 [<xref ref-type="bibr" rid="ref10">10</xref>]. These methods have the potential to provide public health and medical professionals with benefits over traditional health surveillance and environmental epidemiology in their ability to capture both personal exposures and response dynamics at more sensitive spatial and temporal scales [<xref ref-type="bibr" rid="ref2">2</xref>].</p>
        <p>Despite the promise of these approaches for infectious diseases, only a limited number of studies have examined how crowd surveillance approaches can be used to track environmental exposures and, less frequently, responses to noninfectious environment-mediated disease processes [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. The global burden of disease attributable to outdoor and indoor air pollution has been quantified by recent efforts and has increased public awareness of the severity of this public health crisis worldwide [<xref ref-type="bibr" rid="ref14">14</xref>]. Therefore, urban air pollution provides a key test case for the evaluation of web-based surveillance approaches for noninfectious environmental risks. The web-based surveillance approach is distinct from traditional approaches for measuring urban air pollution exposure. Therefore, it could possibly serve as a substitute to or complement the existing approaches. Traditional indicators of air pollution exposure, namely, concentrations measured at ambient monitoring sites, are widely used to assess the health effects associated with air pollution in epidemiological studies. However, the use of ambient monitoring measurements as surrogates of exposure may result in the misclassification of health responses and potential risks, especially for those not living near pollutant monitoring sites [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]. Moreover, ambient monitoring, by design, provides information on measured outdoor pollutant concentrations and may not necessarily reflect accurate personal exposures for individuals spending most of their time indoors or for those with preexisting biological susceptibility to air pollution. 
Several recent studies have focused on using smartphones within distributed air pollution sensing networks, where users record and upload local air pollution conditions to crowd-generated, geospatially refined pollution maps [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. These studies demonstrate the feasibility of web-based crowd-generated participation in projects predicated on urban air pollution awareness.</p>
        <p>To the best of our knowledge, few studies have investigated the feasibility of using web search data to produce accurate “nowcasts” of urban air pollution levels in real time. Making accurate predictions using web search data poses 2 major challenges. The first is the selection of search terms to comprehensively capture people’s responses. Several approaches have been proposed to select search terms. For example, some studies preliminarily prepare keywords related to the target disease and then use these keywords to filter the search terms, which is often impractical because finding related keywords can be difficult for some diseases or costly when done for multiple diseases. The second is the selection of the appropriate models. Although the literature on data-driven nowcasting methods for estimating infectious disease activity is well developed from an epidemiological standpoint, the machine learning methods used lag behind the state-of-the-art methods. The nowcasting models introduced to date mainly use variations of regularized linear regressions or, less often, random forests (RFs) or support vector machines. From a machine learning perspective, the problem of disease activity estimation is most suited to a more sophisticated and time series–specific model architecture. Because of the growing volume of recorded environment-mediated disease data, the use of recurrent neural networks (RNNs) and, more specifically, their variants long short-term memory (LSTM) and gated recurrent unit networks is increasingly feasible. The vanilla LSTM model makes predictions solely relying on the time series of the search activity while ignoring the semantic information in the search query phrases. 
Previous studies have pointed out that search queries could be semantically related, and ignoring their correlation would lead to a decrease in model performance [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. Recent advances in natural language processing have led to the development of a technique called word embeddings to represent the semantic information in phrases, and fine-tuning of word embeddings has been encouraged for downstream tasks (Wu, Y, unpublished data, September 2016) [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref22">22</xref>]. However, there is still a lack of knowledge on incorporating both the semantic information of search queries and time series of search activities to make predictions.</p>
      </sec>
      <sec>
        <title>Objectives</title>
        <p>In this study, we investigate web search data as an important source of a web-based crowd-based indicator. As web search data are free and broadly accessible, we posit that they could serve as a scalable means of tracking urban air pollution exposures and corresponding population-level health responses. To measure search interest, we used the freely accessible Google Trends service, which reports aggregate search volume data at a city-level geographical resolution. For this analysis, we use known health end point terms and topics, such as “difficulty breathing,” and observations (eg, “haze”) suggested by public health researchers, augmented by automatic term expansion based on semantic and temporal correlations, to estimate the levels of search activities related to air pollution, and ultimately to predict whether the pollution levels were elevated [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>].</p>
        <p>Compared with existing air pollution classification models, this study explores the use of web search anomalies as an auxiliary signal to detect air pollution. We compared our approach with the state-of-the-art physical sensor–based models that incorporate various pollutant covariates such as historical pollutant concentrations and meteorological data [<xref ref-type="bibr" rid="ref25">25</xref>]. Using web search data for prediction introduces several challenges, including an unclear relationship between search interest and pollution levels and the trade-off between model complexity and convergence for the inclusion of web search data in a data-deficient scenario.</p>
        <p>In summary, our contributions are as follows:</p>
        <list list-type="bullet">
          <list-item>
            <p>We proposed a novel search term dictionary learner-LSTM (DL-LSTM) model to learn sequential patterns from broad historical records of web search data for air pollution nowcasting.</p>
          </list-item>
          <list-item>
            <p>We compared the DL-LSTM models with a variety of baseline models on the efficacy of using web search data to indicate exposure to a noninfectious environmental stressor (ie, air pollution) and demonstrate that the proposed models are effective across different experimental settings.</p>
          </list-item>
          <list-item>
            <p>We evaluated the efficacy of combining web search data and meteorological data for air pollution prediction and showed that the inclusion of web search data improves the prediction accuracy and provides a promising substitute when historical pollutant data are unavailable.</p>
          </list-item>
        </list>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <p>We now describe the methodology. First, we formalize our problem setting, then describe the data, and then introduce our modeling approaches.</p>
      <sec>
        <title>Problem Statement</title>
        <p>We formalized this task as a classification problem and adapted state-of-the-art machine learning models. We constructed a multivariate autoregressive model and an RF model fit on historical air pollutant concentrations as well as search and meteorological data as baseline models. We evaluated the performance of our proposed models (described below) in comparison with the baselines in terms of prediction accuracy and other standard classification prediction metrics.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>The data available to the public are not individually identifiable and therefore the analysis does not involve human subjects. The Institutional Review Board (IRB) recognizes that the analysis of de-identified, publicly available data does not constitute human subjects research and therefore does not require IRB review.</p>
      </sec>
      <sec>
        <title>Data Collection</title>
        <p>We collected daily air pollutant concentration data as well as temperature and relative humidity in the 10 largest US metropolitan statistical areas (MSAs) from January 2007 to December 2018. We focused on 3 air pollutants: ozone (O<sub>3</sub>), nitrogen dioxide (NO<sub>2</sub>), and fine particulate matter (PM<sub>2.5</sub>). The in-situ pollutant concentrations and meteorological data such as temperature, relative humidity, and dew point temperature were retrieved from the US Environmental Protection Agency, Air Quality System, and AirNow database. To create a single daily pollutant concentration for each city, we used the median pollutant concentration from all available monitoring sites within each city to avoid outlier bias.</p>
        <p>We collected the daily search frequency of pollution-related terms from Google Trends for the same 12-year period and cities. We created a curated list of 152 pollution-related terms based on our previous air pollution epidemiology studies and a review of the environmental health literature [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref30">30</xref>], and we downloaded the reports of trending results for these terms using PyTrends [<xref ref-type="bibr" rid="ref31">31</xref>]. For each PyTrends request, we downloaded the search history of pollution-related terms over a 6-month window with 1 overlapping month for calibration. PyTrends provided us with a search frequency scaled on a range of 0 to 100 based on a topic’s proportion to all searches on all topics. Because of the PyTrends restriction, we downloaded the reports of trending results multiple times, and the search frequencies were scaled separately in each 6-month window, which required us to calibrate the search frequency for the 12-year period. We calibrated the search frequencies by joining the search logs on the overlapping periods (1 out of 6 months) for intercalibration [<xref ref-type="bibr" rid="ref32">32</xref>].</p>
        <p>We investigated the available input features from meteorological data (temperature and relative humidity), historical pollutant concentrations, and web search data (<xref ref-type="table" rid="table1">Table 1</xref>).</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Input features calculated per time step in the input sequence.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Input feature</td>
                <td>Feature transformation</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Meteorological data (Met<sup>a</sup>)</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Maximum temperature (Temp_max<sup>b</sup>)</p>
                    </list-item>
                    <list-item>
                      <p>Mean temperature (Temp_mean<sup>c</sup>)</p>
                    </list-item>
                    <list-item>
                      <p>Relative humidity (humidity)</p>
                    </list-item>
                    <list-item>
                      <p>Square of Temp_mean</p>
                    </list-item>
                    <list-item>
                      <p>Cube of Temp_mean</p>
                    </list-item>
                    <list-item>
                      <p>Square of humidity</p>
                    </list-item>
                    <list-item>
                      <p>Cube of humidity</p>
                    </list-item>
                    <list-item>
                      <p>Dew point temperature</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Pollutant concentration (Pol<sup>d</sup>)</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Concentration on day t-7<sup>e</sup></p>
                    </list-item>
                    <list-item>
                      <p>Concentration on day t-6<sup>e</sup></p>
                    </list-item>
                    <list-item>
                      <p>Concentration on day t-5<sup>e</sup></p>
                    </list-item>
                    <list-item>
                      <p>Concentration on day t-4<sup>e</sup></p>
                    </list-item>
                    <list-item>
                      <p>Concentration on day t-3<sup>e</sup></p>
                    </list-item>
                    <list-item>
                      <p>Concentration on day t-2<sup>e</sup></p>
                    </list-item>
                    <list-item>
                      <p>Concentration on day t-1<sup>e</sup></p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Search</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Search volumes of search terms</p>
                    </list-item>
                  </list>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>Met: meteorological data.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>Temp_max: maximum temperature.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>Temp_mean: mean temperature.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>Pol: pollutant concentration.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>Day t-7,..., t-1: days preceding the prediction day t.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Missing Data Imputation and Normalization</title>
        <p>Smoothing and interpolation are simple and efficient data imputation methods [<xref ref-type="bibr" rid="ref33">33</xref>], and we applied linear interpolation to fill the missing data in historical pollutant concentration, temperature, and humidity, with a rolling window size of 3. To fill in the missing data in infrequent search terms for which Google Trends does not return a count, we used random numbers close to 0 (e<sup>-10</sup>~e<sup>-5</sup>). We normalized all the input features to standard scores by subtracting their mean values and dividing them by the respective SDs.</p>
      </sec>
      <sec>
        <title>Search Term Expansion</title>
        <p>As web-based search queries may reflect individual exposure to ambient air pollution, the seed terms were mostly related to symptoms, observations, and emission sources (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). However, because an exhaustive list of user queries was not available, reliance on only expert-generated seed words may result in poor prediction because of the high mismatch rate between the user queries and our expected search words.</p>
        <p>Query expansion is a common approach for resolving this discrepancy. A recent study [<xref ref-type="bibr" rid="ref18">18</xref>] showed that the initial set of seed words could be effectively expanded through semantic and temporal correlations. Thus, for each seed word, we used Google Correlate [<xref ref-type="bibr" rid="ref34">34</xref>] to retrieve the top 100 correlated query terms. Then, we used the pretrained word2vec model [<xref ref-type="bibr" rid="ref21">21</xref>] to retrieve the vector representation of each query; phrases were mapped to the centroid of the constituent terms. A utility score was calculated for each candidate query by measuring the maximum cosine similarity between the query and seed words. Queries with a high utility score were retained, and the remaining queries were eliminated, and we empirically set the utility cutoff to 0.55. This method expanded the set of search terms for the 152 search terms to track (Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
      </sec>
      <sec>
        <title>Modeling and Evaluation</title>
        <sec>
          <title>Problem Definition</title>
          <p>Given sequences of physical sensor data P = [p<sub>t-L</sub>,..., p<sub>t-1</sub>]<sup>T</sup> with the dimension of L times d<sub>p</sub>, and search interest data S = [s<sub>t-L+2</sub>,..., s<sub>t+1</sub>]<sup>T</sup> with the dimension of L times d<sub>s</sub>, the task is to classify day <italic>t</italic> as <italic>polluted</italic> or not, where a positive class label indicates that the air pollution was above a predefined threshold. L denotes the sequence length, and d<sub>p</sub> and d<sub>s</sub> are the number of physical sensor features and the number of search-related terms, respectively.</p>
        </sec>
        <sec>
          <title>Autoregressive and RF Classification Models</title>
          <p>Previous work has shown that simple autoregressive models using web search data can generate nowcast estimates for influenza-like illnesses at the US national level [<xref ref-type="bibr" rid="ref19">19</xref>]. We adapted autoregressive models with a logistic regression (LR) classifier for classification purposes. Furthermore, we applied elastic net regularization, which is a linear combination of <italic>l<sub>1</sub></italic> and <italic>l<sub>2</sub></italic> regularization, as proposed in previous studies [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. LR+Elastic Net was implemented using the Python <italic>scikit-learn</italic> package, using cross-validation to set the model’s hyperparameters to maximize the <italic>F</italic><sub>1</sub>-score on the validation set, with class_weight set to “balanced.”</p>
          <p>RF is an ensemble learning model that is robust against overfitting and provides a strong baseline for the development of nonlinear predictive models [<xref ref-type="bibr" rid="ref35">35</xref>]. We used the <italic>scikit-learn</italic> implementation of RFs. The number of trees and maximum depth of individual trees were selected to maximize the <italic>F</italic><sub>1</sub>-score on the validation set, with balanced class_weight for positive and negative samples.</p>
        </sec>
        <sec>
          <title>LSTM and Its Variants</title>
          <p>LSTM units [<xref ref-type="bibr" rid="ref36">36</xref>] are RNN models designed for sequence modeling, which can learn nonlinear relationships in time series data [<xref ref-type="bibr" rid="ref37">37</xref>]. First, we describe a baseline LSTM model with 2 subnetworks to separate the search data and meteorological data. As shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>, there are 4 layers in the model, that is, the sequence embedding layer, LSTM layer, fully connected hidden layer, and output layer [<xref ref-type="bibr" rid="ref38">38</xref>].</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>The architecture of the long short-term memory (LSTM) model.</p>
            </caption>
            <graphic xlink:href="formative_v6i12e23422_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p>In the left subnetwork of the LSTM model with search data as input, we propose 2 methods for capturing semantic information in search terms. The first is the LSTM semantic model (GloVe [Global Vectors for Word Representation]; LSTM-GloVe). As a variant of the vanilla LSTM model, for the sequence embedding layer of the right subnetwork in <xref rid="figure1" ref-type="fig">Figure 1</xref>, we introduce the matrix multiplication operation to project the search values of search terms to their semantic embedding space (GloVe embeddings), as shown in equation 1.</p>
          <p>Given the search interest data S = [s<sub>1</sub>,..., s<sub>7</sub>]<sup>T</sup> with the dimension of 7 times d<sub>s</sub>, and their GloVe embedding G = [g<sub>1</sub>,..., g<sub>dg</sub>] with the dimension of d<sub>s</sub> times d<sub>g</sub>, where d<sub>g</sub> = 50 (GloVe 50-dimensional word vectors trained on tweets [<xref ref-type="bibr" rid="ref22">22</xref>]), the matrix multiplication operation is defined as</p>
          <graphic xlink:href="formative_v6i12e23422_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          <p>Specifically, the tensor generated by the matrix multiplication operation was then fed into the LSTM layer for further calculations. This matrix multiplication is designed specifically for the model consistency problem when introducing collinear predictors after search term expansion (STE).</p>
          <p>The second variation of the LSTM model is the DL-LSTM model, which is theoretically based on the idea of matrix multiplication, as shown in LSTM-GloVe. However, instead of directly applying the GloVe embedding for matrix multiplication, it introduces the fine-tuning of the word embeddings via a <italic>d<sub>g</sub></italic> by <italic>d<sub>e</sub></italic> rectified linear unit–activated fully connected layer. As shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>, the rectified linear unit–activated fully connected layer was applied to the initial GloVe embedding, where <italic>d<sub>e</sub></italic>=100 is the size of the new embedding. In this architecture, the GloVe 50-dimensional word vectors are used to initialize the search term embedding dictionary, and the matrix multiplication operation is used to transform the input embedding of search terms into the semantic embedding space [<xref ref-type="bibr" rid="ref39">39</xref>].</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>The architecture of the dictionary learner-long short-term memory model.</p>
            </caption>
            <graphic xlink:href="formative_v6i12e23422_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p>In summary, we evaluate the following models in this paper:</p>
          <list list-type="bullet">
            <list-item>
              <p>LR: The LR classifier with elastic net regularization.</p>
            </list-item>
            <list-item>
              <p>RF: The RF classifier with the number of trees and maximum depth tuned for prediction.</p>
            </list-item>
            <list-item>
              <p>LSTM: The baseline LSTM model, as shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>, combines physical sensor features, if available, with the search interest volume data directly, providing a direct adaptation of RNNs to this problem without any problem-specific extensions.</p>
            </list-item>
            <list-item>
              <p>LSTM-GloVe: LSTM semantic model is a variant of the LSTM model as described in equation 1, where we control the input of search interest data (ie, 51 seed search terms vs 152 terms after STE) in this model. We refer to the variants as <italic>LSTM-GloVe</italic> and <italic>LSTM-GloVe with [w/] STE</italic>, respectively.</p>
            </list-item>
            <list-item>
              <p>DL-LSTM: The DL-LSTM model is shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>. We control the input of the search interest data (ie, 51 seed search terms vs 152 terms after STE) in this model and refer to the variants as <italic>DL-LSTM</italic> and <italic>DL-LSTM w/STE</italic>, respectively.</p>
            </list-item>
          </list>
        </sec>
      </sec>
      <sec>
        <title>Validation</title>
        <p>To tune the model parameters and validate the model performance, we split the available data into training (from January 2007 to December 2014), validation (from January 2015 to December 2016), and testing (from January 2017 to December 2018) sets. This 8-year training period provides a broad history for learning the relationship between input and output variables, and the predictive models are evaluated based on their ability to make predictions for completely unseen periods. For evaluating our model, we made predictions for each day from January 2017 to December 2018 in the test data set. The distribution of the classes in the training, validation, and test data sets is presented in <xref ref-type="table" rid="table2">Table 2</xref>. Note that the positive and negative classes are heavily imbalanced, with positive classes comprising, for instance, only 16% of the training samples when PM<sub>2.5</sub> is the target pollutant.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>The distribution of classes in the training, validation, and test sets.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="150"/>
            <col width="150"/>
            <col width="100"/>
            <col width="0"/>
            <col width="150"/>
            <col width="150"/>
            <col width="100"/>
            <thead>
              <tr valign="top">
                <td>Pollutant</td>
                <td colspan="4">Negative samples</td>
                <td colspan="3">Positive samples</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Training</td>
                <td>Validation</td>
                <td>Test</td>
                <td colspan="2">Training</td>
                <td>Validation</td>
                <td>Test</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>O<sub>3</sub><sup>a</sup></td>
                <td>24,322</td>
                <td>6269</td>
                <td>6311</td>
                <td colspan="2">4896</td>
                <td>1038</td>
                <td>982</td>
              </tr>
              <tr valign="top">
                <td>NO<sub>2</sub><sup>b</sup></td>
                <td>23,926</td>
                <td>6119</td>
                <td>6332</td>
                <td colspan="2">5292</td>
                <td>1188</td>
                <td>961</td>
              </tr>
              <tr valign="top">
                <td>PM<sub>2.5</sub><sup>c</sup></td>
                <td>24,297</td>
                <td>6745</td>
                <td>6757</td>
                <td colspan="2">4921</td>
                <td>562</td>
                <td>536</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>O<sub>3</sub>: ozone.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>NO<sub>2</sub>: nitrogen dioxide.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>PM<sub>2.5</sub>: fine particulate matter.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Evaluation Metrics</title>
        <p>As we defined this task as a classification problem, we used the standard classification evaluation metrics. We report the accuracy and <italic>F</italic><sub>1</sub>-score of the positive class (the harmonic mean of precision and recall) of the predictions as evaluation metrics for all models. Although accuracy measures the total fraction of correct predictions and could misrepresent model performance in the presence of heavily imbalanced classes, the <italic>F</italic><sub>1</sub>-score considers class imbalance and is, therefore, a more appropriate metric for our problem.</p>
        <graphic xlink:href="formative_v6i12e23422_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <graphic xlink:href="formative_v6i12e23422_fig12.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>Where <italic>TP</italic>, <italic>TN</italic>, <italic>FP</italic>, and <italic>FN</italic> are the number of true positive samples, true negative samples, false positive samples, and false negative samples, respectively.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Overview</title>
        <p>In this section, we first present the findings of the data exploration. Next, we present the principal findings of this study.</p>
      </sec>
      <sec>
        <title>Insights From Collected Data</title>
        <p>In this section, we describe the thresholds of abnormal air pollutant concentrations and present the lag between the search anomalies and air pollution.</p>
        <sec>
          <title>Thresholds of Abnormal Air Pollutant Concentrations</title>
          <p>The major MSAs chosen for this study have different distributions of pollutant concentrations over time, and these concentrations almost always fall below the Environmental Protection Agency standard 24-hour threshold (<xref rid="figure3" ref-type="fig">Figure 3</xref>). However, multiple studies have shown that even at low concentrations, chronic exposure to air pollution negatively affects human health [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. Therefore, calibrating a meaningful threshold for each city, especially those with generally lower levels of air pollution (eg, Miami), may be critical for adequately protecting population health. A natural way to do this may be to set the threshold to 1 SD above the mean daily pollutant concentration within each city, which was adopted in this study. The input predictors were also normalized within each city to reflect the city-level dynamics. The resulting thresholds for the 3 pollutants and cities under investigation are reported in <xref ref-type="table" rid="table3">Table 3</xref>.</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Distribution of pollution values for Atlanta, Los Angeles, Philadelphia, and Miami, with city-specific elevated pollution level (dashed line) and the general Environmental Protection Agency–mandated standard (dotted line), for ozone (O<sub>3</sub>; left column), nitrogen dioxide (NO<sub>2</sub>; middle column), and fine particulate matter (PM<sub>2.5</sub>; right column). EPA: Environmental Protection Agency.</p>
            </caption>
            <graphic xlink:href="formative_v6i12e23422_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <table-wrap position="float" id="table3">
            <label>Table 3</label>
            <caption>
              <p>Classification thresholds for 3 pollutants across 10 major metropolitan statistical areas in the United States.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="130"/>
              <col width="100"/>
              <col width="150"/>
              <col width="100"/>
              <col width="70"/>
              <col width="70"/>
              <col width="70"/>
              <col width="90"/>
              <col width="70"/>
              <col width="70"/>
              <col width="80"/>
              <thead>
                <tr valign="top">
                  <td>Pollutant</td>
                  <td>Los Angeles</td>
                  <td>District of Columbia</td>
                  <td>Philadelphia</td>
                  <td>Dallas</td>
                  <td>Atlanta</td>
                  <td>Boston</td>
                  <td>New York</td>
                  <td>Miami</td>
                  <td>Chicago</td>
                  <td>Houston</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>O<sub>3</sub><sup>a</sup> (ppb<sup>b</sup>)</td>
                  <td>55</td>
                  <td>54</td>
                  <td>53</td>
                  <td>53</td>
                  <td>53</td>
                  <td>48</td>
                  <td>49</td>
                  <td>45</td>
                  <td>49</td>
                  <td>49</td>
                </tr>
                <tr valign="top">
                  <td>NO<sub>2</sub><sup>c</sup> (ppb)</td>
                  <td>43.7</td>
                  <td>38.1</td>
                  <td>36</td>
                  <td>25.2</td>
                  <td>27.8</td>
                  <td>30.7</td>
                  <td>45.3</td>
                  <td>25.5</td>
                  <td>43.7</td>
                  <td>27.7</td>
                </tr>
                <tr valign="top">
                  <td>PM<sub>2.5</sub><sup>d</sup> (µg/m<sup>3</sup>)</td>
                  <td>18.7</td>
                  <td>15.1</td>
                  <td>16.4</td>
                  <td>13.1</td>
                  <td>15.6</td>
                  <td>12.4</td>
                  <td>13.9</td>
                  <td>10.6</td>
                  <td>16.2</td>
                  <td>14.4</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table3fn1">
                <p><sup>a</sup>O<sub>3</sub>: ozone.</p>
              </fn>
              <fn id="table3fn2">
                <p><sup>b</sup>ppb: parts per billion.</p>
              </fn>
              <fn id="table3fn3">
                <p><sup>c</sup>NO<sub>2</sub>: nitrogen dioxide.</p>
              </fn>
              <fn id="table3fn4">
                <p><sup>d</sup>PM<sub>2.5</sub>: fine particulate matter.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Lag Between Search Anomalies and Air Pollution</title>
          <p>A previous study showed that there could be a lag between incident occurrence and Google search activity [<xref ref-type="bibr" rid="ref40">40</xref>]. As shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>, the normalized search frequency of the term “cough” is correlated with the concentration of NO<sub>2</sub> in Atlanta with a certain lag of time. To determine the lag between elevated pollution levels and consequent pollution-related searches, the mean absolute Spearman correlation between pollutant concentrations and search interest data was calculated with the search interest data shifted forward in time by 0, 1, 2, and 3 days. As shown in <xref ref-type="table" rid="table4">Table 4</xref>, for O<sub>3</sub> and PM<sub>2.5</sub>, the mean absolute Spearman correlation increased with an increase in the shifted days. Considering that the task aimed to detect elevated pollution levels as soon as possible, a lag of 1 day was applied to search data. In other words, the search interest data from the current day were used to estimate whether air pollution was elevated on the previous day.</p>
          <fig id="figure4" position="float">
            <label>Figure 4</label>
            <caption>
              <p>Daily nitrogen dioxide (NO<sub>2</sub>) levels and search interest for the term “cough” in October 2016 in Atlanta.</p>
            </caption>
            <graphic xlink:href="formative_v6i12e23422_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <table-wrap position="float" id="table4">
            <label>Table 4</label>
            <caption>
              <p>Cross-correlation of top 5 search terms with different lags for 3 pollutants in the Atlanta metropolitan area in 2016 (N=366).</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="80"/>
              <col width="160"/>
              <col width="70"/>
              <col width="160"/>
              <col width="70"/>
              <col width="160"/>
              <col width="70"/>
              <col width="160"/>
              <col width="70"/>
              <thead>
                <tr valign="top">
                  <td>Pollutant</td>
                  <td>Lag=0; search term (Spearman correlation)</td>
                  <td><italic>P</italic> value</td>
                  <td>Lag=1; search term (Spearman correlation)</td>
                  <td><italic>P</italic> value</td>
                  <td>Lag=2; search term (Spearman correlation)</td>
                  <td><italic>P</italic> value</td>
                  <td>Lag=3; search term (Spearman correlation)</td>
                  <td><italic>P</italic> value</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="9">
                    <bold>O<sub>3</sub><sup>a</sup></bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Cough (−0.34)</td>
                  <td>&lt;.001</td>
                  <td>Cough (−0.38)</td>
                  <td>&lt;.001</td>
                  <td>Cough (−0.41)</td>
                  <td>&lt;.001</td>
                  <td>Cough (−0.41)</td>
                  <td>&lt;.001</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bronchitis (−0.31)</td>
                  <td>&lt;.001</td>
                  <td>Bronchitis (−0.32)</td>
                  <td>&lt;.001</td>
                  <td>Bronchitis (−0.33)</td>
                  <td>&lt;.001</td>
                  <td>Bronchitis (−0.35)</td>
                  <td>&lt;.001</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Traffic (0.26)</td>
                  <td>&lt;.001</td>
                  <td>Traffic (0.27)</td>
                  <td>&lt;.001</td>
                  <td>Traffic (0.26)</td>
                  <td>&lt;.001</td>
                  <td>Smoke (0.24)</td>
                  <td>&lt;.001</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Smoke (0.23)</td>
                  <td>&lt;.001</td>
                  <td>Chest pain (−0.23)</td>
                  <td>&lt;.001</td>
                  <td>Chest pain (−0.23)</td>
                  <td>&lt;.001</td>
                  <td>Traffic (0.23)</td>
                  <td>&lt;.001</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Snoring (0.22)</td>
                  <td>&lt;.001</td>
                  <td>Snoring (0.22)</td>
                  <td>&lt;.001</td>
                  <td>Smoke (0.22)</td>
                  <td>&lt;.001</td>
                  <td>Chest pain (−0.22)</td>
                  <td>&lt;.001</td>
                </tr>
                <tr valign="top">
                  <td colspan="9">
                    <bold>NO<sub>2</sub><sup>b</sup></bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Asthma (0.20)</td>
                  <td>&lt;.001</td>
                  <td>Sulfate (0.20)</td>
                  <td>&lt;.001</td>
                  <td>Sulfate (0.16)</td>
                  <td>.002</td>
                  <td>Cough (0.16)</td>
                  <td>.002</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Sulfate (0.19)</td>
                  <td>&lt;.001</td>
                  <td>Bronchitis (0.16)</td>
                  <td>.002</td>
                  <td>Bronchitis (0.15)</td>
                  <td>.005</td>
                  <td>COPD<sup>c</sup> (−0.16)</td>
                  <td>.003</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Cough (0.17)</td>
                  <td>&lt;.001</td>
                  <td>Inhaler (0.15)</td>
                  <td>.005</td>
                  <td>Cough (0.14)</td>
                  <td>.008</td>
                  <td>Bronchitis (0.14)</td>
                  <td>.008</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bronchitis (0.17)</td>
                  <td>.001</td>
                  <td>Cough (0.14)</td>
                  <td>.006</td>
                  <td>Inhaler (0.11)</td>
                  <td>.03</td>
                  <td>Wheezing (−0.12)</td>
                  <td>.02</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Inhaler (0.16)</td>
                  <td>.002</td>
                  <td>Difficulty breathing (−0.12)</td>
                  <td>.02</td>
                  <td>Headache (−0.11)</td>
                  <td>.03</td>
                  <td>Headache (−0.10)</td>
                  <td>.04</td>
                </tr>
                <tr valign="top">
                  <td colspan="9">
                    <bold>PM<sub>2.5</sub><sup>d</sup></bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Wildfires (0.14)</td>
                  <td>.009</td>
                  <td>COPD (−0.15)</td>
                  <td>.005</td>
                  <td>Air pollution (0.19)</td>
                  <td>&lt;.001</td>
                  <td>Air pollution (0.18)</td>
                  <td>&lt;.001</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>COPD (−0.11)</td>
                  <td>.03</td>
                  <td>Wildfires (0.14)</td>
                  <td>.007</td>
                  <td>COPD (−0.17)</td>
                  <td>.001</td>
                  <td>COPD (−0.18)</td>
                  <td>&lt;.001</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Snoring (0.11)</td>
                  <td>.03</td>
                  <td>Air pollution (0.14)</td>
                  <td>.008</td>
                  <td>Wildfires (0.14)</td>
                  <td>.009</td>
                  <td>Wildfires (0.15)</td>
                  <td>.004</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Inhaler (0.10)</td>
                  <td>.06</td>
                  <td>Asthma attack (0.11)</td>
                  <td>.04</td>
                  <td>Respiratory illness (0.10)</td>
                  <td>.05</td>
                  <td>Sulfate (−0.11)</td>
                  <td>.03</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Difficulty breathing (−0.09)</td>
                  <td>.08</td>
                  <td>Respiratory illness (0.10)</td>
                  <td>.05</td>
                  <td>Traffic (0.10)</td>
                  <td>.06</td>
                  <td>Traffic (0.11)</td>
                  <td>.04</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table4fn1">
                <p><sup>a</sup>O<sub>3</sub>: ozone.</p>
              </fn>
              <fn id="table4fn2">
                <p><sup>b</sup>NO<sub>2</sub>: nitrogen dioxide.</p>
              </fn>
              <fn id="table4fn3">
                <p><sup>c</sup>COPD: chronic obstructive pulmonary disease.</p>
              </fn>
              <fn id="table4fn4">
                <p><sup>d</sup>PM<sub>2.5</sub>: fine particulate matter.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
      </sec>
      <sec>
        <title>Evaluation Outcomes</title>
        <p>In this section, we consider 3 conditions to evaluate the performance of using web search data to detect elevated pollution, that is, using only search data, using search data as auxiliary data for meteorological data, and using search data as auxiliary data for meteorological data and historical pollutant concentrations.</p>
        <sec>
          <title>Using Only Search Data</title>
          <p>For areas where ambient pollution monitoring is unavailable, investigating whether web search data can be used as the only signal for nowcasting elevated air pollution is a vital question. When relying only on search data for air pollution prediction, both the proposed DL-LSTM architecture and STE contribute to the improvement of prediction accuracy. As shown in the “Search” section of <xref ref-type="table" rid="table5">Table 5</xref>, the LSTM-based models exhibited superior accuracy over the baseline LR and RF models for O<sub>3</sub> and NO<sub>2</sub>. For PM<sub>2.5</sub>, the proposed models did not perform better than the baseline LR or LSTM model because the validation and test data sets were heavily imbalanced (<xref ref-type="table" rid="table5">Table 5</xref>). The proposed DL-LSTM w/STE model achieved the highest <italic>F</italic><sub>1</sub>-score (32.44% for O<sub>3</sub> and 27.70% for NO<sub>2</sub>) for detecting O<sub>3</sub> and NO<sub>2</sub> pollution.</p>
          <table-wrap position="float" id="table5">
            <label>Table 5</label>
            <caption>
              <p>Accuracy and <italic>F</italic><sub>1</sub>-score of the logistic regression, random forest, and long short-term memory models for detecting elevated pollution across 10 major US cities, for varying input feature combinations: no prior knowledge, search data only (Search), meteorological data only (Met), meteorological data and search data (Met+Search), meteorological data and historical pollutant concentration (Met+Pol), and all input features (Met+Pol+Search).</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="310"/>
              <col width="220"/>
              <col width="220"/>
              <col width="220"/>
              <thead>
                <tr valign="bottom">
                  <td colspan="2">Features and model</td>
                  <td>O<sub>3</sub><sup>a</sup>, accuracy (<italic>F</italic><sub>1</sub>-score; %)</td>
                  <td>NO<sub>2</sub><sup>b</sup>, accuracy (<italic>F</italic><sub>1</sub>-score; %)</td>
                  <td>PM<sub>2.5</sub><sup>c</sup>, accuracy (<italic>F</italic><sub>1</sub>-score; %)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="5">
                    <bold>No prior knowledge</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>All positives</td>
                  <td>13.46 (23.73)</td>
                  <td>13.18 (23.28)</td>
                  <td>7.35 (13.69)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>All negatives</td>
                  <td>86.54 (0.0)</td>
                  <td>86.82 (0.0)</td>
                  <td>92.65 (0.0)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Random (prob of positive=0.5)</td>
                  <td>50.29 (20.63)</td>
                  <td>50.56 (20.68)</td>
                  <td>50.65 (12.67)</td>
                </tr>
                <tr valign="top">
                  <td colspan="5">
                    <bold>Search</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LR<sup>d</sup></td>
                  <td>36.93 (17.77)</td>
                  <td>53.97 (24.17)</td>
                  <td>78.29 (10.72)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>RF<sup>e</sup></td>
                  <td>33.53 (23.36)</td>
                  <td>55.22 (18.1)</td>
                  <td>
                    <italic>92.65</italic>
                    <sup>f</sup>
                    <italic>(0.0)</italic>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM<sup>g</sup></td>
                  <td>46.73 (23.63)</td>
                  <td>69.68 (21.62)</td>
                  <td>89.96 (7.58)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM-GloVe<sup>h</sup></td>
                  <td>53.23 (28.45)</td>
                  <td>63.44 (27.4)</td>
                  <td>90.09 (3.73)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM-GloVe w/STE<sup>i</sup></td>
                  <td>69.17 (28.04)</td>
                  <td>46.85 (26.51)</td>
                  <td>91.73 (1.31)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>DL-LSTM<sup>j</sup></td>
                  <td>62.46 (30.4)</td>
                  <td>65.99 (26.19)</td>
                  <td>88.61 (7.97)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>DL-LSTM w/STE</td>
                  <td>69.61 (32.44)</td>
                  <td>56.84 (27.7)</td>
                  <td>87.59 (6.99)</td>
                </tr>
                <tr valign="top">
                  <td colspan="5">
                    <bold>Met</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LR</td>
                  <td>62.57 (39.81)</td>
                  <td>63.64 (37.25)</td>
                  <td>58.58 (22)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>RF</td>
                  <td>78.76 (50.59)</td>
                  <td>71.77 (39.88)</td>
                  <td>73.78 (24.67)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM</td>
                  <td>76.54 (48.29)</td>
                  <td>72.52 (41.27)</td>
                  <td>67.89 (24.69)</td>
                </tr>
                <tr valign="top">
                  <td colspan="5">
                    <bold>Met+search</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LR</td>
                  <td>55.99 (36.56)</td>
                  <td>62 (36.25)</td>
                  <td>61.25 (21.5)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>RF</td>
                  <td>81.39 (45.35)</td>
                  <td>73.77 (38.71)</td>
                  <td>87.96 (23.78)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM</td>
                  <td>78.18 (47.65)</td>
                  <td>77.75 (40.31)</td>
                  <td>88.14 (21.29)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM-GloVe</td>
                  <td>80.04 (49.37)</td>
                  <td>72.75 (40.35)</td>
                  <td>85.38 (26.99)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM-GloVe w/STE</td>
                  <td>81.85 (50.71)</td>
                  <td>74.21 (41.49)</td>
                  <td>85.42 (26.13)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>DL-LSTM</td>
                  <td>77.97 (48.94)</td>
                  <td>74.81 (40.53)</td>
                  <td>84.94 (24.07)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>DL-LSTM w/STE</td>
                  <td>80.16 (49.32)</td>
                  <td>72.99 (40.34)</td>
                  <td>87.04 (21.32)</td>
                </tr>
                <tr valign="top">
                  <td colspan="5">
                    <bold>Met+pol</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LR</td>
                  <td>67.38 (44.61)</td>
                  <td>70.05 (44.09)</td>
                  <td>74.45 (32.82)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>RF</td>
                  <td>82.81 (57.23)</td>
                  <td>80.35 (51.24)</td>
                  <td>86.45 (40.63)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM</td>
                  <td>86.97 (63.01)</td>
                  <td>84.64 (55.59)</td>
                  <td>85.25 (43.19)</td>
                </tr>
                <tr valign="top">
                  <td colspan="5">
                    <bold>Met+pol+search</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LR</td>
                  <td>66.91 (43.71)</td>
                  <td>69.13 (43.6)</td>
                  <td>74.45 (32.82)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>RF</td>
                  <td>82.76 (55.91)</td>
                  <td>78.91 (47.72)</td>
                  <td>89.43 (37.57)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM</td>
                  <td>87.11 (61.54)</td>
                  <td>84.71 (54.02)</td>
                  <td>90.74 (44.81)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM-GloVe</td>
                  <td>87.94 (63.81)</td>
                  <td>82.98 (53.78)</td>
                  <td>88.19 (46.55)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LSTM-GloVe w/STE</td>
                  <td>87.63 (63.83)</td>
                  <td>83.81 (54.59)</td>
                  <td>88.24 (46.51)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>DL-LSTM</td>
                  <td>87.30 (63.02)</td>
                  <td>82.65 (53.65)</td>
                  <td>89.66 (47.35)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>DL-LSTM w/STE</td>
                  <td>87.60 (63.61)</td>
                  <td>83.40 (53.58)</td>
                  <td>89.25 (46.59)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table5fn1">
                <p><sup>a</sup>O<sub>3</sub>: ozone.</p>
              </fn>
              <fn id="table5fn2">
                <p><sup>b</sup>NO<sub>2</sub>: nitrogen dioxide.</p>
              </fn>
              <fn id="table5fn3">
                <p><sup>c</sup>PM<sub>2.5</sub>: fine particulate matter.</p>
              </fn>
              <fn id="table5fn4">
                <p><sup>d</sup>LR: logistic regression.</p>
              </fn>
              <fn id="table5fn5">
                <p><sup>e</sup>RF: random forest.</p>
              </fn>
              <fn id="table5fn6">
                <p><sup>f</sup>This high accuracy is simply due to class imbalance; this model always predicts negative class, and the corresponding <italic>F</italic><sub>1</sub>-score is 0.</p>
              </fn>
              <fn id="table5fn7">
                <p><sup>g</sup>LSTM: long short-term memory.</p>
              </fn>
              <fn id="table5fn8">
                <p><sup>h</sup>GloVe: Global Vectors for Word Representation.</p>
              </fn>
              <fn id="table5fn9">
                <p><sup>i</sup>STE: search term expansion.</p>
              </fn>
              <fn id="table5fn10">
                <p><sup>j</sup>DL-LSTM: dictionary learner-long short-term memory.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Using Search Data and Meteorological Data</title>
          <p>When meteorological data were available, we investigated the feasibility of using meteorological data with or without search activity data to nowcast air pollution under this condition. As shown in the “Met” and “Met+Search” sections of <xref ref-type="table" rid="table5">Table 5</xref>, the inclusion of web search data improves the nowcasting accuracy for all 3 pollutants. In addition, the LSTM-GloVe w/STE model achieved the highest <italic>F</italic><sub>1</sub>-score (50.71% for O<sub>3</sub> and 41.49% for NO<sub>2</sub>) for the detection of O<sub>3</sub> and NO<sub>2</sub> pollution. The LSTM-GloVe without STE model achieved the highest <italic>F</italic><sub>1</sub>-score (26.99%) for detecting PM<sub>2.5</sub> pollution.</p>
        </sec>
        <sec>
          <title>Using Search Data, Meteorological Data, and Historical Pollutant Concentration</title>
          <p>When historical pollution concentration is available, search activity data are added as auxiliary data to both meteorological data and historical pollution data. As shown in the “Met+Pol” and “Met+Pol+Search” sections of <xref ref-type="table" rid="table5">Table 5</xref>, the inclusion of web search data improves the nowcasting accuracy for O<sub>3</sub> and PM<sub>2.5</sub>. However, for NO<sub>2</sub>, the inclusion of web search data does not improve the nowcasting accuracy, which indicates that increases in NO<sub>2</sub> concentrations may not be sufficiently noticeable to people to increase their search interest. This difference in the performance for different pollutants and locations merits further investigation.</p>
        </sec>
      </sec>
      <sec>
        <title>City-Level Analysis of O<sub>3</sub> Pollution Prediction</title>
        <p>We investigated the potential of using search interest and meteorological data to replace ground-based O<sub>3</sub> sensor data for predicting O<sub>3</sub> pollution in individual cities. As shown in <xref ref-type="table" rid="table6">Table 6</xref>, including search interest data (Met+Search) to augment purely meteorological data (Met) increases both the accuracy and <italic>F</italic><sub>1</sub>-score metrics for most cities. Although these metrics do not reach the performance achieved when ground-level pollution sensors are available (Met+Pol), at least for two of the major MSAs (Philadelphia and Houston), search volume data indeed provide a useful alternative to pollution monitors, with only 1.6% and 0.14% degradation in accuracy, respectively. In addition, the differences in model performance across different cities indicate that web-based search patterns could vary from city to city. As shown in <xref ref-type="table" rid="table7">Table 7</xref>, the top 5 correlated terms differ across US cities over 10 years. The variation in search patterns could lead to degraded prediction performance in certain areas, leaving promising directions for improvement.</p>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>City-level accuracy and <italic>F</italic><sub>1</sub>-score for detecting elevated ozone pollution in 10 US cities, with Met (long short-term memory model), Met+Search (dictionary learner-long short-term memory w/search term expansion) and Met+Pol (long short-term memory model) as features.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="100"/>
            <col width="0"/>
            <col width="100"/>
            <col width="0"/>
            <col width="150"/>
            <col width="0"/>
            <col width="100"/>
            <col width="0"/>
            <col width="70"/>
            <col width="0"/>
            <col width="70"/>
            <col width="0"/>
            <col width="70"/>
            <col width="0"/>
            <col width="90"/>
            <col width="0"/>
            <col width="70"/>
            <col width="0"/>
            <col width="70"/>
            <col width="0"/>
            <col width="80"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Features</td>
                <td colspan="2">Los Angeles</td>
                <td colspan="2">District of Columbia</td>
                <td colspan="2">Philadelphia</td>
                <td colspan="2">Dallas</td>
                <td colspan="2">Atlanta</td>
                <td colspan="2">Boston</td>
                <td colspan="2">New York</td>
                <td colspan="2">Miami</td>
                <td colspan="2">Chicago</td>
                <td>Houston</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="22">
                  <bold>Accuracy, %</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Met<sup>a</sup></td>
                <td colspan="2">72.6</td>
                <td colspan="2">77.4</td>
                <td colspan="2">83.29</td>
                <td colspan="2">83.42</td>
                <td colspan="2">83.56</td>
                <td colspan="2">75.62</td>
                <td colspan="2">68.36</td>
                <td colspan="2">58.09</td>
                <td colspan="2">76.71</td>
                <td colspan="2">85.89</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Met+search</td>
                <td colspan="2">76.71</td>
                <td colspan="2">80.68</td>
                <td colspan="2">87.4</td>
                <td colspan="2">79.86</td>
                <td colspan="2">83.84</td>
                <td colspan="2">78.63</td>
                <td colspan="2">74.93</td>
                <td colspan="2">69.29</td>
                <td colspan="2">80</td>
                <td colspan="2">90.14</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Met+pol<sup>b</sup></td>
                <td colspan="2">85.89</td>
                <td colspan="2">86.99</td>
                <td colspan="2">89.04</td>
                <td colspan="2">89.04</td>
                <td colspan="2">88.22</td>
                <td colspan="2">84.66</td>
                <td colspan="2">86.85</td>
                <td colspan="2">82.02</td>
                <td colspan="2">86.85</td>
                <td colspan="2">90</td>
              </tr>
              <tr valign="top">
                <td colspan="22">
                  <bold><italic>F</italic><sub>1</sub>-score, %</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Met</td>
                <td colspan="2">51.69</td>
                <td colspan="2">48.28</td>
                <td colspan="2">53.79</td>
                <td colspan="2">53.28</td>
                <td colspan="2">48.72</td>
                <td colspan="2">46.06</td>
                <td colspan="2">44.07</td>
                <td colspan="2">32.52</td>
                <td colspan="2">56.19</td>
                <td colspan="2">57.26</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Met+search</td>
                <td colspan="2">54.3</td>
                <td colspan="2">50.53</td>
                <td colspan="2">58.56</td>
                <td colspan="2">41.9</td>
                <td colspan="2">42.72</td>
                <td colspan="2">48</td>
                <td colspan="2">47.86</td>
                <td colspan="2">35.84</td>
                <td colspan="2">57.56</td>
                <td colspan="2">59.09</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Met+pol</td>
                <td colspan="2">68.11</td>
                <td colspan="2">60.58</td>
                <td colspan="2">64.29</td>
                <td colspan="2">64.6</td>
                <td colspan="2">56.12</td>
                <td colspan="2">55.56</td>
                <td colspan="2">63.64</td>
                <td colspan="2">55.48</td>
                <td colspan="2">70.73</td>
                <td colspan="2">67.26</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table6fn1">
              <p><sup>a</sup>Met: meteorological data.</p>
            </fn>
            <fn id="table6fn2">
              <p><sup>b</sup>Pol: pollution data.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table7">
          <label>Table 7</label>
          <caption>
            <p>Top 5 correlated search terms for ozone pollution in 10 US cities: January 1, 2010, to December 31, 2019.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="470"/>
            <col width="0"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td colspan="3">City and search term</td>
                <td>Spearman correlation (lag=1)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="4">
                  <bold>Los Angeles</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cough</td>
                <td colspan="2">−0.40</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Bronchitis</td>
                <td colspan="2">−0.33</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Wildfires</td>
                <td colspan="2">0.24</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Traffic</td>
                <td colspan="2">0.14</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Respiratory infection</td>
                <td colspan="2">−0.12</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>District of Columbia</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Bronchitis</td>
                <td colspan="2">−0.25</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cough</td>
                <td colspan="2">−0.25</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Coughing</td>
                <td colspan="2">−0.19</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Headache</td>
                <td colspan="2">−0.14</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Wildfires</td>
                <td colspan="2">0.13</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>Philadelphia</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cough</td>
                <td colspan="2">−0.33</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Traffic</td>
                <td colspan="2">0.27</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Bronchitis</td>
                <td colspan="2">−0.20</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Organic carbon</td>
                <td colspan="2">−0.10</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Respiratory infection</td>
                <td colspan="2">−0.09</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>Dallas</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cough</td>
                <td colspan="2">−0.25</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Bronchitis</td>
                <td colspan="2">−0.24</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Ozone</td>
                <td colspan="2">0.17</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Wildfires</td>
                <td colspan="2">0.15</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Coughing</td>
                <td colspan="2">−0.14</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>Atlanta</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Bronchitis</td>
                <td colspan="2">−0.14</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cough</td>
                <td colspan="2">−0.11</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Chest pain</td>
                <td colspan="2">−0.10</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Respiratory infection</td>
                <td colspan="2">−0.09</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Wheezing</td>
                <td colspan="2">−0.07</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>Boston</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Smoke</td>
                <td colspan="2">−0.11</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Haze</td>
                <td colspan="2">−0.07</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Code red</td>
                <td colspan="2">−0.06</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Coughing</td>
                <td colspan="2">0.06</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Smog</td>
                <td colspan="2">0.05</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>New York</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Bronchitis</td>
                <td colspan="2">−0.31</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Traffic</td>
                <td colspan="2">0.29</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cough</td>
                <td colspan="2">−0.25</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Wildfires</td>
                <td colspan="2">0.19</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Wheezing</td>
                <td colspan="2">−0.15</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>Miami</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Bronchitis</td>
                <td colspan="2">0.14</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Air pollution</td>
                <td colspan="2">0.13</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cough</td>
                <td colspan="2">0.13</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Power plants</td>
                <td colspan="2">0.09</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Nitrogen dioxide</td>
                <td colspan="2">0.08</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>Chicago</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Wildfires</td>
                <td colspan="2">0.18</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Smoke</td>
                <td colspan="2">0.08</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Shortness of breath</td>
                <td colspan="2">0.04</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Heart murmur</td>
                <td colspan="2">0.04</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Tail pipe</td>
                <td colspan="2">0.04</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>Houston</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Ozone</td>
                <td colspan="2">0.12</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Air pollution</td>
                <td colspan="2">0.12</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Asthma</td>
                <td colspan="2">0.06</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Organic carbon</td>
                <td colspan="2">0.05</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Wildfires</td>
                <td colspan="2">0.05</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Sensitivity Analysis of Air Pollution Thresholds</title>
        <p>Classification thresholds play an important role in our model. In this study, an SD threshold from the mean of the corresponding pollutants was used as a “probability threshold” to detect air pollution at a spatial-temporal resolution. However, the proposed method is sensitive to this threshold. We further investigated the performance of the proposed method using a variety of fixed classification thresholds. As shown in <xref rid="figure5" ref-type="fig">Figures 5</xref>-<xref rid="figure7" ref-type="fig">7</xref>, we fixed the classification thresholds for all 10 cities to detect O<sub>3</sub>, NO<sub>2</sub>, and PM<sub>2.5</sub> pollution. The results show that the meteorological and search data are complementary, and combining the search and meteorological data leads to better prediction performance for all classification thresholds.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Accuracy (left figure) and <italic>F</italic><sub>1</sub>-score (right figure) for detecting ozone (O<sub>3</sub>) pollution on various classification thresholds, with Met (long short-term memory model) and Met+Search (dictionary learner-long short-term memory w/search term expansion) as features. Met: meteorological data; ppb: parts per billion.</p>
          </caption>
          <graphic xlink:href="formative_v6i12e23422_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Accuracy (left figure) and <italic>F</italic><sub>1</sub>-score (right figure) for detecting nitrogen dioxide (NO<sub>2</sub>) pollution on various classification thresholds, with Met (long short-term memory model) and Met+Search (dictionary learner-long short-term memory w/search term expansion) as features. Met: meteorological data; ppb: parts per billion.</p>
          </caption>
          <graphic xlink:href="formative_v6i12e23422_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure7" position="float">
          <label>Figure 7</label>
          <caption>
            <p>Accuracy (left figure) and <italic>F</italic><sub>1</sub>-score (right figure) for detecting fine particulate matter (PM<sub>2.5</sub>) pollution on various classification thresholds, with Met (long short-term memory model) and Met+Search (dictionary learner-long short-term memory w/search term expansion) as features. Met: meteorological data.</p>
          </caption>
          <graphic xlink:href="formative_v6i12e23422_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>In this study, we explored various existing air pollution prediction models and found that the use of a time series neural network approach achieved the highest predictive accuracy in most of our experiments. The results showed that the LSTM-based models achieved superior accuracy for the 3 air pollutants when both meteorological data and web search data were available. Furthermore, our results on the inclusion of web search data with meteorological data indicate that under short reporting delays, the LSTM models could provide highly accurate predictions compared with baseline models using meteorological and historical pollution concentration data.</p>
        <p>Compared with existing studies that predict urban air pollution concentrations using linear and nonlinear machine learning models [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref41">41</xref>-<xref ref-type="bibr" rid="ref47">47</xref>], our proposed method can predict air pollution when source emissions and remotely sensed satellite data are unavailable (eg, remotely sensed satellite data often suffer from a high missing rate owing to frequent cloud cover [<xref ref-type="bibr" rid="ref48">48</xref>]). Previous studies using web-based search behavior have emphasized the use of Google Trends [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref49">49</xref>] and applied regularized linear regression to collinear web search queries to estimate disease rates from social media or web-based search data [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref50">50</xref>-<xref ref-type="bibr" rid="ref54">54</xref>]. Our research further explored the possibility of using LSTM models with semantic embeddings of search queries to predict air pollution. As shown in <xref rid="figure8" ref-type="fig">Figures 8</xref> and <xref rid="figure9" ref-type="fig">9</xref>, the semantic embeddings of search terms fine-tuned by the DL-LSTM model are less correlated compared with their initial GloVe embeddings, which shows that the collinearity between search terms is reduced during the training process.</p>
        <p>We also explored various combinations of search terms and found that a comprehensive set of user queries was critical for accurately capturing people’s responses to urban air pollution. In this study, we expanded the initial set of seed terms using semantic and temporal correlations with search queries from Google Correlate. We investigated the contribution of different search term groups by manually classifying the search terms into 4 categories, where the unclassified category includes terms with ambiguous meanings. <xref ref-type="table" rid="table8">Table 8</xref> shows the accuracy and <italic>F</italic><sub>1</sub>-score when we removed search terms by categories for predicting O<sub>3</sub>, NO<sub>2</sub>, and PM<sub>2.5</sub> pollution. Removing the search terms in the symptom, observation, and source categories led to a decrease in the accuracy score for detecting at least two pollutants. At the same time, removing the search terms with ambiguous meaning only led to a slightly higher accuracy score for all 3 pollutants.</p>
        <fig id="figure8" position="float">
          <label>Figure 8</label>
          <caption>
            <p>Cosine similarity between GloVe embeddings of seed search terms. GloVe: Global Vectors for Word Representation.</p>
          </caption>
          <graphic xlink:href="formative_v6i12e23422_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure9" position="float">
          <label>Figure 9</label>
          <caption>
            <p>Cosine similarity between trained embeddings of seed search terms.</p>
          </caption>
          <graphic xlink:href="formative_v6i12e23422_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table8">
          <label>Table 8</label>
          <caption>
            <p>Accuracy and <italic>F</italic><sub>1</sub>-score of removing different categories of search terms for detecting ozone, nitrogen dioxide, and fine particulate matter pollution using search (dictionary learner-long short-term memory w/search term expansion) as features.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="330"/>
            <col width="0"/>
            <col width="320"/>
            <col width="0"/>
            <col width="320"/>
            <thead>
              <tr valign="bottom">
                <td colspan="3">Pollutant and terms</td>
                <td colspan="2">Accuracy (change; %)</td>
                <td><italic>F</italic><sub>1</sub>-score (change; %)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="6">
                  <bold>O<sub>3</sub><sup>a</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All</td>
                <td colspan="2">0.6961</td>
                <td colspan="2">0.3244</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All wo<sup>b</sup> symptom</td>
                <td colspan="2">0.647 (−7.1)</td>
                <td colspan="2">0.3024 (−6.8)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All wo observation</td>
                <td colspan="2">0.622 (−10.6)</td>
                <td colspan="2">0.3264 (+0.6)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All wo source</td>
                <td colspan="2">0.6712 (−3.6)</td>
                <td colspan="2">0.3033 (−6.5)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All wo unclassified</td>
                <td colspan="2">0.7057 (+1.4)</td>
                <td colspan="2">0.3273 (+0.9)</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>NO<sub>2</sub><sup>c</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All</td>
                <td colspan="2">0.5684</td>
                <td colspan="2">0.2770</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All wo symptom</td>
                <td colspan="2">0.4452 (−22.0)</td>
                <td colspan="2">0.2418 (−12.7)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All wo observation</td>
                <td colspan="2">0.6125 (+7.8)</td>
                <td colspan="2">0.2480 (−10.5)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All wo source</td>
                <td colspan="2">0.5452 (−4.1)</td>
                <td colspan="2">0.2647 (−4.4)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All wo unclassified</td>
                <td colspan="2">0.6534 (+15.0)</td>
                <td colspan="2">0.2134 (−23.0)</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>PM<sub>2.5</sub><sup>d</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All</td>
                <td colspan="2">0.8759</td>
                <td colspan="2">0.0699</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All wo symptom</td>
                <td colspan="2">0.7897 (−9.8)</td>
                <td colspan="2">0.1029 (+47.2)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All wo observation</td>
                <td colspan="2">0.7496 (−14.4)</td>
                <td colspan="2">0.1049 (+50.1)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All wo source</td>
                <td colspan="2">0.8994 (+2.7)</td>
                <td colspan="2">0.0393 (−43.8)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All wo unclassified</td>
                <td colspan="2">0.8991 (+2.6)</td>
                <td colspan="2">0.0264 (−62.2)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table8fn1">
              <p><sup>a</sup>O<sub>3</sub>: ozone.</p>
            </fn>
            <fn id="table8fn2">
              <p><sup>b</sup>wo: without.</p>
            </fn>
            <fn id="table8fn3">
              <p><sup>c</sup>NO<sub>2</sub>: nitrogen dioxide.</p>
            </fn>
            <fn id="table8fn4">
              <p><sup>d</sup>PM<sub>2.5</sub>: fine particulate matter.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>An analysis of the coefficients of each search term shows that several search terms contribute more than others. The average feature importance of the seed search terms was calculated using the RF model. As shown in Figure S1, Figure S2, and Figure S3 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, search terms including “particular matter,” “rapid breathing,” and “throat irritation” have relatively high feature importance for detecting O<sub>3</sub>, NO<sub>2</sub>, and PM<sub>2.5</sub> pollution, respectively. The results also indicated that no search terms worked best for all 3 pollutants.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>A key limitation of this study is the tuning of the neural network model. First, the performance of neural network models is sensitive to several hyperparameters, including optimization choices, depth, width, and regularization. Owing to computational limitations, we adopted a simple LSTM architecture with a single 128-unit hidden layer and tuned the model using validation data sets for other hyperparameters. In addition, we noticed that stochastic components such as the random seed for the RF model and the randomness in the optimization process of LSTM models influenced the interpretation of the results. Therefore, we repeated the experiments 10 times with different random seeds for the RF and LSTM models. As the time cost of repeating LSTM models is high, we only repeated the RF, LSTM, and DL-LSTM models 10 times to predict O<sub>3</sub> pollution with all input features. The mean accuracy of the DL-LSTM model was 0.8744 (SD 0.0046). Compared with the LSTM model (mean 0.8714, SD 0.0036), the improvement was not significant (<italic>P</italic>=.11). Compared with the RF model (mean 0.8273, SD 0.0017), the improvement was significant (<italic>P</italic>&lt;.001). The mean <italic>F</italic><sub>1</sub>-score for the DL-LSTM model was 0.6314 (SD 0.0058). Compared with both the LSTM (mean 0.6019, SD 0.0096) and RF models (mean 0.5588, SD 0.0024), the improvements were significant (<italic>P</italic>&lt;.001), which shows that the results of the LSTM models are stable. There is room for further exploration of more sophisticated neural network model architectures for noninfectious disease prediction [<xref ref-type="bibr" rid="ref55">55</xref>-<xref ref-type="bibr" rid="ref57">57</xref>]. We leave the exploration of deeper and wider architectures to future work.</p>
        <p>Another limitation relates to the biases introduced by relying on search data, which may not reflect the underlying population demographics or experiences. Although some of these issues are alleviated automatically by training a model against ground sensor pollution levels, understanding and correcting these data biases requires further study. In the future, we plan to investigate other sources of crowd-based surveillance data, such as self-reports on social media, to augment traditional physical sensor methods, thus providing a more direct, human-centered measure of how people experience elevated air pollution levels.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this study, we posit that although web search data cannot yet completely replace ground-based pollution monitors, it may already serve as a valuable additional signal to augment ground-based pollution data, providing significant accuracy improvements for detecting unusual spikes in air pollution. We also found that the correlation between search terms and pollution concentration varies at the city level. Therefore, the model must be fine-tuned when applied to specific cities. For model and search term selection, we used the simplest LSTM architecture with a dictionary learner module and found that no search terms worked best for all 3 pollutants. We propose the use of our model to learn the semantic correlations between available search terms to obtain better prediction results.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>The descriptions of search terms, data source, and model hyperparameters.</p>
        <media xlink:href="formative_v6i12e23422_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 107 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Average feature importance for detecting ozone, nitrogen dioxide, and fine particulate matter pollution using random forest models.</p>
        <media xlink:href="formative_v6i12e23422_app2.pdf" xlink:title="PDF File  (Adobe PDF File), 385 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">DL-LSTM</term>
          <def>
            <p>dictionary learner-long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">GloVe</term>
          <def>
            <p>Global Vectors for Word Representation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">LR</term>
          <def>
            <p>logistic regression</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">LSTM</term>
          <def>
            <p>long short-term memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">MSA</term>
          <def>
            <p>metropolitan statistical area</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">NO<sub>2</sub></term>
          <def>
            <p>nitrogen dioxide</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">O<sub>3</sub></term>
          <def>
            <p>ozone</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">PM<sub>2.5</sub></term>
          <def>
            <p>fine particulate matter</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">RF</term>
          <def>
            <p>random forest</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">RNN</term>
          <def>
            <p>recurrent neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">STE</term>
          <def>
            <p>search term expansion</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by grants from the National Institutes of Health National Library of Medicine (R21LM013014). The funders had no role in the study design, data collection and analysis, decision to publish, or preparation of the manuscript.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brynjolfsson</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Geva</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Reichman</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Crowd-squared: amplifying the predictive power of search trend data</article-title>
          <source>MIS Q</source>
          <year>2016</year>
          <month>4</month>
          <day>4</day>
          <volume>40</volume>
          <issue>4</issue>
          <fpage>941</fpage>
          <lpage>61</lpage>
          <pub-id pub-id-type="doi">10.25300/MISQ/2016/40.4.07</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fung</surname>
              <given-names>IC</given-names>
            </name>
            <name name-style="western">
              <surname>Tse</surname>
              <given-names>ZT</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>KW</given-names>
            </name>
          </person-group>
          <article-title>The use of social media in public health surveillance</article-title>
          <source>Western Pac Surveill Response J</source>
          <year>2015</year>
          <month>6</month>
          <day>26</day>
          <volume>6</volume>
          <issue>2</issue>
          <fpage>3</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/26306208"/>
          </comment>
          <pub-id pub-id-type="doi">10.5365/WPSAR.2015.6.1.019</pub-id>
          <pub-id pub-id-type="medline">26306208</pub-id>
          <pub-id pub-id-type="pii">WPSAR.2015.6.2-003</pub-id>
          <pub-id pub-id-type="pmcid">PMC4542478</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hill</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Merchant</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ungar</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Lessons learned about public health from online crowd surveillance</article-title>
          <source>Big Data</source>
          <year>2013</year>
          <month>10</month>
          <day>10</day>
          <volume>1</volume>
          <issue>3</issue>
          <fpage>160</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/25045598"/>
          </comment>
          <pub-id pub-id-type="doi">10.1089/big.2013.0020</pub-id>
          <pub-id pub-id-type="medline">25045598</pub-id>
          <pub-id pub-id-type="pmcid">PMC4102381</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Broniatowski</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Paul</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>National and local influenza surveillance through Twitter: an analysis of the 2012-2013 influenza epidemic</article-title>
          <source>PLoS One</source>
          <year>2013</year>
          <month>12</month>
          <day>9</day>
          <volume>8</volume>
          <issue>12</issue>
          <fpage>e83672</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0083672"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0083672</pub-id>
          <pub-id pub-id-type="medline">24349542</pub-id>
          <pub-id pub-id-type="pii">PONE-D-13-35058</pub-id>
          <pub-id pub-id-type="pmcid">PMC3857320</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Santillana</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>AT</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Paul</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Nsoesie</surname>
              <given-names>EO</given-names>
            </name>
            <name name-style="western">
              <surname>Brownstein</surname>
              <given-names>JS</given-names>
            </name>
          </person-group>
          <article-title>Combining search, social media, and traditional data sources to improve influenza surveillance</article-title>
          <source>PLoS Comput Biol</source>
          <year>2015</year>
          <month>10</month>
          <volume>11</volume>
          <issue>10</issue>
          <fpage>e1004513</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pcbi.1004513"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pcbi.1004513</pub-id>
          <pub-id pub-id-type="medline">26513245</pub-id>
          <pub-id pub-id-type="pii">PCOMPBIOL-D-15-00856</pub-id>
          <pub-id pub-id-type="pmcid">PMC4626021</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kandula</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Shaman</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Subregional nowcasts of seasonal influenza using search trends</article-title>
          <source>J Med Internet Res</source>
          <year>2017</year>
          <month>11</month>
          <day>06</day>
          <volume>19</volume>
          <issue>11</issue>
          <fpage>e370</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2017/11/e370/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.7486</pub-id>
          <pub-id pub-id-type="medline">29109069</pub-id>
          <pub-id pub-id-type="pii">v19i11e370</pub-id>
          <pub-id pub-id-type="pmcid">PMC5696582</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ning</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kou</surname>
              <given-names>SC</given-names>
            </name>
          </person-group>
          <article-title>Accurate regional influenza epidemics tracking using Internet search data</article-title>
          <source>Sci Rep</source>
          <year>2019</year>
          <month>03</month>
          <day>27</day>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>5238</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-019-41559-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-019-41559-6</pub-id>
          <pub-id pub-id-type="medline">30918276</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-019-41559-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC6437143</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fung</surname>
              <given-names>IC</given-names>
            </name>
            <name name-style="western">
              <surname>Tse</surname>
              <given-names>ZT</given-names>
            </name>
            <name name-style="western">
              <surname>Cheung</surname>
              <given-names>CN</given-names>
            </name>
            <name name-style="western">
              <surname>Miu</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>KW</given-names>
            </name>
          </person-group>
          <article-title>Ebola and the social media</article-title>
          <source>Lancet</source>
          <year>2014</year>
          <month>12</month>
          <day>20</day>
          <volume>384</volume>
          <issue>9961</issue>
          <fpage>2207</fpage>
          <pub-id pub-id-type="doi">10.1016/S0140-6736(14)62418-1</pub-id>
          <pub-id pub-id-type="medline">25625391</pub-id>
          <pub-id pub-id-type="pii">S0140-6736(14)62418-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>EH</given-names>
            </name>
            <name name-style="western">
              <surname>Sahai</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Conrad</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Brownstein</surname>
              <given-names>JS</given-names>
            </name>
          </person-group>
          <article-title>Using web search query data to monitor dengue epidemics: a new model for neglected tropical disease surveillance</article-title>
          <source>PLoS Negl Trop Dis</source>
          <year>2011</year>
          <month>05</month>
          <volume>5</volume>
          <issue>5</issue>
          <fpage>e1206</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pntd.0001206"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pntd.0001206</pub-id>
          <pub-id pub-id-type="medline">21647308</pub-id>
          <pub-id pub-id-type="pii">PNTD-D-11-00327</pub-id>
          <pub-id pub-id-type="pmcid">PMC3104029</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ayyoubzadeh</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Ayyoubzadeh</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Zahedi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmadi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Niakan Kalhori</surname>
              <given-names>SR</given-names>
            </name>
          </person-group>
          <article-title>Predicting COVID-19 incidence through analysis of Google trends data in Iran: data mining and deep learning pilot study</article-title>
          <source>JMIR Public Health Surveill</source>
          <year>2020</year>
          <month>04</month>
          <day>14</day>
          <volume>6</volume>
          <issue>2</issue>
          <fpage>e18828</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://publichealth.jmir.org/2020/2/e18828/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/18828</pub-id>
          <pub-id pub-id-type="medline">32234709</pub-id>
          <pub-id pub-id-type="pii">v6i2e18828</pub-id>
          <pub-id pub-id-type="pmcid">PMC7159058</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>de Nazelle</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Seto</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Donaire-Gonzalez</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mendez</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Matamala</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nieuwenhuijsen</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Jerrett</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Improving estimates of air pollution exposure through ubiquitous sensing technologies</article-title>
          <source>Environ Pollut</source>
          <year>2013</year>
          <month>05</month>
          <volume>176</volume>
          <fpage>92</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/23416743"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.envpol.2012.12.032</pub-id>
          <pub-id pub-id-type="medline">23416743</pub-id>
          <pub-id pub-id-type="pii">S0269-7491(13)00008-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC3600144</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devarakonda</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sevusu</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Iftode</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Nath</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Real-time air quality monitoring through mobile sensing in metropolitan areas</article-title>
          <source>Proceedings of the 2nd ACM SIGKDD International Workshop on Urban Computing</source>
          <year>2013</year>
          <month>08</month>
          <conf-name>UrbComp '13</conf-name>
          <conf-date>August 11, 2013</conf-date>
          <conf-loc>Chicago, IL, USA</conf-loc>
          <fpage>1</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.1145/2505821.2505834</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Snik</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Rietjens</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Apituley</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Volten</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Mijling</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Di Noia</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Heikamp</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Heinsbroek</surname>
              <given-names>RC</given-names>
            </name>
            <name name-style="western">
              <surname>Hasekamp</surname>
              <given-names>OP</given-names>
            </name>
            <name name-style="western">
              <surname>Smit</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Vonk</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Stam</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>van Harten</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>de Boer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Keller</surname>
              <given-names>CU</given-names>
            </name>
          </person-group>
          <article-title>Mapping atmospheric aerosols with a citizen science network of smartphone spectropolarimeters</article-title>
          <source>Geophys Res Lett</source>
          <year>2014</year>
          <month>10</month>
          <day>27</day>
          <volume>41</volume>
          <issue>20</issue>
          <fpage>7351</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.1002/2014gl061462</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Brauer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Burnett</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Anderson</surname>
              <given-names>HR</given-names>
            </name>
            <name name-style="western">
              <surname>Frostad</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Estep</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Balakrishnan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Brunekreef</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dandona</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Dandona</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Feigin</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Freedman</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Hubbell</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Jobling</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kan</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Knibbs</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Morawska</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Pope 3rd</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Straif</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Shaddick</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>van Dingenen</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>van Donkelaar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Vos</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Murray</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Forouzanfar</surname>
              <given-names>MH</given-names>
            </name>
          </person-group>
          <article-title>Estimates and 25-year trends of the global burden of disease attributable to ambient air pollution: an analysis of data from the Global Burden of Diseases Study 2015</article-title>
          <source>Lancet</source>
          <year>2017</year>
          <month>05</month>
          <day>13</day>
          <volume>389</volume>
          <issue>10082</issue>
          <fpage>1907</fpage>
          <lpage>18</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S0140-6736(17)30505-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/S0140-6736(17)30505-6</pub-id>
          <pub-id pub-id-type="medline">28408086</pub-id>
          <pub-id pub-id-type="pii">S0140-6736(17)30505-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC5439030</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zeger</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Dominici</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Samet</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Schwartz</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dockery</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Exposure measurement error in time-series studies of air pollution: concepts and consequences</article-title>
          <source>Environ Health Perspect</source>
          <year>2000</year>
          <month>05</month>
          <volume>108</volume>
          <issue>5</issue>
          <fpage>419</fpage>
          <lpage>26</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ehp.niehs.nih.gov/doi/10.1289/ehp.00108419?url_ver=Z39.88-2003&amp;rfr_id=ori:rid:crossref.org&amp;rfr_dat=cr_pub%3dpubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1289/ehp.00108419</pub-id>
          <pub-id pub-id-type="medline">10811568</pub-id>
          <pub-id pub-id-type="pii">sc271_5_1835</pub-id>
          <pub-id pub-id-type="pmcid">PMC1638034</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sarnat</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Sarnat</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Mulholland</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Isakov</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Özkaynak</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>HH</given-names>
            </name>
            <name name-style="western">
              <surname>Klein</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tolbert</surname>
              <given-names>PE</given-names>
            </name>
          </person-group>
          <article-title>Application of alternative spatiotemporal metrics of ambient air pollution exposure in a time-series epidemiological study in Atlanta</article-title>
          <source>J Expo Sci Environ Epidemiol</source>
          <year>2013</year>
          <volume>23</volume>
          <issue>6</issue>
          <fpage>593</fpage>
          <lpage>605</lpage>
          <pub-id pub-id-type="doi">10.1038/jes.2013.41</pub-id>
          <pub-id pub-id-type="medline">23963512</pub-id>
          <pub-id pub-id-type="pii">jes201341</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Golan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Moutinho</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>HH</given-names>
            </name>
            <name name-style="western">
              <surname>Greenwald</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sarnat</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Russell</surname>
              <given-names>AG</given-names>
            </name>
            <name name-style="western">
              <surname>Sarnat</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Errors associated with the use of roadside monitoring in the estimation of acute traffic pollutant-related health effects</article-title>
          <source>Environ Res</source>
          <year>2018</year>
          <month>08</month>
          <volume>165</volume>
          <fpage>210</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1016/j.envres.2018.04.013</pub-id>
          <pub-id pub-id-type="medline">29727821</pub-id>
          <pub-id pub-id-type="pii">S0013-9351(18)30212-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lampos</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Cox</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Transfer learning for unsupervised influenza-like illness models from online search data</article-title>
          <source>Proceedings of the 2019 World Wide Web Conference</source>
          <year>2019</year>
          <conf-name>WWW '19</conf-name>
          <conf-date>May 13-17, 2019</conf-date>
          <conf-loc>San Francisco, CA, USA</conf-loc>
          <fpage>2505</fpage>
          <lpage>16</lpage>
          <pub-id pub-id-type="doi">10.1145/3308558.3313477</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lampos</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Crossan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Stefansen</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Advances in nowcasting influenza-like illness rates using search query logs</article-title>
          <source>Sci Rep</source>
          <year>2015</year>
          <month>08</month>
          <day>03</day>
          <volume>5</volume>
          <fpage>12760</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/srep12760"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/srep12760</pub-id>
          <pub-id pub-id-type="medline">26234783</pub-id>
          <pub-id pub-id-type="pii">srep12760</pub-id>
          <pub-id pub-id-type="pmcid">PMC4522652</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Graves</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jaitly</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Towards end-to-end speech recognition with recurrent neural networks</article-title>
          <source>Proceedings of the 31st International Conference on Machine Learning</source>
          <year>2014</year>
          <conf-name>ICML '14</conf-name>
          <conf-date>June 21-26, 2014</conf-date>
          <conf-loc>Beijing, China</conf-loc>
          <fpage>II-1764</fpage>
          <lpage>72</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Distributed representations of words and phrases and their compositionality</article-title>
          <source>Proceedings of the 26th International Conference on Neural Information Processing Systems</source>
          <year>2013</year>
          <conf-name>NeurIPS '13</conf-name>
          <conf-date>December 5-10, 2013</conf-date>
          <conf-loc>Lake Tahoe, NV, USA</conf-loc>
          <fpage>3111</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.4324/9780203776506-14</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pennington</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>GloVe: global vectors for word representation</article-title>
          <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2014</year>
          <conf-name>EMNLP '14</conf-name>
          <conf-date>October 25-29, 2014</conf-date>
          <conf-loc>Doha, Qatar</conf-loc>
          <fpage>1532</fpage>
          <lpage>43</lpage>
          <pub-id pub-id-type="doi">10.3115/v1/d14-1162</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pilotto</surname>
              <given-names>LS</given-names>
            </name>
            <name name-style="western">
              <surname>Douglas</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Attewell</surname>
              <given-names>RG</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>SR</given-names>
            </name>
          </person-group>
          <article-title>Respiratory effects associated with indoor nitrogen dioxide exposure in children</article-title>
          <source>Int J Epidemiol</source>
          <year>1997</year>
          <month>08</month>
          <volume>26</volume>
          <issue>4</issue>
          <fpage>788</fpage>
          <lpage>96</lpage>
          <pub-id pub-id-type="doi">10.1093/ije/26.4.788</pub-id>
          <pub-id pub-id-type="medline">9279611</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chauhan</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Inskip</surname>
              <given-names>HM</given-names>
            </name>
            <name name-style="western">
              <surname>Linaker</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schreiber</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Johnston</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Holgate</surname>
              <given-names>ST</given-names>
            </name>
          </person-group>
          <article-title>Personal exposure to nitrogen dioxide (NO<sub>2</sub>) and the severity of virus-induced asthma in children</article-title>
          <source>Lancet</source>
          <year>2003</year>
          <month>06</month>
          <day>07</day>
          <volume>361</volume>
          <issue>9373</issue>
          <fpage>1939</fpage>
          <lpage>44</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/12801737"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/s0140-6736(03)13582-9</pub-id>
          <pub-id pub-id-type="medline">12801737</pub-id>
          <pub-id pub-id-type="pii">S0140673603135829</pub-id>
          <pub-id pub-id-type="pmcid">PMC7112409</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rybarczyk</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zalakeviciute</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Machine learning approaches for outdoor air quality modelling: a systematic review</article-title>
          <source>Appl Sci</source>
          <year>2018</year>
          <month>12</month>
          <day>11</day>
          <volume>8</volume>
          <issue>12</issue>
          <fpage>2570</fpage>
          <pub-id pub-id-type="doi">10.3390/app8122570</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Di</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zanobetti</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Koutrakis</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Choirat</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Dominici</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Schwartz</surname>
              <given-names>JD</given-names>
            </name>
          </person-group>
          <article-title>Air pollution and mortality in the Medicare population</article-title>
          <source>N Engl J Med</source>
          <year>2017</year>
          <month>06</month>
          <day>29</day>
          <volume>376</volume>
          <issue>26</issue>
          <fpage>2513</fpage>
          <lpage>22</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/28657878"/>
          </comment>
          <pub-id pub-id-type="doi">10.1056/NEJMoa1702747</pub-id>
          <pub-id pub-id-type="medline">28657878</pub-id>
          <pub-id pub-id-type="pmcid">PMC5766848</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vedal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Brauer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>White</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Petkau</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Air pollution and daily mortality in a city with low levels of pollution</article-title>
          <source>Environ Health Perspect</source>
          <year>2003</year>
          <month>01</month>
          <volume>111</volume>
          <issue>1</issue>
          <fpage>45</fpage>
          <lpage>52</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ehp.niehs.nih.gov/doi/10.1289/ehp.5276?url_ver=Z39.88-2003&amp;rfr_id=ori:rid:crossref.org&amp;rfr_dat=cr_pub%3dpubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1289/ehp.5276</pub-id>
          <pub-id pub-id-type="medline">12515678</pub-id>
          <pub-id pub-id-type="pmcid">PMC1241305</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sarnat</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Sarnat</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Flanders</surname>
              <given-names>WD</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>HH</given-names>
            </name>
            <name name-style="western">
              <surname>Mulholland</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Baxter</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Isakov</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Özkaynak</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Spatiotemporally resolved air exchange rate as a modifier of acute air pollution-related morbidity in Atlanta</article-title>
          <source>J Expo Sci Environ Epidemiol</source>
          <year>2013</year>
          <volume>23</volume>
          <issue>6</issue>
          <fpage>606</fpage>
          <lpage>15</lpage>
          <pub-id pub-id-type="doi">10.1038/jes.2013.32</pub-id>
          <pub-id pub-id-type="medline">23778234</pub-id>
          <pub-id pub-id-type="pii">jes201332</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kelly</surname>
              <given-names>FJ</given-names>
            </name>
            <name name-style="western">
              <surname>Fussell</surname>
              <given-names>JC</given-names>
            </name>
          </person-group>
          <article-title>Air pollution and public health: emerging hazards and improved understanding of risk</article-title>
          <source>Environ Geochem Health</source>
          <year>2015</year>
          <month>08</month>
          <volume>37</volume>
          <issue>4</issue>
          <fpage>631</fpage>
          <lpage>49</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/26040976"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s10653-015-9720-1</pub-id>
          <pub-id pub-id-type="medline">26040976</pub-id>
          <pub-id pub-id-type="pmcid">PMC4516868</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sarnat</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Russell</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Moutinho</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Golan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Weber</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sarnat</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>HH</given-names>
            </name>
            <name name-style="western">
              <surname>Greenwald</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Developing multipollutant exposure indicators of traffic pollution: the dorm room inhalation to vehicle emissions (DRIVE) study</article-title>
          <source>Res Rep Health Eff Inst</source>
          <year>2018</year>
          <month>04</month>
          <issue>196</issue>
          <fpage>3</fpage>
          <lpage>75</lpage>
          <pub-id pub-id-type="medline">31872750</pub-id>
          <pub-id pub-id-type="pii">Res Rep Health Eff Inst. 2018 Apr;(196):3-75</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="web">
          <article-title>Google Trends</article-title>
          <source>Google</source>
          <access-date>2019-08-31</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://support.google.com/trends/answer/4365533?hl=en">https://support.google.com/trends/answer/4365533?hl=en</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Challet</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bel Hadj Ayed</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Do Google Trend Data Contain More Predictability than Price Returns?</article-title>
          <source>SSRN J</source>
          <year>2014</year>
          <month>3</month>
          <day>7</day>
          <pub-id pub-id-type="doi">10.2139/ssrn.2405804</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kreindler</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Lumsden</surname>
              <given-names>CJ</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Guastello</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Gregson</surname>
              <given-names>RA</given-names>
            </name>
          </person-group>
          <article-title>The effects of the irregular sample and missing data in time series analysis</article-title>
          <source>Nonlinear Dynamical Systems Analysis for the Behavioral Sciences Using Real Data</source>
          <year>2006</year>
          <publisher-loc>Boca Raton, FL, USA</publisher-loc>
          <publisher-name>CRC Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="web">
          <article-title>Google Correlate</article-title>
          <source>Google</source>
          <access-date>2019-08-31</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://searchengineland.com/google-correlate-more-search-data-to-mine-78560">https://searchengineland.com/google-correlate-more-search-data-to-mine-78560</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kane</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Price</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Scotch</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rabinowitz</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Comparison of ARIMA and Random Forest time series models for prediction of avian influenza H5N1 outbreaks</article-title>
          <source>BMC Bioinformatics</source>
          <year>2014</year>
          <month>08</month>
          <day>13</day>
          <volume>15</volume>
          <issue>1</issue>
          <fpage>276</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-276"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2105-15-276</pub-id>
          <pub-id pub-id-type="medline">25123979</pub-id>
          <pub-id pub-id-type="pii">1471-2105-15-276</pub-id>
          <pub-id pub-id-type="pmcid">PMC4152592</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hochreiter</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidhuber</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Long short-term memory</article-title>
          <source>Neural Comput</source>
          <year>1997</year>
          <month>11</month>
          <day>15</day>
          <volume>9</volume>
          <issue>8</issue>
          <fpage>1735</fpage>
          <lpage>80</lpage>
          <pub-id pub-id-type="doi">10.1162/neco.1997.9.8.1735</pub-id>
          <pub-id pub-id-type="medline">9377276</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Elman</surname>
              <given-names>JL</given-names>
            </name>
          </person-group>
          <article-title>Distributed representations, simple recurrent networks, and grammatical structure</article-title>
          <source>Mach Learn</source>
          <year>1991</year>
          <month>9</month>
          <volume>7</volume>
          <issue>2-3</issue>
          <fpage>195</fpage>
          <lpage>225</lpage>
          <pub-id pub-id-type="doi">10.1007/bf00114844</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Delving deep into rectifiers: surpassing human-level performance on ImageNet classification</article-title>
          <source>Proceedings of the 2015 IEEE International Conference on Computer Vision</source>
          <year>2015</year>
          <conf-name>ICCV '15</conf-name>
          <conf-date>December 7-13, 2015</conf-date>
          <conf-loc>Santiago, Chile</conf-loc>
          <fpage>1026</fpage>
          <lpage>34</lpage>
          <pub-id pub-id-type="doi">10.1109/iccv.2015.123</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="web">
          <source>CGAP Project-Nowcasting Air Pollution</source>
          <access-date>2021-12-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/emory-irlab/airpollutionnowcast">https://github.com/emory-irlab/airpollutionnowcast</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Carrière-Swallow</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Labbé</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Nowcasting with Google Trends in an emerging market</article-title>
          <source>J Forecast</source>
          <year>2013</year>
          <month>7</month>
          <volume>32</volume>
          <issue>4</issue>
          <fpage>289</fpage>
          <lpage>98</lpage>
          <pub-id pub-id-type="doi">10.1002/for.1252</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kan</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hong</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Investigating China’s urban air quality using big data, information theory, and machine learning</article-title>
          <source>Pol J Environ Stud</source>
          <year>2018</year>
          <volume>27</volume>
          <issue>2</issue>
          <fpage>565</fpage>
          <lpage>78</lpage>
          <pub-id pub-id-type="doi">10.15244/pjoes/75159</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>PC</given-names>
            </name>
          </person-group>
          <article-title>A deep recurrent neural network for air quality classification</article-title>
          <source>J Inf Hiding Multimed Signal Process</source>
          <year>2018</year>
          <month>3</month>
          <volume>9</volume>
          <issue>2</issue>
          <fpage>346</fpage>
          <lpage>54</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mago</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chiang</surname>
              <given-names>YY</given-names>
            </name>
            <name name-style="western">
              <surname>Shahabi</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ambite</surname>
              <given-names>JL</given-names>
            </name>
          </person-group>
          <article-title>Exploiting spatiotemporal patterns for accurate air quality forecasting using deep learning</article-title>
          <source>Proceedings of the 26th ACM SIGSPATIAL International Conference on Advances in Geographic Information Systems</source>
          <year>2018</year>
          <conf-name>SIGSPATIAL '18</conf-name>
          <conf-date>November 6-9, 2018</conf-date>
          <conf-loc>Seattle, WA, USA</conf-loc>
          <fpage>359</fpage>
          <lpage>68</lpage>
          <pub-id pub-id-type="doi">10.1145/3274895.3274907</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Short-term effects of air pollution on lower respiratory diseases and forecasting by the group method of data handling</article-title>
          <source>Atmos Environ</source>
          <year>2012</year>
          <month>05</month>
          <volume>51</volume>
          <fpage>29</fpage>
          <lpage>38</lpage>
          <pub-id pub-id-type="doi">10.1016/j.atmosenv.2012.01.051</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Bocquet</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mallet</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Seigneur</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Baklanov</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Real-time air quality forecasting, part I: history, techniques, and current status</article-title>
          <source>Atmos Environ</source>
          <year>2012</year>
          <month>12</month>
          <volume>60</volume>
          <fpage>632</fpage>
          <lpage>55</lpage>
          <pub-id pub-id-type="doi">10.1016/j.atmosenv.2012.06.031</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brokamp</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jandarov</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rao</surname>
              <given-names>MB</given-names>
            </name>
            <name name-style="western">
              <surname>LeMasters</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ryan</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Exposure assessment models for elemental components of particulate matter in an urban environment: a comparison of regression and random forest approaches</article-title>
          <source>Atmos Environ (1994)</source>
          <year>2017</year>
          <month>03</month>
          <volume>151</volume>
          <fpage>1</fpage>
          <lpage>11</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/28959135"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.atmosenv.2016.11.066</pub-id>
          <pub-id pub-id-type="medline">28959135</pub-id>
          <pub-id pub-id-type="pmcid">PMC5611888</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cabaneros</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Calautit</surname>
              <given-names>JK</given-names>
            </name>
            <name name-style="western">
              <surname>Hughes</surname>
              <given-names>BR</given-names>
            </name>
          </person-group>
          <article-title>A review of artificial neural network models for ambient air pollution prediction</article-title>
          <source>Environ Model Softw</source>
          <year>2019</year>
          <month>09</month>
          <volume>119</volume>
          <fpage>285</fpage>
          <lpage>304</lpage>
          <pub-id pub-id-type="doi">10.1016/j.envsoft.2019.06.014</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Misra</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Takeuchi</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Assessing population sensitivity to urban air pollution using Google Trends and remote sensing datasets</article-title>
          <source>Int Arch Photogramm Remote Sens Spatial Inf Sci</source>
          <year>2020</year>
          <month>02</month>
          <day>14</day>
          <volume>XLII-3/W11</volume>
          <fpage>93</fpage>
          <lpage>100</lpage>
          <pub-id pub-id-type="doi">10.5194/isprs-archives-xlii-3-w11-93-2020</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jun</surname>
              <given-names>SP</given-names>
            </name>
            <name name-style="western">
              <surname>Yoo</surname>
              <given-names>HS</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Ten years of research change using Google Trends: from the perspective of big data utilizations and applications</article-title>
          <source>Technol Forecast Soc Change</source>
          <year>2018</year>
          <month>05</month>
          <volume>130</volume>
          <fpage>69</fpage>
          <lpage>87</lpage>
          <pub-id pub-id-type="doi">10.1016/j.techfore.2017.11.009</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Santillana</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kou</surname>
              <given-names>SC</given-names>
            </name>
          </person-group>
          <article-title>Accurate estimation of influenza epidemics using Google search data via ARGO</article-title>
          <source>Proc Natl Acad Sci U S A</source>
          <year>2015</year>
          <month>11</month>
          <day>24</day>
          <volume>112</volume>
          <issue>47</issue>
          <fpage>14473</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/26553980"/>
          </comment>
          <pub-id pub-id-type="doi">10.1073/pnas.1515373112</pub-id>
          <pub-id pub-id-type="medline">26553980</pub-id>
          <pub-id pub-id-type="pii">1515373112</pub-id>
          <pub-id pub-id-type="pmcid">PMC4664296</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lampos</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Cristianini</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Tracking the flu pandemic by monitoring the social web</article-title>
          <source>Proceedings of the 2nd International Workshop on Cognitive Information Processing</source>
          <year>2010</year>
          <conf-name>CIP '10</conf-name>
          <conf-date>June 14-16, 2010</conf-date>
          <conf-loc>Elba, Italy</conf-loc>
          <fpage>411</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1109/cip.2010.5604088</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lampos</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>De Bie</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Cristianini</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Flu detector - tracking epidemics on Twitter</article-title>
          <source>Proceedings of the European Conference on Machine Learning and Knowledge Discovery in Databases</source>
          <year>2010</year>
          <conf-name>ECML PKDD '10</conf-name>
          <conf-date>September 20-24, 2010</conf-date>
          <conf-loc>Barcelona, Spain</conf-loc>
          <fpage>599</fpage>
          <lpage>602</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-642-15939-8_42</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lampos</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Preoţiuc-Pietro</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cohn</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>A user-centric model of voting intention from Social Media</article-title>
          <source>Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics</source>
          <year>2013</year>
          <conf-name>ACL '13</conf-name>
          <conf-date>August 4-9, 2013</conf-date>
          <conf-loc>Sofia, Bulgaria</conf-loc>
          <fpage>993</fpage>
          <lpage>1003</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Hastie</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Regularization and variable selection via the elastic net</article-title>
          <source>J Royal Statistical Soc B</source>
          <year>2005</year>
          <month>04</month>
          <volume>67</volume>
          <issue>2</issue>
          <fpage>301</fpage>
          <lpage>20</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1467-9868.2005.00503.x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rangwala</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ning</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Cola-GNN: cross-location attention based graph neural networks for long-term ILI prediction</article-title>
          <source>Proceedings of the 29th ACM International Conference on Information &amp; Knowledge Management</source>
          <year>2020</year>
          <month>10</month>
          <day>19</day>
          <conf-name>CIKM '20</conf-name>
          <conf-date>October 19-23, 2020</conf-date>
          <conf-loc>Virtual</conf-loc>
          <fpage>245</fpage>
          <lpage>254</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lampos</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Cox</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Multi-task learning improves disease models from web search</article-title>
          <source>Proceedings of the 2018 World Wide Web Conference</source>
          <year>2018</year>
          <conf-name>WWW '18</conf-name>
          <conf-date>April 23-27, 2018</conf-date>
          <conf-loc>Lyon, France</conf-loc>
          <fpage>87</fpage>
          <lpage>96</lpage>
          <pub-id pub-id-type="doi">10.1145/3178876.3186050</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yakob</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bonsall</surname>
              <given-names>MB</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Predicting seasonal influenza epidemics using cross-hemisphere influenza surveillance data and local internet query data</article-title>
          <source>Sci Rep</source>
          <year>2019</year>
          <month>03</month>
          <day>01</day>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>3262</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-019-39871-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-019-39871-2</pub-id>
          <pub-id pub-id-type="medline">30824756</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-019-39871-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC6397245</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
