<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v4i12e24490</article-id>
      <article-id pub-id-type="pmid">33331823</article-id>
      <article-id pub-id-type="doi">10.2196/24490</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Automated Categorization of Systemic Disease and Duration From Electronic Medical Record System Data Using Finite-State Machine Modeling: Prospective Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Loher</surname>
            <given-names>Phillipe</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Rusu</surname>
            <given-names>Lucia</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Sai Prashanthi</surname>
            <given-names>Gumpili</given-names>
          </name>
          <degrees>MTech</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0870-5596</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Deva</surname>
            <given-names>Ayush</given-names>
          </name>
          <degrees>BTech</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9645-6157</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Vadapalli</surname>
            <given-names>Ranganath</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1586-4376</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Das</surname>
            <given-names>Anthony Vipin</given-names>
          </name>
          <degrees>FRCS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of eyeSmart EMR &#38; AEye</institution>
            <institution>LV Prasad Eye Institute</institution>
            <addr-line>Kallam Anji Reddy Campus, L V Prasad Marg</addr-line>
            <addr-line>Hyderabad, Telangana</addr-line>
            <country>India</country>
            <phone>91 9885071960</phone>
            <email>vipin@lvpei.org</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9692-2621</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of eyeSmart EMR &#38; AEye</institution>
        <institution>LV Prasad Eye Institute</institution>
        <addr-line>Hyderabad, Telangana</addr-line>
        <country>India</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>International Institute of Information Technology</institution>
        <addr-line>Hyderabad, Telangana</addr-line>
        <country>India</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Anthony Vipin Das <email>vipin@lvpei.org</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>12</month>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>17</day>
        <month>12</month>
        <year>2020</year>
      </pub-date>
      <volume>4</volume>
      <issue>12</issue>
      <elocation-id>e24490</elocation-id>
      <history>
        <date date-type="received">
          <day>22</day>
          <month>9</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>7</day>
          <month>11</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>12</day>
          <month>11</month>
          <year>2020</year>
        </date>
        <date date-type="accepted">
          <day>17</day>
          <month>11</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Gumpili Sai Prashanthi, Ayush Deva, Ranganath Vadapalli, Anthony Vipin Das. Originally published in JMIR Formative Research (http://formative.jmir.org), 17.12.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on http://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://formative.jmir.org/2020/12/e24490/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>One of the major challenges in the health care sector is that approximately 80% of generated data remains unstructured and unused. Since it is difficult to handle unstructured data from electronic medical record systems, it tends to be neglected for analyses in most hospitals and medical centers. Therefore, there is a need to analyze unstructured big data in health care systems so that we can optimally utilize and unearth all unexploited information from it.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>In this study, we aimed to extract a list of diseases and associated keywords along with the corresponding time durations from an indigenously developed electronic medical record system and describe the possibility of analytics from the acquired datasets.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We propose a novel, finite-state machine to sequentially detect and cluster disease names from patients’ medical history. We defined 3 states in the finite-state machine and transition matrix, which depend on the identified keyword. In addition, we also defined a state-change action matrix, which is essentially an action associated with each transition. The dataset used in this study was obtained from an indigenously developed electronic medical record system called eyeSmart that was implemented across a large, multitier ophthalmology network in India. The dataset included patients’ past medical history and contained records of 10,000 distinct patients.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>We extracted disease names and associated keywords by using the finite-state machine with an accuracy of 95%, sensitivity of 94.9%, and positive predictive value of 100%. For the extraction of the duration of disease, the machine’s accuracy was 93%, sensitivity was 92.9%, and the positive predictive value was 100%.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>We demonstrated that the finite-state machine we developed in this study can be used to accurately identify disease names, associated keywords, and time durations from a large cohort of patient records obtained using an electronic medical record system.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>electronic health records</kwd>
        <kwd>data analysis</kwd>
        <kwd>machine learning</kwd>
        <kwd>algorithms</kwd>
        <kwd>ophthalmology</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Electronic medical record (EMR) systems have been increasingly replacing paper-based records; using these systems has advantages such as increased efficiency and standardized quality, thereby enabling accurate clinical documentation [<xref ref-type="bibr" rid="ref1">1</xref>]. Research that is dependent on reviewing paper records is not only cumbersome but also prone to human errors. The amount of time taken to retrieve and analyze large volumes of data from EMR systems is minimal compared to the manual process. Moreover, the adoption of EMR systems has led to the availability of diverse sources of clinical information, such as demographic data, history of diagnosis, prescriptions, and laboratory test results, which have established EMR systems as a treasure trove for large-scale analysis of health data. As a result, to obtain meaningful insights, there is a need to extract useful information and patterns from the rapidly growing volumes of data.</p>
      <p>In general, 3 types of data are available as EMRs: structured, semistructured, and unstructured data [<xref ref-type="bibr" rid="ref2">2</xref>]. Fixed-mode databases contain basic information and are usually used to store structured data. Unstructured data includes reports; records regarding surgery, medical history, and discharge; and clinical notes. One of the major challenges in the health care sector is that approximately 80% of the data remains unstructured and unused after it has been generated [<xref ref-type="bibr" rid="ref3">3</xref>]. Since it is difficult to handle this sort of unstructured data obtained from EMRs, it tends to be neglected for analysis in most hospitals or medical centers [<xref ref-type="bibr" rid="ref4">4</xref>]. Therefore, there is a need to analyze unstructured big data in health care systems so that we can optimally utilize the data and unearth all possible unexploited information from it.</p>
      <p>The aim of this study was to extract a list of mentioned diseases and associated keywords, along with time durations, from the indigenously developed EMR system eyeSmart, which has been implemented across a large multitier ophthalmology network in India. We also aimed to describe the possibility of analytics from the datasets thus acquired.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Extraction</title>
        <p>We retrieved systemic disease information of a subset of patients who presented to a large multitier ophthalmology network in India between August 2010 and December 2019 by using the eyeSmart EMR system [<xref ref-type="bibr" rid="ref1">1</xref>]. The dataset analyzed included the past medical history of patients and contained 10,000 records of distinct patients. From the given plaintext data about the medical history of the patients, we retrieved the names of systemic disease(s) from a fixed set of known disease names (<xref ref-type="boxed-text" rid="box1">Textbox 1</xref>) documented in the patients’ past medical history column and the duration of the disease, using Python and the techniques mentioned below.</p>
        <boxed-text id="box1" position="float">
          <title>Search terms used to retrieve disease names and other associated keywords from the dataset.</title>
          <p>
            <bold>Disease names (keyword) and their associated keywords:</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>Diabetes mellitus</p>
              <list>
                <list-item>
                  <p>DM</p>
                </list-item>
                <list-item>
                  <p>Insulin</p>
                </list-item>
                <list-item>
                  <p>FBS</p>
                </list-item>
                <list-item>
                  <p>PPBS</p>
                </list-item>
                <list-item>
                  <p>IDDM</p>
                </list-item>
              </list>
              <p/>
            </list-item>
            <list-item>
              <p>Hypertension</p>
              <list>
                <list-item>
                  <p>HTN</p>
                </list-item>
              </list>
              <p/>
            </list-item>
            <list-item>
              <p>Asthma</p>
            </list-item>
            <list-item>
              <p>Acid peptic disease</p>
              <list>
                <list-item>
                  <p>Gastric</p>
                </list-item>
                <list-item>
                  <p>Ulcer</p>
                </list-item>
              </list>
              <p/>
            </list-item>
            <list-item>
              <p>Hypothyroidism</p>
            </list-item>
            <list-item>
              <p>Hyperthyroidism</p>
            </list-item>
            <list-item>
              <p>Rheumatoid arthritis</p>
              <list>
                <list-item>
                  <p>RA</p>
                </list-item>
              </list>
              <p/>
            </list-item>
            <list-item>
              <p>Allergy</p>
            </list-item>
            <list-item>
              <p>Tuberculosis</p>
            </list-item>
            <list-item>
              <p>Sinusitis</p>
            </list-item>
            <list-item>
              <p>Arthritis</p>
              <list>
                <list-item>
                  <p>Joint pain</p>
                </list-item>
              </list>
              <p/>
            </list-item>
            <list-item>
              <p>Coronary artery disease</p>
              <list>
                <list-item>
                  <p>CAD</p>
                </list-item>
              </list>
              <p/>
            </list-item>
            <list-item>
              <p>Cholesterol</p>
            </list-item>
            <list-item>
              <p>Migraine</p>
            </list-item>
            <list-item>
              <p>Cancer</p>
            </list-item>
            <list-item>
              <p>Paralysis</p>
            </list-item>
            <list-item>
              <p>Spondylitis</p>
            </list-item>
            <list-item>
              <p>Hepatitis</p>
            </list-item>
            <list-item>
              <p>Epilepsy</p>
              <list>
                <list-item>
                  <p>Fits</p>
                </list-item>
                <list-item>
                  <p>Seizures</p>
                </list-item>
              </list>
              <p/>
            </list-item>
            <list-item>
              <p>Malaria</p>
            </list-item>
          </list>
        </boxed-text>
      </sec>
      <sec>
        <title>Data Availability</title>
        <p>The dataset analyzed during the current study is not publicly available as it contains confidential patient information, but it can be made available from the corresponding author on reasonable request.</p>
      </sec>
      <sec>
        <title>Ethical Approval</title>
        <p>This study was approved by the Institutional Review Board of LV Prasad Eye Institute, Hyderabad (Ethics Ref. No. LEC BHR-R-09-20-497), and all procedures were in accordance with the tenets of the Declaration of Helsinki. All data were fully anonymized prior to access by the study group.</p>
      </sec>
      <sec>
        <title>Assumptions About the Data</title>
        <p>We made the following assumptions about the data in order to set some baselines for the information retrieval task:</p>
        <list list-type="order">
          <list-item>
            <p>The names of systemic diseases were spelled correctly.</p>
          </list-item>
          <list-item>
            <p>Duration of the disease (if it exists) always followed the name of the disease and did not precede the documented disease.</p>
          </list-item>
          <list-item>
            <p>If a disease name (D1) was followed by another disease name (D2) without any duration tag in between, then the duration for D1 was assumed to be missing and the next duration tag encountered would be associated with D2.</p>
          </list-item>
        </list>
        <p>Given the unstructured plaintext data about systemic diseases and their durations, the following 2 steps were used to extract useful information and convert it into a structured data format.</p>
        <sec>
          <title>Tag Identification</title>
          <p>This involved the identification of disease names and duration tags in the extracted data. The task was to identify the presence of one or more of the enlisted diseases (<xref ref-type="boxed-text" rid="box1">Textbox 1</xref>) in the plaintext data. Since we assumed that the disease names were spelled correctly, we used string matching in Python to check if any disease names were present. Similarly, to identify the duration tags, we used regular expressions in Python to identify both (1) the value of duration (ie, a number) and (2) the unit of duration (ie, day, week, month, or year).</p>
        </sec>
        <sec>
          <title>Clustering</title>
          <p>This step involved correctly clustering the information, that is, finding and establishing the relations between different tags (in this case, duration and disease name).</p>
          <p>Specifically, once the duration tags were identified, it was important to associate the correct duration with the corresponding disease, which was a challenging part.</p>
          <p>Therefore, we propose a novel, finite-state machine (FSM) to sequentially detect and cluster disease name(s) from the patient’s medical history records.</p>
          <p>We defined 3 states in our FSM and the transition matrix that depends on the identified tag. In addition, we also defined a state-change action matrix, which is essentially an action associated with each transition. These are explained in detail below and illustrated in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>Flowchart depicting the 3 states of the finite-state machine modeling.</p>
            </caption>
            <graphic xlink:href="formative_v4i12e24490_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
      </sec>
      <sec>
        <title>State Definition</title>
        <sec>
          <title>State 0</title>
          <p>This is the starting state. It has no unrecorded information. In this state, previous disease is “null” (prev_disease = NULL).</p>
        </sec>
        <sec>
          <title>State 1</title>
          <p>This is the state in which a disease has been found previously and stored in prev_disease but has not yet been recorded. Here, we expect to find a time-matching regular expression to complete the record for that disease.</p>
        </sec>
        <sec>
          <title>State 2</title>
          <p>This is the state that is reached after encountering a “NO” string from State 0. A “NO” string indicates absence of the particular disease that follows the word “NO.” This means that the next sentence is about a disease that is not present and should not be included in the list. The state-change matrix and state-change action matrix are presented in <xref ref-type="table" rid="table1">Tables 1</xref> and <xref ref-type="table" rid="table2">2</xref>, respectively.</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>State-change matrix.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="190"/>
              <col width="270"/>
              <col width="270"/>
              <col width="270"/>
              <thead>
                <tr valign="top">
                  <td>Disease state</td>
                  <td colspan="3">Input string</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>“NO”</td>
                  <td>Disease</td>
                  <td>Time</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>0</td>
                  <td>2</td>
                  <td>1</td>
                  <td>Error</td>
                </tr>
                <tr valign="top">
                  <td>1</td>
                  <td>Error</td>
                  <td>1</td>
                  <td>0</td>
                </tr>
                <tr valign="top">
                  <td>2</td>
                  <td>Error</td>
                  <td>0</td>
                  <td>Error</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>State-change action matrix.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="70"/>
              <col width="210"/>
              <col width="360"/>
              <col width="360"/>
              <thead>
                <tr valign="top">
                  <td>State</td>
                  <td colspan="3">Input string</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>“NO”</td>
                  <td>Disease</td>
                  <td>Time</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>0</td>
                  <td>N/A<sup>a</sup></td>
                  <td>1. Print(Found a disease. Hoping for it to be followed by a time since when disease existed.)<break/>2. Update prev_disease = &#60;disease&#62;.</td>
                  <td>Print(Warning: Found a time unit before finding a disease)</td>
                </tr>
                <tr valign="top">
                  <td>1</td>
                  <td>Print(Disease Name followed by NO. Not according to how it should be)</td>
                  <td>1. Record &#60;prev_disease, No time&#62;.<break/>2. Print(Warning: prev_disease was not followed by a time.)<break/>3. Update prev_disease = &#60;disease&#62;</td>
                  <td>1. Record &#60;prev_disease, &#60;time&#62;&#62;.<break/>2. Print(Successfully detected disease and time since when).<break/>3. Update prev_disease = NULL.</td>
                </tr>
                <tr valign="top">
                  <td>2</td>
                  <td>Print(Error: Found two consecutive NOs)</td>
                  <td>Print(Detected a “NO Disease” statement. Ignoring and not recording.)</td>
                  <td>Print(Found a time unit after NO. Something wrong)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table2fn1">
                <p><sup>a</sup>N/A: not applicable.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
      </sec>
      <sec>
        <title>FSM Output Measurements</title>
        <p>We measured sensitivity, positive predictive value (PPV), and accuracy of the FSM to identify the disease name and associated keywords as well as the associated duration.</p>
        <graphic xlink:href="formative_v4i12e24490_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <graphic xlink:href="formative_v4i12e24490_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <graphic xlink:href="formative_v4i12e24490_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>We evaluated the information extraction accuracy of the FSM by comparing the results to those of an expert human gold standard. The human had good knowledge about the medical terms used and their diagnoses. The report was further crosschecked by another person to minimize human error. In all, 100 records were randomly sampled and manually annotated and compared to the output of the algorithm. The record was annotated as true positive only when all the disease names and any associated keywords were accurately extracted along with the accurate disease duration.</p>
      <p>To compare the predictions of the FSM to a gold standard (ie, manually annotated data, in our case), a confusion matrix was used. <xref ref-type="table" rid="table3">Table 3</xref> represents a generic 2×2 confusion matrix used to identify the diagnosis. <xref ref-type="table" rid="table4">Table 4</xref> represents a confusion matrix used to identify the duration associated with that particular diagnosis.</p>
      <p>For the extraction of disease names and associated keywords, we reported an accuracy of 95%, sensitivity of 94.9%, and PPV of 100%. For the extraction of the disease duration, we reported an accuracy of 93%, sensitivity of 92.9%, and PPV of 100%.</p>
      <table-wrap position="float" id="table3">
        <label>Table 3</label>
        <caption>
          <p>Confusion matrix to determine the diagnosis of disease (n=100).</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="300"/>
          <col width="350"/>
          <col width="350"/>
          <thead>
            <tr valign="top">
              <td>Gold-standard method</td>
              <td colspan="2">FSM<sup>a</sup> result</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Predicted “Yes”</td>
              <td>Predicted “No”</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Actual “Yes”</td>
              <td>94</td>
              <td>5</td>
            </tr>
            <tr valign="top">
              <td>Actual “No”</td>
              <td>0</td>
              <td>1</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table3fn1">
            <p><sup>a</sup>FSM: finite-state machine.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table4">
        <label>Table 4</label>
        <caption>
          <p>Confusion matrix to determine duration of the associated diagnosis (n=100).</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="300"/>
          <col width="350"/>
          <col width="350"/>
          <thead>
            <tr valign="top">
              <td>Gold-standard method</td>
              <td colspan="2">FSM<sup>a</sup> result</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Predicted “Yes”</td>
              <td>Predicted “No”</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Actual “Yes”</td>
              <td>92</td>
              <td>7</td>
            </tr>
            <tr valign="top">
              <td>Actual “No”</td>
              <td>0</td>
              <td>1</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table4fn1">
            <p><sup>a</sup>FSM: finite-state machine.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>In this study, we demonstrated that FSM can be used to accurately identify the disease name, associated keywords, and disease duration from a large cohort of patient records obtained using an EMR system that has been implemented across a large, multitier ophthalmology network in India. Many previous studies have used regular expressions and natural language processing (NLP) to extract disease names or keywords [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]. Hobbs et al [<xref ref-type="bibr" rid="ref18">18</xref>] used cascading finite-state automata for extracting information from natural language text. Leroy et al [<xref ref-type="bibr" rid="ref19">19</xref>] used finite-state automata to structure the relation between extracted entities, but attempts to extract the duration of the disease along with the disease name itself have not been made previously.</p>
        <p>A variety of valuable medical information is stored in texts that are unstructured, but there are many challenges in dealing with such data as the text may contain many errors, incorrect usage of grammar, and improper structural framework, which would increase the challenges in analyzing and processing of data.</p>
        <p>Unstructured data gives a wider picture about patient data and aids clinicians in connecting the dots and presenting a more accurate picture of the health of the patient. Extracting useful information from these records helps doctors to identify a patient’s medical history and also make important predictions.</p>
        <p>There are inherent challenges in how information in an unstructured format is inputted into the EMR. This is governed by the training, literacy, and typing skill of the user in question. These challenges include errors in spelling and nonconformity of the structure of the data inputted. Automating the analysis of data in such formats helps reduce the time taken for manual mining of information.</p>
      </sec>
      <sec>
        <title>Comparison With Prior Work</title>
        <p>In a study on asthma by Zeng et al [<xref ref-type="bibr" rid="ref5">5</xref>], which involved using NLP for extracting principal diagnosis, comorbidity, and smoking status, the accuracy of the algorithm was 82%, 87%, and 90%, respectively. Rosier et al [<xref ref-type="bibr" rid="ref6">6</xref>] used clinical records to extract data on pacemaker implantation procedures by using regular expressions. The system extracted information with a very high PPV (&#62;95%) and sensitivity (&#62;90%). In a study by Murtaugh et al [<xref ref-type="bibr" rid="ref7">7</xref>], which involved extraction of body weight values from clinical notes, the accuracy was 98.3% and precision was 98.8%. These values are similar to our findings in this study involving identification of systemic diseases and their durations.</p>
        <p>Systemic diseases are frequently considered to be the underlying cause of many medical conditions. Systemic disease history is a particularly important component in the examination of patients with eye disease. Various systemic diseases affect the eye, notably diabetic retinopathy, dry eye disease, cataract, and thyroid eye disease [<xref ref-type="bibr" rid="ref20">20</xref>]. An understanding of the duration of the systemic diseases is vital to prognosticate the severity of the ocular condition and the treatment outcomes.</p>
        <p>In a systematic review based on extracting information from the text of EMRs to improve case detection, Ford et al [<xref ref-type="bibr" rid="ref8">8</xref>] compared the accuracy of case-detection algorithms by comparing codes and text. For codes-only algorithms, the median sensitivity was 61.7% and PPV was 72%. For text-only algorithms, the median sensitivity was 78.1% and PPV was 73%. Moreover, for a combination of text and codes, the median sensitivity was 78.1% and PPV was 86%. The medical conditions included in this review were respiratory infections, bowel disease, cancer, and diabetes. The algorithm sensitivity ranged from 48.4% to 99.2%, specificity ranged from 90% to 99.4%, and PPV ranged from 54% to 97.9% [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
        <p>Zheng et al [<xref ref-type="bibr" rid="ref13">13</xref>] used both structured and unstructured EMRs for developing and testing a web-based diabetes case detection algorithm. The NLP-based algorithm had a PPV of 90%. Petch et al [<xref ref-type="bibr" rid="ref14">14</xref>] extracted 15 clinical features from dictated ambulatory consult notes by using a commercially available NLP-based tool. NLP performed best for features that were classified as simple, yielding an overall accuracy of 96%. However, the performance was lower for other features that were of moderate and complex linguistic complexity.</p>
        <p>The scope of this study, as the first experiment of this nature, was to successfully categorize systemic diseases and their durations from a cohort of patient records. The next models will focus on categorizing clinical findings based on slit lamp examination of various parts of the eye and the plan of management written by the health care provider. The tasks we undertook in this study were relatively challenging. The major challenges were that a patient’s medical history may contain information about multiple diseases. The presence of the name of a disease does not always imply the patient was diagnosed with that disease. We can have instances where a doctor may write that the patient had no history of a particular disease. Moreover, not all disease names identified in the data had a duration associated with them. For example, there can be 3 disease names and only 2 duration-related tags. All these challenges were addressed by the current methodology of FSM, as described in this study.</p>
      </sec>
      <sec>
        <title>Study Limitations</title>
        <p>One of the limitations of this study is that if the duration of the disease preceded the disease name, it could not be identified and that disease names could not be identified if there were any spelling mistakes.</p>
        <p>Since the current dataset had negligible spelling mistakes, and the disease names were always followed by the duration, the state space of the current FSM was small. However, an advantage of modeling this as an FSM is that it can be easily extended to run on datasets where these assumptions do not hold true. Thus, we propose that FSM is a very robust framework to address challenges of automated systemic disease and duration categorization. Our findings also suggest that this method can be used more generally for both clinical and research purposes, to identify the disease and duration.</p>
      </sec>
      <sec>
        <title>Future Directions</title>
        <p>Future work involves using the FSM on more datasets, understanding the complexities of the unstructured datasets that are used as inputs, and incorporating more changes to make the FSM more robust. This is an ongoing process of periodically analyzing the input data to modify the state changes to enable a more accurate categorization of the required variables.</p>
        <p>The adoption of EMR in a large country like India is rather low. There are various associated challenges, but the potential long-term benefits for research and education are promising. Structured datasets from the EMR are crucial for any meaningful research to be conducted. Unstructured datasets also need to be analyzed in an automated fashion to minimize the time required for analyses.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In conclusion, we present a novel technique that was developed to analyze unstructured data of systemic diseases and their durations in a large cohort of patient records in a multitier ophthalmology network in India.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">EMR</term>
          <def>
            <p>electronic medical record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">FSM</term>
          <def>
            <p>finite-state machine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">PPV</term>
          <def>
            <p>positive predictive value</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>We wish to acknowledge the support of our Department of eyeSmart EMR &#38; AEye team and are especially grateful to Mr. Mohammad Pasha for his assistance.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Das</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kammari</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Vadapalli</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Basu</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Big data and the eyeSmart electronic medical record system - An 8-year experience from a three-tier eye care network in India</article-title>
          <source>Indian J Ophthalmol</source>
          <year>2020</year>
          <month>03</month>
          <volume>68</volume>
          <issue>3</issue>
          <fpage>427</fpage>
          <lpage>432</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.ijo.in/article.asp?issn=0301-4738;year=2020;volume=68;issue=3;spage=427;epage=432;aulast=Das"/>
          </comment>
          <pub-id pub-id-type="doi">10.4103/ijo.IJO_710_19</pub-id>
          <pub-id pub-id-type="medline">32056994</pub-id>
          <pub-id pub-id-type="pii">IndianJOphthalmol_2020_68_3_427_278371</pub-id>
          <pub-id pub-id-type="pmcid">PMC7043185</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Data processing and text mining technologies on electronic medical records: a review</article-title>
          <source>J Healthc Eng</source>
          <year>2018</year>
          <volume>2018</volume>
          <fpage>4302425</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1155/2018/4302425"/>
          </comment>
          <pub-id pub-id-type="doi">10.1155/2018/4302425</pub-id>
          <pub-id pub-id-type="medline">29849998</pub-id>
          <pub-id pub-id-type="pmcid">PMC5911323</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kong</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Managing unstructured big data in healthcare system</article-title>
          <source>Healthc Inform Res</source>
          <year>2019</year>
          <month>01</month>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>2</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.e-hir.org/DOIx.php?id=10.4258/hir.2019.25.1.1"/>
          </comment>
          <pub-id pub-id-type="doi">10.4258/hir.2019.25.1.1</pub-id>
          <pub-id pub-id-type="medline">30788175</pub-id>
          <pub-id pub-id-type="pmcid">PMC6372467</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rusu</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Halcu</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Grigoriu</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Neculoiu</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Sandulescu</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Marinescu</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Converting unstructured and semi-structured data into knowledge</article-title>
          <year>2013</year>
          <conf-name>2013 11th RoEduNet International Conference</conf-name>
          <conf-date>Jan 17-19, 2013</conf-date>
          <conf-loc>Sinaia, Romania</conf-loc>
          <publisher-name>IEEE</publisher-name>
          <fpage>1</fpage>
          <lpage>4</lpage>
          <pub-id pub-id-type="doi">10.1109/roedunet.2013.6511736</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>QT</given-names>
            </name>
            <name name-style="western">
              <surname>Goryachev</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sordo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>Lazarus</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Extracting principal diagnosis, co-morbidity and smoking status for asthma research: evaluation of a natural language processing system</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2006</year>
          <month>07</month>
          <day>26</day>
          <volume>6</volume>
          <fpage>30</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/1472-6947-6-30"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1472-6947-6-30</pub-id>
          <pub-id pub-id-type="medline">16872495</pub-id>
          <pub-id pub-id-type="pii">1472-6947-6-30</pub-id>
          <pub-id pub-id-type="pmcid">PMC1553439</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rosier</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Burgun</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mabo</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Using regular expressions to extract information on pacemaker implantation procedures from clinical reports</article-title>
          <source>AMIA Annu Symp Proc</source>
          <year>2008</year>
          <month>11</month>
          <day>06</day>
          <fpage>81</fpage>
          <lpage>5</lpage>
          <pub-id pub-id-type="medline">18998970</pub-id>
          <pub-id pub-id-type="pmcid">PMC2656039</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Murtaugh</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Gibson</surname>
              <given-names>BS</given-names>
            </name>
            <name name-style="western">
              <surname>Redd</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng-Treitler</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>Regular expression-based learning to extract bodyweight values from clinical notes</article-title>
          <source>J Biomed Inform</source>
          <year>2015</year>
          <month>04</month>
          <volume>54</volume>
          <fpage>186</fpage>
          <lpage>90</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(15)00041-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2015.02.009</pub-id>
          <pub-id pub-id-type="medline">25746391</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(15)00041-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ford</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Carroll</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Scott</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cassell</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Extracting information from the text of electronic medical records to improve case detection: a systematic review</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2016</year>
          <month>09</month>
          <volume>23</volume>
          <issue>5</issue>
          <fpage>1007</fpage>
          <lpage>15</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26911811"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocv180</pub-id>
          <pub-id pub-id-type="medline">26911811</pub-id>
          <pub-id pub-id-type="pii">ocv180</pub-id>
          <pub-id pub-id-type="pmcid">PMC4997034</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bui</surname>
              <given-names>DDA</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng-Treitler</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>Learning regular expressions for clinical text classification</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2014</year>
          <volume>21</volume>
          <issue>5</issue>
          <fpage>850</fpage>
          <lpage>7</lpage>
          <pub-id pub-id-type="doi">10.1136/amiajnl-2013-002411</pub-id>
          <pub-id pub-id-type="medline">24578357</pub-id>
          <pub-id pub-id-type="pii">amiajnl-2013-002411</pub-id>
          <pub-id pub-id-type="pmcid">PMC4147608</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>A pattern-based method for medical entity recognition from Chinese diagnostic imaging text</article-title>
          <source>Front Artif Intell</source>
          <year>2019</year>
          <month>05</month>
          <day>14</day>
          <volume>2</volume>
          <pub-id pub-id-type="doi">10.3389/frai.2019.00001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guetterman</surname>
              <given-names>TC</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>DeJonckheere</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Basu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Scruggs</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Vydiswaran</surname>
              <given-names>VV</given-names>
            </name>
          </person-group>
          <article-title>Augmenting qualitative text analysis with natural language processing: methodological study</article-title>
          <source>J Med Internet Res</source>
          <year>2018</year>
          <month>06</month>
          <day>29</day>
          <volume>20</volume>
          <issue>6</issue>
          <fpage>e231</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2018/6/e231/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/jmir.9702</pub-id>
          <pub-id pub-id-type="medline">29959110</pub-id>
          <pub-id pub-id-type="pii">v20i6e231</pub-id>
          <pub-id pub-id-type="pmcid">PMC6045788</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kaufman</surname>
              <given-names>DR</given-names>
            </name>
            <name name-style="western">
              <surname>Sheehan</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Stetson</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bhatt</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Field</surname>
              <given-names>AI</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Maisel</surname>
              <given-names>JM</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing-enabled and conventional data capture methods for input to electronic health records: a comparative usability study</article-title>
          <source>JMIR Med Inform</source>
          <year>2016</year>
          <month>10</month>
          <day>28</day>
          <volume>4</volume>
          <issue>4</issue>
          <fpage>e35</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2016/4/e35/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/medinform.5544</pub-id>
          <pub-id pub-id-type="medline">27793791</pub-id>
          <pub-id pub-id-type="pii">v4i4e35</pub-id>
          <pub-id pub-id-type="pmcid">PMC5106560</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>AY</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ngo</surname>
              <given-names>AD</given-names>
            </name>
            <name name-style="western">
              <surname>Jackson-Browne</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Feller</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>McElhinney</surname>
              <given-names>DB</given-names>
            </name>
            <name name-style="western">
              <surname>Culver</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Alfreds</surname>
              <given-names>ST</given-names>
            </name>
            <name name-style="western">
              <surname>Stearns</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Sylvester</surname>
              <given-names>KG</given-names>
            </name>
            <name name-style="western">
              <surname>Widen</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Ling</surname>
              <given-names>XB</given-names>
            </name>
          </person-group>
          <article-title>Web-based real-time case finding for the population health management of patients with diabetes mellitus: a prospective validation of the natural language processing-based algorithm with statewide electronic medical records</article-title>
          <source>JMIR Med Inform</source>
          <year>2016</year>
          <month>11</month>
          <day>11</day>
          <volume>4</volume>
          <issue>4</issue>
          <fpage>e37</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2016/4/e37/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/medinform.6328</pub-id>
          <pub-id pub-id-type="medline">27836816</pub-id>
          <pub-id pub-id-type="pii">v4i4e37</pub-id>
          <pub-id pub-id-type="pmcid">PMC5124114</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Petch</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Batt</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Murray</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mamdani</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Extracting clinical features from dictated ambulatory consult notes using a commercially available natural language processing tool: pilot, retrospective, cross-sectional validation study</article-title>
          <source>JMIR Med Inform</source>
          <year>2019</year>
          <month>11</month>
          <day>01</day>
          <volume>7</volume>
          <issue>4</issue>
          <fpage>e12575</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2019/4/e12575/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/12575</pub-id>
          <pub-id pub-id-type="medline">31682579</pub-id>
          <pub-id pub-id-type="pii">v7i4e12575</pub-id>
          <pub-id pub-id-type="pmcid">PMC6913750</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Hernandez</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kimura</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kharrazi</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Extraction of geriatric syndromes from electronic health record clinical notes: assessment of statistical natural language processing methods</article-title>
          <source>JMIR Med Inform</source>
          <year>2019</year>
          <month>03</month>
          <day>26</day>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>e13039</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2019/1/e13039/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/13039</pub-id>
          <pub-id pub-id-type="medline">30862607</pub-id>
          <pub-id pub-id-type="pii">v7i1e13039</pub-id>
          <pub-id pub-id-type="pmcid">PMC6454337</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Agaronnik</surname>
              <given-names>ND</given-names>
            </name>
            <name name-style="western">
              <surname>Lindvall</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>El-Jawahri</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Iezzoni</surname>
              <given-names>LI</given-names>
            </name>
          </person-group>
          <article-title>Challenges of developing a natural language processing method with electronic health records to identify persons with chronic mobility disability</article-title>
          <source>Arch Phys Med Rehabil</source>
          <year>2020</year>
          <month>10</month>
          <volume>101</volume>
          <issue>10</issue>
          <fpage>1739</fpage>
          <lpage>1746</lpage>
          <pub-id pub-id-type="doi">10.1016/j.apmr.2020.04.024</pub-id>
          <pub-id pub-id-type="medline">32446905</pub-id>
          <pub-id pub-id-type="pii">S0003-9993(20)30291-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC7529728</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Koleck</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Dreisbach</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bourne</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bakken</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing of symptoms documented in free-text narratives of electronic health records: a systematic review</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2019</year>
          <month>04</month>
          <day>01</day>
          <volume>26</volume>
          <issue>4</issue>
          <fpage>364</fpage>
          <lpage>379</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/30726935"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocy173</pub-id>
          <pub-id pub-id-type="medline">30726935</pub-id>
          <pub-id pub-id-type="pii">5307912</pub-id>
          <pub-id pub-id-type="pmcid">PMC6657282</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hobbs</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Appelt</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bear</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Israel</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kameyama</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Stickel</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Roche</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Schabes</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>FASTUS: A Cascaded Finite-State Transducer for Extracting Information from Natural-Language Text</article-title>
          <source>Finite-state Language Processing</source>
          <year>1997</year>
          <month>05</month>
          <publisher-loc>Cambridge, Massachusetts</publisher-loc>
          <publisher-name>MIT Press</publisher-name>
          <fpage>383</fpage>
          <lpage>406</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Leroy</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Martinez</surname>
              <given-names>JD</given-names>
            </name>
          </person-group>
          <article-title>A shallow parser based on closed-class words to capture relations in biomedical text</article-title>
          <source>J Biomed Inform</source>
          <year>2003</year>
          <month>06</month>
          <volume>36</volume>
          <issue>3</issue>
          <fpage>145</fpage>
          <lpage>158</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S153204640300039X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/s1532-0464(03)00039-x</pub-id>
          <pub-id pub-id-type="medline">14615225</pub-id>
          <pub-id pub-id-type="pii">S153204640300039X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pinazo-Durán</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Zanón-Moreno</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>García-Medina</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Arévalo</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Gallego-Pinazo</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Nucci</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Eclectic ocular comorbidities and systemic diseases with eye involvement: a review</article-title>
          <source>Biomed Res Int</source>
          <year>2016</year>
          <volume>2016</volume>
          <fpage>6215745</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1155/2016/6215745"/>
          </comment>
          <pub-id pub-id-type="doi">10.1155/2016/6215745</pub-id>
          <pub-id pub-id-type="medline">27051666</pub-id>
          <pub-id pub-id-type="pmcid">PMC4808667</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
