Introduction

NLPSandbox.io is an open platform for benchmarking modular natural language processing (NLP) tools on both public and private datasets. Academics, students, and industry professionals are invited to browse the available tasks and participate by developing and submitting an NLP Sandbox tool.

Several of the tasks benchmarked in the NLP Sandbox are PHI annotation and de-identification tasks. One of the datasets used to benchmark the performance of NLP Sandbox tools submitted for these tasks comes from the 2014 i2b2 NLP De-identification Challenge. This dataset does not include sensitive information: the authors replaced the PHI with synthetic values that prevent the re-identification of patients.

This notebook takes as input the official training and evaluation dataset files of the 2014 i2b2 NLP De-identification Challenge and generates a new dataset that follows the NLP Sandbox Schemas. The generated JSON files can then be stored in an instance of the NLP Sandbox Data Node. A Python script that reads a dataset JSON file and pushes its content to a Data Node is provided in the GitHub repository of the NLP Sandbox Client Library for Python.

Setup

Requirements

This notebook needs three files downloaded from the 2014 i2b2 NLP De-identification Challenge Dataset and stored in Synapse.

Name                              Description                                Synapse ID
Training Data: PHI Gold Set 1     Training notes and annotations (part 1).   syn23196860
Training Data: PHI Gold Set 2     Training notes and annotations (part 2).   syn23196850
Test Data: PHI Gold Set - Fixed   Evaluation notes and annotations.          syn23196857

The terms and conditions of use of this i2b2 dataset prevent us from making these Synapse files publicly available. You need to download the files from their official repository (link above) and upload them to Synapse yourself. The last step is to update the Synapse IDs of these files in this notebook (see Section Configuration).
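
After downloading the archives from the official repository, a minimal sketch of the upload step could look like the following, assuming you are already logged into Synapse (see Section Logging into Synapse) and that "syn00000000" stands in for your own Synapse project ID:

# Hedged sketch: store one downloaded archive in your own Synapse project.
# "syn00000000" is a placeholder project ID; replace it with yours.
archive <- synapseclient$File("2014_training-PHI-Gold-Set1.tar.gz",
                              parent = "syn00000000")
archive <- syn$store(archive)
archive$id  # the Synapse ID to paste into the Configuration section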

Set the value of SYNAPSE_TOKEN in the configuration file .env to one of your Synapse personal access tokens. Tokens can be generated in Synapse via your User Menu > Settings > Personal Access Tokens.
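
If you prefer passing the token explicitly instead of relying on cached credentials, a minimal sketch (assuming the .env file has been loaded into the R environment, and using the client created in Section Logging into Synapse):

# Hedged sketch: authenticate with the personal access token from .env
syn$login(authToken = Sys.getenv("SYNAPSE_TOKEN"))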

Conda environments

List the Conda environments.

library(reticulate)

options(reticulate.repl.quiet = TRUE)
conda_list(conda = "auto")

Activate the environment i2b2-phi-dataset.

use_condaenv("i2b2-phi-dataset", required = TRUE)
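
The remaining chunks also call a few packages unqualified. The original notebook does not show a setup chunk, so the list below is inferred from the functions used later:

library(magrittr)  # the %>% pipe
library(stringr)   # str_subset()
library(rlist)     # list.filter(), list.select()
library(xml2)      # read_xml(), xml_find_all(), xml_text()
library(tibble)    # as_tibble()
library(parallel)  # mclapply()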

Logging into Synapse

synapseclient <- reticulate::import('synapseclient')
syn <- synapseclient$Synapse()
syn$login()

Configuration

schemas_version <- Sys.getenv("APP_NLPSANDBOX_SCHEMAS_VERSION")
dataset_version <- Sys.getenv("APP_DATASET_VERSION")

# Original files of the 2014 i2b2 NLP De-identification challenge.
dataset_archives <- list(
  training_1 = list(
    synId = "syn23196860",  # 2014_training-PHI-Gold-Set1.tar.gz
    type = "training"
  ),
  training_2 = list(
    synId = "syn23196850",  # training-PHI-Gold-Set2.tar.gz
    type = "training"
  ),
  evaluation = list(
    synId = "syn23196857",  # testing-PHI-Gold-fixed.tar.gz
    type = "evaluation"
  )
)

# Project folders
project_dir <- rprojroot::find_rstudio_root_file()
data_dir <- file.path(project_dir, "data")
output_dir <- file.path(project_dir, "output")

# Output folders
job_name <- "i2b2-phi-dataset-nlpsandbox"
job_version <- dataset_version
job_dir <- file.path(output_dir, job_name)
job_output_dir <- file.path(job_dir, job_version)

# Local output files
output_files <- list(
  training = list(
    annotations_json = file.path(job_output_dir, "training-set-annotations.json"),
    patients_json = file.path(job_output_dir, "training-set-patients.json")
  ),
  evaluation = list(
    annotations_json = file.path(job_output_dir, "evaluation-set-annotations.json"),
    patients_json = file.path(job_output_dir, "evaluation-set-patients.json")
  )
)

# Number of CPU cores
num_cores <- parallel::detectCores()

Pulling data from Synapse

Get the dataset archives from Synapse, extract them, and collect the paths to the XML files. Each XML file contains the text of one clinical note (TEXT node) and its list of annotations (TAGS node).
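
For reference, the note files look roughly like the sketch below. The content is illustrative, not an actual record; the attribute names match the columns extracted later in this notebook:

<?xml version="1.0" encoding="UTF-8" ?>
<deIdi2b2>
  <TEXT><![CDATA[Record date: 2067-05-03 ...]]></TEXT>
  <TAGS>
    <DATE id="P0" start="13" end="23" text="2067-05-03" TYPE="DATE" comment=""/>
    <NAME id="P1" start="30" end="36" text="Keller" TYPE="DOCTOR" comment=""/>
  </TAGS>
</deIdi2b2>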

dataset_archives <- lapply(dataset_archives, function(archive) {
  file_handle <- syn$get(archive$synId, downloadLocation=data_dir,
                         ifcollision='overwrite.local')

  # Extract tar.gz archive and get list of XML files
  untar(file_handle$path, exdir=data_dir)
  xml_paths <- file.path(data_dir, untar(file_handle$path, list=TRUE)) %>%
    str_subset(pattern = "\\.xml$")

  archive$xml_paths <- xml_paths
  archive
})

The training set is split across two archives, while the evaluation set comes in a single archive. Combine the archive contents into one training dataset and one evaluation dataset.

datasets <- list(
  training = list(
    type = "training",
    xml_paths = list.filter(dataset_archives, type == 'training') %>%
      list.select(xml_paths) %>%
      unlist(use.names = F)
  ),
  evaluation = list(
    type = "evaluation",
    xml_paths = list.filter(dataset_archives, type == 'evaluation') %>%
      list.select(xml_paths) %>%
      unlist(use.names = F)
  )
)

Number of clinical notes per dataset:

lapply(datasets, function(dataset) { length(dataset$xml_paths) })
## $training
## [1] 790
## 
## $evaluation
## [1] 514

Generating the dataset using NLP Sandbox Schemas

Extract the annotations from the XML files and save them to data frames.

datasets <- lapply(datasets, function(dataset) {
  annotations <- do.call(rbind, mclapply(dataset$xml_paths, function(xml_path) {
    doc <- xml2::read_xml(xml_path)
    xml_find_all(doc, xpath = "//TAGS/*") %>%
      purrr::map_dfr(~ {
        # save the primary type of the annotations
        name <- xml_name(.)

        # attributes of the XML tag
        attrs <- xml_attrs(.) %>%
          tibble::enframe() %>%
          tidyr::spread(name, value)

        # extract the patient ID and note index from the filename,
        # e.g. "110-01.xml" -> patient_id "110", note_index "01"
        parts <- strsplit(fs::path_ext_remove(fs::path_file(xml_path)), "-")[[1]]
        patient_id <- parts[1]
        note_index <- parts[2]

        # combine the annotations
        cbind.data.frame(name, attrs, patient_id, note_index) %>%
          tibble::set_tidy_names() %>%
          as_tibble()
      })
  }, mc.cores = num_cores, mc.set.seed = TRUE))

  dataset$annotations <- annotations
  dataset
})
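
A quick way to eyeball the result (the column names follow the XML attributes plus the identifiers parsed from the filenames):

head(datasets$training$annotations[, c("name", "start", "end", "text", "TYPE")])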

Number of annotations per dataset:

lapply(datasets, function(dataset) { nrow(dataset$annotations) })
## $training
## [1] 17405
## 
## $evaluation
## [1] 11462

Preparation

Setting the type of annotation properties

All the annotation properties are currently character strings. The following transformations are applied:

  • start is cast to integer
  • end is cast to integer

datasets <- lapply(datasets, function(dataset) {
  dataset$annotations$start <- as.integer(dataset$annotations$start)
  dataset$annotations$end <- as.integer(dataset$annotations$end)
  dataset
})

Replacing annotation property end by length

The motivation for replacing the property end with length, computed as end - start, is that length is more convenient for users and easier to validate, since it is expressed relative to start. For example, an annotation with start = 16 and end = 26 becomes start = 16 and length = 10.

datasets <- lapply(datasets, function(dataset) {
  if ("end" %in% colnames(dataset$annotations)) {
    dataset$annotations$length <- dataset$annotations$end - dataset$annotations$start
    dataset$annotations$end <- NULL
  }
  dataset
})
# datasets$evaluation$annotations
names(datasets$evaluation$annotations)
## [1] "name"       "comment"    "id"         "start"      "text"      
## [6] "TYPE"       "patient_id" "note_index" "length"

Generating annotations

List of annotations

Annotation                 Description                           NLP Sandbox schema
TextDateAnnotation         A date annotation in a text.          HTML notebook
TextPersonNameAnnotation   A person name annotation in a text.   HTML notebook
TextLocationAnnotation     A location annotation in a text.      HTML notebook
TextIdAnnotation           An ID annotation in a text.           HTML notebook
TextContactAnnotation      A contact annotation in a text.       HTML notebook
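
To make the target format concrete, here is a sketch of a single TextDateAnnotation serialized with jsonlite. The fields mirror those built in the chunks that follow; the values are illustrative:

jsonlite::toJSON(
  list(noteId = "110-01", start = 13, length = 10,
       text = "2067-05-03", confidence = 100, dateFormat = ""),
  auto_unbox = TRUE, pretty = TRUE)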

Date annotations

date_annotations <- lapply(names(datasets), function(dataset_name) {
  dataset <- datasets[[dataset_name]]
  annotations <- dataset$annotations
  annotations <- annotations[annotations$name == "DATE",]
  data.frame(
    noteId = paste0(annotations$patient_id, "-", annotations$note_index),
    start = annotations$start,
    length = annotations$length,
    text = annotations$text,
    confidence = 100,
    dateFormat = "",
    stringsAsFactors = FALSE)
})
names(date_annotations) <- names(datasets)

Person name annotations

person_name_annotations <- lapply(names(datasets), function(dataset_name) {
  dataset <- datasets[[dataset_name]]
  annotations <- dataset$annotations
  annotations <- annotations[annotations$name == "NAME",]
  data.frame(
    noteId = paste0(annotations$patient_id, "-", annotations$note_index),
    start = annotations$start,
    length = annotations$length,
    text = annotations$text,
    confidence = 100,
    stringsAsFactors = FALSE)
})
names(person_name_annotations) <- names(datasets)

Location annotations

# Lookup table used to convert location types
locationTypeLUT <- data.frame(
  i2b2 = c("CITY",
           "COUNTRY",
           "HOSPITAL",
           "LOCATION-OTHER",
           "ORGANIZATION",
           "STATE",
           "STREET",
           "ZIP"),
  nlp_sandbox = c("city",
                  "country",
                  "hospital",
                  "other",
                  "organization",
                  "state",
                  "street",
                  "zip")
)
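# Hedged usage example for the lookup pattern used below:
#   locationTypeLUT$nlp_sandbox[match(c("CITY", "ZIP"), locationTypeLUT$i2b2)]
#   returns c("city", "zip"); an unmapped i2b2 type would yield NA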
location_annotations <- lapply(names(datasets), function(dataset_name) {
  dataset <- datasets[[dataset_name]]
  annotations <- dataset$annotations
  annotations <- annotations[annotations$name == "LOCATION",]
  data.frame(
    noteId = paste0(annotations$patient_id, "-", annotations$note_index),
    start = annotations$start,
    length = annotations$length,
    text = annotations$text,
    confidence = 100,
    locationType = locationTypeLUT$nlp_sandbox[match(annotations$TYPE, locationTypeLUT$i2b2)],
    stringsAsFactors = FALSE)
})
names(location_annotations) <- names(datasets)

ID annotations

# Lookup table used to convert id types
idTypeLUT <- data.frame(
  i2b2 = c("ACCOUNT",
           "BIOID",
           "DEVICE",
           "HEALTHPLAN",
           "IDNUM",
           "LICENSE",
           "MEDICALRECORD",
           "SSN",
           "VEHICLE"),
  nlp_sandbox = c("account",
                  "bio_id",
                  "device",
                  "health_plan",
                  "id_number",
                  "license",
                  "medical_record",
                  "ssn",
                  "vehicle")
)
id_annotations <- lapply(names(datasets), function(dataset_name) {
  dataset <- datasets[[dataset_name]]
  annotations <- dataset$annotations
  annotations <- annotations[annotations$name == "ID",]
  data.frame(
    noteId = paste0(annotations$patient_id, "-", annotations$note_index),
    start = annotations$start,
    length = annotations$length,
    text = annotations$text,
    confidence = 100,
    idType = idTypeLUT$nlp_sandbox[match(annotations$TYPE, idTypeLUT$i2b2)],
    stringsAsFactors = FALSE)
})
names(id_annotations) <- names(datasets)

Contact information annotations

# Lookup table used to convert contact types
contactTypeLUT <- data.frame(
  i2b2 = c("EMAIL",
           "FAX",
           "IPADDR",
           "PHONE",
           "URL"),
  nlp_sandbox = c("email",
                  "fax",
                  "ip_address",
                  "phone",
                  "url")
)
contact_annotations <- lapply(names(datasets), function(dataset_name) {
  dataset <- datasets[[dataset_name]]
  annotations <- dataset$annotations
  annotations <- annotations[annotations$name == "CONTACT",]
  data.frame(
    noteId = paste0(annotations$patient_id, "-", annotations$note_index),
    start = annotations$start,
    length = annotations$length,
    text = annotations$text,
    confidence = 100,
    contactType = contactTypeLUT$nlp_sandbox[match(annotations$TYPE, contactTypeLUT$i2b2)],
    stringsAsFactors = FALSE)
})
names(contact_annotations) <- names(datasets)

Exporting data to files

Full dataset

# create output dir if needed
dir.create(job_output_dir, recursive = TRUE, showWarnings = FALSE)

# save the patients of the training and evaluation set
noop <- lapply(names(datasets), function(dataset_name) {
  dataset <- datasets[[dataset_name]]
  patient_ids <- sort(unique(dataset$annotations$patient_id))
  patient_bundles <- lapply(patient_ids, function(patient_id) {

    # create FHIR Patient object
    patient <- list(
      identifier = patient_id,
      gender = 'unknown'
    )

    # create Notes objects for the patient specified
    # match "<patient_id>-" at the start of the filename to avoid partial
    # matches (e.g. patient "10" would otherwise match "110-01.xml")
    tag <- paste0("^", patient_id, "-")
    xml_paths <- sort(dataset$xml_paths[grepl(tag, basename(dataset$xml_paths))])
    note_bundles <- lapply(xml_paths, function(xml_path){
      # create note object
      note_xml_doc <- read_xml(xml_path, options = "NOCDATA")
      note_text <- xml_text(xml_find_all(note_xml_doc, "//TEXT"), trim = FALSE)
      note <- list(
        text=note_text,
        type="",
        patientId="PATIENT_ID"
      )

      # create the Annotation objects for the note specified
      noteId <- gsub(pattern = "\\.xml$", "", basename(xml_path))

      ## get date annotations
      df <- date_annotations[[dataset_name]]
      df <- df[df$noteId == noteId,]
      text_date_annotations <- df %>% purrr::transpose()

      ## get person name annotations
      df <- person_name_annotations[[dataset_name]]
      df <- df[df$noteId == noteId,]
      text_person_name_annotations <- df %>% purrr::transpose()

      ## get location annotations
      df <- location_annotations[[dataset_name]]
      df <- df[df$noteId == noteId,]
      text_location_annotations <- df %>% purrr::transpose()

      ## get contact annotations
      df <- contact_annotations[[dataset_name]]
      df <- df[df$noteId == noteId,]
      text_contact_annotations <- df %>% purrr::transpose()

      ## get id annotations
      df <- id_annotations[[dataset_name]]
      df <- df[df$noteId == noteId,]
      text_id_annotations <- df %>% purrr::transpose()

      annotation <- list(
        annotationSource=list(
          resourceSource=list(
            name="NOTE_NAME"
          )
        ),
        textDateAnnotations=text_date_annotations,
        textPersonNameAnnotations=text_person_name_annotations,
        textLocationAnnotations=text_location_annotations,
        textContactAnnotations=text_contact_annotations,
        textIdAnnotations=text_id_annotations
      )

      # create note bundle
      note_bundle <- list(
        note=note,
        annotation=annotation
      )
    })

    # create patient bundle
    patient_bundle <- list(
      patient=patient,
      note_bundles=note_bundles
    )
  })

  data <- list(
    patient_bundles=patient_bundles
  )

  # create output dataset directory and save bundles
  dir.create(job_output_dir, recursive = TRUE, showWarnings = FALSE)
  data_json <- jsonlite::toJSON(data, auto_unbox = TRUE, pretty = TRUE)
  write(data_json, file.path(job_output_dir, paste0("i2b2-phi-dataset-", dataset_name, ".json")))
})
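
A hedged sanity check: read one of the generated files back and inspect the bundle structure (the names follow the lists built above):

bundle_check <- jsonlite::read_json(
  file.path(job_output_dir, "i2b2-phi-dataset-evaluation.json"))
length(bundle_check$patient_bundles)                        # number of patients
names(bundle_check$patient_bundles[[1]])                    # "patient" "note_bundles"
names(bundle_check$patient_bundles[[1]]$note_bundles[[1]])  # "note" "annotation"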

Example dataset

The code below builds a smaller example file from a subset of evaluation patients (patient IDs 100 to 109). Note that only the date, person name, and location annotations are included in this example.

# create output dir if needed
dir.create(job_output_dir, recursive = TRUE, showWarnings = FALSE)

dataset_name <- "evaluation"
dataset <- datasets[[dataset_name]]
patient_ids <- as.character(100:109)

patient_bundles <- lapply(patient_ids, function(patient_id) {
  # create FHIR Patient object
  patient <- list(
    identifier = patient_id,
    gender = 'unknown'
  )

  # create Notes objects for the patient specified
  # match "<patient_id>-" at the start of the filename to avoid partial
  # matches (e.g. patient "10" would otherwise match "110-01.xml")
  tag <- paste0("^", patient_id, "-")
  xml_paths <- sort(dataset$xml_paths[grepl(tag, basename(dataset$xml_paths))])
  note_bundles <- lapply(xml_paths, function(xml_path){
    # create note object
    note_xml_doc <- read_xml(xml_path, options = "NOCDATA")
    note_text <- xml_text(xml_find_all(note_xml_doc, "//TEXT"), trim = FALSE)
    note <- list(
      text=note_text,
      type="",
      patientId="PATIENT_ID"
    )

    # create the Annotation objects for the note specified
    noteId <- gsub(pattern = "\\.xml$", "", basename(xml_path))

    ## get date annotations
    df <- date_annotations[[dataset_name]]
    df <- df[df$noteId == noteId,]
    text_date_annotations <- df %>% purrr::transpose()

    ## get person name annotations
    df <- person_name_annotations[[dataset_name]]
    df <- df[df$noteId == noteId,]
    text_person_name_annotations <- df %>% purrr::transpose()

    ## get location annotations
    df <- location_annotations[[dataset_name]]
    df <- df[df$noteId == noteId,]
    text_location_annotations <- df %>% purrr::transpose()

    annotation <- list(
      annotationSource=list(
        resourceSource=list(
          name="NOTE_NAME"
        )
      ),
      textDateAnnotations=text_date_annotations,
      textPersonNameAnnotations=text_person_name_annotations,
      textLocationAnnotations=text_location_annotations
    )

    # create note bundle
    note_bundle <- list(
      note=note,
      annotation=annotation
    )
  })

  # create patient bundle
  patient_bundle <- list(
    patient=patient,
    note_bundles=note_bundles
  )
})

data <- list(
  patient_bundles=patient_bundles
)

# create output dataset directory and save bundles
dir.create(job_output_dir, recursive = TRUE, showWarnings = FALSE)
data_json <- jsonlite::toJSON(data, auto_unbox = TRUE, pretty = TRUE)
write(data_json, file.path(job_output_dir, "i2b2-phi-dataset-example.json"))

Generated files

list.files(path=job_output_dir, full.names=TRUE)
## [1] "/project/output/i2b2-phi-dataset-nlpsandbox/1.2.1PASSWORD=domp.vig_psar4FESM/i2b2-phi-dataset-evaluation.json"
## [2] "/project/output/i2b2-phi-dataset-nlpsandbox/1.2.1PASSWORD=domp.vig_psar4FESM/i2b2-phi-dataset-example.json"   
## [3] "/project/output/i2b2-phi-dataset-nlpsandbox/1.2.1PASSWORD=domp.vig_psar4FESM/i2b2-phi-dataset-training.json"