NLPSandbox.io is an open platform for benchmarking modular natural language processing (NLP) tools on both public and private datasets. Academics, students, and industry professionals are invited to browse the available tasks and participate by developing and submitting an NLP Sandbox tool.
Among the tasks benchmarked in the NLP Sandbox are PHI (protected health information) annotation and de-identification tasks. One of the datasets used to benchmark the performance of NLP Sandbox tools submitted for these tasks comes from the 2014 i2b2 NLP De-identification Challenge. This dataset does not include sensitive information: the authors replaced the PHI with synthetic values that prevent the re-identification of patients.
This notebook takes as input the official training and evaluation dataset files of the 2014 i2b2 NLP De-identification Challenge and generates a new dataset using the NLP Sandbox Schemas. The generated JSON files can then be stored in an instance of the NLP Sandbox Data Node. A Python script that reads a dataset JSON file and pushes its content to a Data Node is provided in the GitHub repository of the NLP Sandbox Client Library for Python.
This notebook requires three files from the 2014 i2b2 NLP De-identification Challenge dataset, downloaded and stored in Synapse:
| Name | Description | Synapse ID |
|---|---|---|
| Training Data: PHI Gold Set 1 | Training notes and annotations (part 1). | syn23196860 |
| Training Data: PHI Gold Set 2 | Training notes and annotations (part 2). | syn23196850 |
| Test Data: PHI Gold Set - Fixed | Evaluation notes and annotations. | syn23196857 |
The terms and conditions of use for this i2b2 dataset prevent us from making these Synapse files publicly available. You need to download the files from their official repository (link above), push them to Synapse, and then update the Synapse IDs of these files in this notebook (see the Configuration section).
Set the value of `SYNAPSE_TOKEN` in the configuration file `.env` to one of your Synapse personal access tokens. Tokens can be generated in Synapse via User Menu > Settings > Personal Access Tokens.
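A minimal `.env` sketch (the two `APP_*` variable names are taken from the configuration chunk below; all values shown are placeholders to replace with your own):
SYNAPSE_TOKEN=<your-personal-access-token>
APP_NLPSANDBOX_SCHEMAS_VERSION=<schemas-version>
APP_DATASET_VERSION=<dataset-version>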
List the Conda environments.
library(reticulate)
options(reticulate.repl.quiet = TRUE)
conda_list(conda = "auto")
Activate the environment `i2b2-phi-dataset`.
use_condaenv("i2b2-phi-dataset", required = TRUE)
synapseclient <- reticulate::import('synapseclient')
syn <- synapseclient$Synapse()
syn$login()
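If the token is not picked up automatically, it can also be passed explicitly; a minimal sketch, assuming the installed synapseclient version supports the `authToken` argument:
syn$login(authToken = Sys.getenv("SYNAPSE_TOKEN"))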
schemas_version <- Sys.getenv("APP_NLPSANDBOX_SCHEMAS_VERSION")
dataset_version <- Sys.getenv("APP_DATASET_VERSION")
# Original files of the 2014 i2b2 NLP De-identification challenge.
dataset_archives <- list(
training_1 = list(
synId = "syn23196860", # 2014_training-PHI-Gold-Set1.tar.gz
type = "training"
),
training_2 = list(
synId = "syn23196850", # training-PHI-Gold-Set2.tar.gz
type = "training"
),
evaluation = list(
synId = "syn23196857", # testing-PHI-Gold-fixed.tar.gz
type = "evaluation"
)
)
# Project folders
project_dir <- rprojroot::find_rstudio_root_file()
data_dir <- file.path(project_dir, "data")
output_dir <- file.path(project_dir, "output")
# Output folders
job_name <- "i2b2-phi-dataset-nlpsandbox"
job_version <- dataset_version
job_dir <- file.path(output_dir, job_name)
job_output_dir <- file.path(job_dir, job_version)
# Local output files
output_files <- list(
training = list(
annotations_json = file.path(job_output_dir, "training-set-annotations.json"),
patients_json = file.path(job_output_dir, "training-set-patients.json")
),
evaluation = list(
annotations_json = file.path(job_output_dir, "evaluation-set-annotations.json"),
patients_json = file.path(job_output_dir, "evaluation-set-patients.json")
)
)
# Number of CPU cores
num_cores <- parallel::detectCores()
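Note that `parallel::mclapply`, used below, relies on process forking, which is not available on Windows. There, the core count should be forced to 1 (or the calls switched to `parallel::parLapply`):
num_cores <- ifelse(.Platform$OS.type == "windows", 1, parallel::detectCores())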
Get the dataset archives from Synapse, extract them and get the paths to the XML files. Each XML file includes the text of the clinical note (TEXT node) and a list of annotations (TAGS node).
dataset_archives <- lapply(dataset_archives, function(archive) {
file_handle <- syn$get(archive$synId, downloadLocation=data_dir,
ifcollision='overwrite.local')
# Extract tar.gz archive and get list of XML files
untar(file_handle$path, exdir=data_dir)
xml_paths <- file.path(data_dir, untar(file_handle$path, list=TRUE)) %>%
str_subset(pattern = "\\.xml$")
archive$xml_paths <- xml_paths
archive
})
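As a quick check of the layout described above, one can inspect the nodes of a single extracted file (a sketch; the exact tag names and counts depend on the file):
doc <- xml2::read_xml(dataset_archives$evaluation$xml_paths[[1]])
xml2::xml_name(xml2::xml_children(doc)) # expect "TEXT" and "TAGS"
head(xml2::xml_name(xml2::xml_find_all(doc, "//TAGS/*"))) # annotation tag names, e.g. DATE, NAME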
The training set is split across two archives. Combine the archive contents into a single training dataset and a single evaluation dataset.
datasets <- list(
training = list(
type = "training",
xml_paths = list.filter(dataset_archives, type == 'training') %>%
list.select(xml_paths) %>%
unlist(use.names = FALSE)
),
evaluation = list(
type = "evaluation",
xml_paths = list.filter(dataset_archives, type == 'evaluation') %>%
list.select(xml_paths) %>%
unlist(use.names = FALSE)
)
)
Number of clinical notes per dataset:
lapply(datasets, function(dataset) { length(dataset$xml_paths) })
## $training
## [1] 790
##
## $evaluation
## [1] 514
Extract the annotations from the XML files and save them to data frames.
datasets <- lapply(datasets, function(dataset) {
annotations <- do.call(rbind, mclapply(dataset$xml_paths, function(xml_path) {
doc <- xml2::read_xml(xml_path)
xml_find_all(doc, xpath = "//TAGS/*") %>%
purrr::map_dfr(~ {
# save the primary type of the annotations
name <- xml_name(.)
# attributes of the XML tag
attrs <- xml_attrs(.) %>%
tibble::enframe() %>%
tidyr::spread(name, value)
# extract the patient ID and note index from the note filename, e.g. "110-01.xml"
parts <- strsplit(fs::path_ext_remove(fs::path_file(xml_path)), "-")[[1]]
patient_id <- parts[1]
note_index <- parts[2]
# combine the annotations
cbind.data.frame(name, attrs, patient_id, note_index) %>%
tibble::set_tidy_names() %>%
as_tibble()
})
}, mc.cores = num_cores, mc.set.seed = TRUE))
dataset$annotations <- annotations
dataset
})
Number of annotations per dataset:
lapply(datasets, function(dataset) { nrow(dataset$annotations) })
## $training
## [1] 17405
##
## $evaluation
## [1] 11462
All the properties are currently strings. The following transformations are applied:

- `start` is cast to integer
- `end` is cast to integer

datasets <- lapply(datasets, function(dataset) {
dataset$annotations$start <- as.integer(dataset$annotations$start)
dataset$annotations$end <- as.integer(dataset$annotations$end)
dataset
})
The property `end` is now replaced by `length`, computed as `end - start`. A length is more convenient to use and easier to validate because it is relative to `start`: for example, an annotation with `start = 10` and `end = 15` becomes `start = 10`, `length = 5`.
datasets <- lapply(datasets, function(dataset) {
if ("end" %in% colnames(dataset$annotations)) {
dataset$annotations$length <- dataset$annotations$end - dataset$annotations$start
dataset$annotations$end <- NULL
}
dataset
})
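One benefit of storing a length is easier validation, since a length must be non-negative. A minimal sanity check:
stopifnot(all(datasets$training$annotations$length >= 0))
stopifnot(all(datasets$evaluation$annotations$length >= 0))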
The annotation data frames now contain the following columns:
names(datasets$evaluation$annotations)
## [1] "name" "comment" "id" "start" "text"
## [6] "TYPE" "patient_id" "note_index" "length"
The i2b2 annotations are now converted into the NLP Sandbox annotation types, starting with the date annotations:
date_annotations <- lapply(names(datasets), function(dataset_name) {
dataset <- datasets[[dataset_name]]
annotations <- dataset$annotations
annotations <- annotations[annotations$name == "DATE",]
data.frame(
noteId = paste0(annotations$patient_id, "-", annotations$note_index),
start = annotations$start,
length = annotations$length,
text = annotations$text,
confidence = 100,
dateFormat = "",
stringsAsFactors = FALSE)
})
names(date_annotations) <- names(datasets)
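A quick look at the structure of the resulting data frames (output omitted):
str(date_annotations$training)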
Create the person-name annotations from the i2b2 NAME annotations:
person_name_annotations <- lapply(names(datasets), function(dataset_name) {
dataset <- datasets[[dataset_name]]
annotations <- dataset$annotations
annotations <- annotations[annotations$name == "NAME",]
data.frame(
noteId = paste0(annotations$patient_id, "-", annotations$note_index),
start = annotations$start,
length = annotations$length,
text = annotations$text,
confidence = 100,
stringsAsFactors = FALSE)
})
names(person_name_annotations) <- names(datasets)
Create the location annotations, mapping the i2b2 location types to the NLP Sandbox location types:
# Lookup table used to convert location types
locationTypeLUT <- data.frame(
i2b2 = c("CITY",
"COUNTRY",
"HOSPITAL",
"LOCATION-OTHER",
"ORGANIZATION",
"STATE",
"STREET",
"ZIP"),
nlp_sandbox = c("city",
"country",
"hospital",
"other",
"organization",
"state",
"street",
"zip")
)
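The mapping below relies on base R's `match()`; for example (a sketch, expected output shown as a comment):
locationTypeLUT$nlp_sandbox[match(c("HOSPITAL", "ZIP"), locationTypeLUT$i2b2)]
# expected: "hospital" "zip"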
location_annotations <- lapply(names(datasets), function(dataset_name) {
dataset <- datasets[[dataset_name]]
annotations <- dataset$annotations
annotations <- annotations[annotations$name == "LOCATION",]
data.frame(
noteId = paste0(annotations$patient_id, "-", annotations$note_index),
start = annotations$start,
length = annotations$length,
text = annotations$text,
confidence = 100,
locationType = locationTypeLUT$nlp_sandbox[match(annotations$TYPE, locationTypeLUT$i2b2)],
stringsAsFactors = FALSE)
})
names(location_annotations) <- names(datasets)
Create the ID annotations, mapping the i2b2 ID types to the NLP Sandbox ID types:
# Lookup table used to convert id types
idTypeLUT <- data.frame(
i2b2 = c("ACCOUNT",
"BIOID",
"DEVICE",
"HEALTHPLAN",
"IDNUM",
"LICENSE",
"MEDICALRECORD",
"SSN",
"VEHICLE"),
nlp_sandbox = c("account",
"bio_id",
"device",
"health_plan",
"id_number",
"license",
"medical_record",
"ssn",
"vehicle")
)
id_annotations <- lapply(names(datasets), function(dataset_name) {
dataset <- datasets[[dataset_name]]
annotations <- dataset$annotations
annotations <- annotations[annotations$name == "ID",]
data.frame(
noteId = paste0(annotations$patient_id, "-", annotations$note_index),
start = annotations$start,
length = annotations$length,
text = annotations$text,
confidence = 100,
idType = idTypeLUT$nlp_sandbox[match(annotations$TYPE, idTypeLUT$i2b2)],
stringsAsFactors = FALSE)
})
names(id_annotations) <- names(datasets)
Create the contact annotations, mapping the i2b2 contact types to the NLP Sandbox contact types:
# Lookup table used to convert contact types
contactTypeLUT <- data.frame(
i2b2 = c("EMAIL",
"FAX",
"IPADDR",
"PHONE",
"URL"),
nlp_sandbox = c("email",
"fax",
"ip_address",
"phone",
"url")
)
contact_annotations <- lapply(names(datasets), function(dataset_name) {
dataset <- datasets[[dataset_name]]
annotations <- dataset$annotations
annotations <- annotations[annotations$name == "CONTACT",]
data.frame(
noteId = paste0(annotations$patient_id, "-", annotations$note_index),
start = annotations$start,
length = annotations$length,
text = annotations$text,
confidence = 100,
contactType = contactTypeLUT$nlp_sandbox[match(annotations$TYPE, contactTypeLUT$i2b2)],
stringsAsFactors = FALSE)
})
names(contact_annotations) <- names(datasets)
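The five conversion chunks above follow the same pattern and could be factored into a single helper. A minimal sketch (the name `extract_annotations` is hypothetical; the `dateFormat` column of the date annotations would still need to be added separately):
extract_annotations <- function(datasets, i2b2_name, type_lut = NULL, type_col = NULL) {
  result <- lapply(names(datasets), function(dataset_name) {
    annotations <- datasets[[dataset_name]]$annotations
    annotations <- annotations[annotations$name == i2b2_name, ]
    out <- data.frame(
      noteId = paste0(annotations$patient_id, "-", annotations$note_index),
      start = annotations$start,
      length = annotations$length,
      text = annotations$text,
      confidence = 100,
      stringsAsFactors = FALSE)
    # map the i2b2 TYPE to the NLP Sandbox type when a lookup table is given
    if (!is.null(type_lut)) {
      out[[type_col]] <- type_lut$nlp_sandbox[match(annotations$TYPE, type_lut$i2b2)]
    }
    out
  })
  names(result) <- names(datasets)
  result
}
# e.g., equivalent to the chunk above:
# contact_annotations <- extract_annotations(datasets, "CONTACT", contactTypeLUT, "contactType")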
Finally, assemble one patient bundle per patient and write the training and evaluation datasets to JSON files.
# create output dir if needed
dir.create(job_output_dir, recursive = TRUE, showWarnings = FALSE)
# save the patients of the training and evaluation sets
noop <- lapply(names(datasets), function(dataset_name) {
dataset <- datasets[[dataset_name]]
patient_ids <- sort(unique(dataset$annotations$patient_id))
patient_bundles <- lapply(patient_ids, function(patient_id) {
# create FHIR Patient object
patient <- list(
identifier = patient_id,
gender = 'unknown'
)
# create Notes objects for the patient specified
tag <- paste0(patient_id, "-")
xml_paths <- sort(dataset$xml_paths[grepl(tag, dataset$xml_paths)])
note_bundles <- lapply(xml_paths, function(xml_path){
# create note object
note_xml_doc <- read_xml(xml_path, options = "NOCDATA")
note_text <- xml_text(xml_find_all(note_xml_doc, "//TEXT"), trim = FALSE)
note <- list(
text=note_text,
type="",
patientId="PATIENT_ID"
)
# create the Annotation objects for the note specified
noteId <- gsub(pattern = "\\.xml$", "", basename(xml_path))
## get date annotations
df <- date_annotations[[dataset_name]]
df <- df[df$noteId == noteId,]
text_date_annotations <- df %>% purrr::transpose()
## get person name annotations
df <- person_name_annotations[[dataset_name]]
df <- df[df$noteId == noteId,]
text_person_name_annotations <- df %>% purrr::transpose()
## get location annotations
df <- location_annotations[[dataset_name]]
df <- df[df$noteId == noteId,]
text_location_annotations <- df %>% purrr::transpose()
## get contact annotations
df <- contact_annotations[[dataset_name]]
df <- df[df$noteId == noteId,]
text_contact_annotations <- df %>% purrr::transpose()
## get id annotations
df <- id_annotations[[dataset_name]]
df <- df[df$noteId == noteId,]
text_id_annotations <- df %>% purrr::transpose()
annotation <- list(
annotationSource=list(
resourceSource=list(
name="NOTE_NAME"
)
),
textDateAnnotations=text_date_annotations,
textPersonNameAnnotations=text_person_name_annotations,
textLocationAnnotations=text_location_annotations,
textContactAnnotations=text_contact_annotations,
textIdAnnotations=text_id_annotations
)
# create note bundle
note_bundle <- list(
note=note,
annotation=annotation
)
})
# create patient bundle
patient_bundle <- list(
patient=patient,
note_bundles=note_bundles
)
})
data <- list(
patient_bundles=patient_bundles
)
# create output dataset directory and save bundles
dir.create(job_output_dir, recursive = TRUE, showWarnings = FALSE)
data_json <- jsonlite::toJSON(data, auto_unbox = TRUE, pretty = TRUE)
write(data_json, file.path(job_output_dir, paste0("i2b2-phi-dataset-", dataset_name, ".json")))
})
Generate a smaller example dataset containing the data from the first ten patients of the evaluation set (patient IDs `100` to `109`).
# create output dir if needed
dir.create(job_output_dir, recursive = TRUE, showWarnings = FALSE)
dataset_name <- "evaluation"
dataset <- datasets[[dataset_name]]
patient_ids <- as.character(100:109)
patient_bundles <- lapply(patient_ids, function(patient_id) {
# create FHIR Patient object
patient <- list(
identifier = patient_id,
gender = 'unknown'
)
# create Notes objects for the patient specified
tag <- paste0(patient_id, "-")
xml_paths <- sort(dataset$xml_paths[grepl(tag, dataset$xml_paths)])
note_bundles <- lapply(xml_paths, function(xml_path){
# create note object
note_xml_doc <- read_xml(xml_path, options = "NOCDATA")
note_text <- xml_text(xml_find_all(note_xml_doc, "//TEXT"), trim = FALSE)
note <- list(
text=note_text,
type="",
patientId="PATIENT_ID"
)
# create the Annotation objects for the note specified
noteId <- gsub(pattern = "\\.xml$", "", basename(xml_path))
## get date annotations
df <- date_annotations[[dataset_name]]
df <- df[df$noteId == noteId,]
text_date_annotations <- df %>% purrr::transpose()
## get person name annotations
df <- person_name_annotations[[dataset_name]]
df <- df[df$noteId == noteId,]
text_person_name_annotations <- df %>% purrr::transpose()
## get location annotations
df <- location_annotations[[dataset_name]]
df <- df[df$noteId == noteId,]
text_location_annotations <- df %>% purrr::transpose()
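# note: unlike the full export above, this example bundle only keeps the
# date, person-name, and location annotations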
annotation <- list(
annotationSource=list(
resourceSource=list(
name="NOTE_NAME"
)
),
textDateAnnotations=text_date_annotations,
textPersonNameAnnotations=text_person_name_annotations,
textLocationAnnotations=text_location_annotations
)
# create note bundle
note_bundle <- list(
note=note,
annotation=annotation
)
})
# create patient bundle
patient_bundle <- list(
patient=patient,
note_bundles=note_bundles
)
})
data <- list(
patient_bundles=patient_bundles
)
# create output dataset directory and save bundles
dir.create(job_output_dir, recursive = TRUE, showWarnings = FALSE)
data_json <- jsonlite::toJSON(data, auto_unbox = TRUE, pretty = TRUE)
write(data_json, file.path(job_output_dir, "i2b2-phi-dataset-example.json"))
The files generated by this notebook:
list.files(path = job_output_dir, full.names = TRUE)
## [1] "/project/output/i2b2-phi-dataset-nlpsandbox/1.2.1/i2b2-phi-dataset-evaluation.json"
## [2] "/project/output/i2b2-phi-dataset-nlpsandbox/1.2.1/i2b2-phi-dataset-example.json"
## [3] "/project/output/i2b2-phi-dataset-nlpsandbox/1.2.1/i2b2-phi-dataset-training.json"
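A quick way to verify that a generated file is valid JSON is to read it back; a minimal sketch:
data_check <- jsonlite::fromJSON(
  file.path(job_output_dir, "i2b2-phi-dataset-example.json"),
  simplifyVector = FALSE
)
length(data_check$patient_bundles) # expect 10 patient bundles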