# Select samples for use in reference building

## Load packages

In [1]:
quiet_library <- function(...) { suppressPackageStartupMessages(library(...)) }
quiet_library(hise)
quiet_library(dplyr)
quiet_library(purrr)

## Retrieve file and sample metadata from HISE

In [2]:
BR1_rna_desc <- getFileDescriptors(
    fileType = "scRNA-seq-labeled", 
    filter = list(cohort.cohortGuid = "BR1"))
BR2_rna_desc <- getFileDescriptors(
    fileType = "scRNA-seq-labeled", 
    filter = list(cohort.cohortGuid = "BR2"))
UP1_rna_desc <- getFileDescriptors(
    fileType = "scRNA-seq-labeled", 
    filter = list(cohort.cohortGuid = "UP1"))

In [3]:
BR1_rna_desc <- fileDescToDataframe(BR1_rna_desc)
BR2_rna_desc <- fileDescToDataframe(BR2_rna_desc)
UP1_rna_desc <- fileDescToDataframe(UP1_rna_desc)

## Remove irrelevant batches

Batches starting with "EXP" are experimental, non-pipeline batches.  
B004 is an early batch that has some batch effects. We'll exclude this batch, as samples have been re-run in later batches.  
Batches later than B145 are not used for this reference.

In [4]:
meta_data <- plyr::rbind.fill(BR1_rna_desc , BR2_rna_desc )

In [5]:
meta_data <- meta_data %>%
  filter(!grepl("EXP",file.batchID)) %>%
  filter(!file.batchID == "B004") %>%
  mutate(file.batch_num = as.numeric(sub("B","",file.batchID))) %>%
  filter(file.batch_num <= 145) %>%
  select(-file.batch_num)

## Remove non-healthy and abnormal subjects

We want to use only healthy subjects without abnormal presentation for this reference. A few subjects have non-healthy or abnormal states recored at some visits. We'll identify and remove these subjects.

In [6]:
non_healthy <- meta_data %>%
  filter(sample.diseaseStatesRecordedAtVisit != "") %>%
  select(subject.subjectGuid, sample.diseaseStatesRecordedAtVisit) %>%
  unique()

In [7]:
non_healthy

Unnamed: 0_level_0,subject.subjectGuid,sample.diseaseStatesRecordedAtVisit
Unnamed: 0_level_1,<chr>,<chr>
1,BR1034,Psoriasis
7,BR2007,Healthy - Abnormal
17,BR2049,Healthy - Abnormal


In [8]:
meta_data <- meta_data %>%
  filter(!subject.subjectGuid %in% non_healthy$subject.subjectGuid)

## Select Flu Year 1 Day 0 samples

To build our reference, we'll use the pre-vaccination samples from each of our BR1 (healthy adult 25-35 years) and BR2 (healthy adult 55-65 years) subjects.

These samples have the visit name "Flu Year 1 Day 0".

In [9]:
meta_data$pbmc_sample_id <- gsub("_","",paste0("PB0",substr(sub(".*PB0", "", meta_data$file.name),1,8)))
meta_data <- meta_data %>% 
  arrange(pbmc_sample_id) %>%
  filter(!duplicated(sample.sampleKitGuid, fromLast = TRUE)) %>%
  filter(sample.visitName == 'Flu Year 1 Day 0') %>%
  arrange(sample.sampleKitGuid)

In [10]:
table(meta_data$cohort.cohortGuid)


BR1 BR2 
 47  45 

## Select pediatric samples

A set of 16 pediatric samples have been previously published in Thomson, Z. et al. Trimodal single-cell profiling reveals a novel pediatric CD8αα+ T cell subset and broad age-related molecular reprogramming across the T cell compartment. Nat. Immunol. 24, 1947–1959 (2023).

We'll use this set of samples to provide coverage of pediatric cells in our reference.

In [11]:
UP_Sample_kits <- c("KT00809","KT00811","KT00193","KT00841",
                    "KT00842","KT00833","KT00910","KT00884",
                    "KT00892","KT00914","KT00913","KT00927",
                    "KT00928","KT02391","KT02392","KT03223")

In [12]:
UP1_rna_desc <- UP1_rna_desc %>% 
  filter(sample.sampleKitGuid %in% UP_Sample_kits)

In [13]:
meta_data <- plyr::rbind.fill(meta_data, UP1_rna_desc)

## Save file and sample metadata

In [14]:
if(!dir.exists("output")) {
    dir.create("output")
}

In [15]:
out_file <- file.path(
    "output",
    paste0("ref_h5_meta_data_",Sys.Date(),".csv")
)

write.csv(
    meta_data,
    out_file,
    row.names = FALSE,
    quote = FALSE
)

## Store results in HISE

In order to store the results in HISE, we'll need to cache these files to register them, and then we can upload the CSV file for later steps.

In [16]:
res <- cacheFiles(list(meta_data$file.id))

[1] "Initiating file download for B001-P1_PB00001-01_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B001-P1_PB00002-01_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B001-P1_PB00003-01_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B002-P1_PB00004-01_2023-11-17T21:36:51.794326181Z_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B002-P1_PB00006-01_2023-11-17T21:36:51.794326181Z_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B078-P2_PB00010-02_2021-08-19T17:09:29.934849811Z_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B002-P1_PB00012-01_2023-11-17T21:36:51.794326181Z_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B002-P1_PB00014-01_2023-11-17T21:36:51.794326181Z_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B002-P1_PB00015-01_2023-11-17T21:36:51.794326181Z_labeled.h5"
[1] "Download

ERROR: Error in if (thisDesc$file$id != idsExpanded[[fidx]]) {: the condition has length > 1


In [17]:
study_space_uuid <- "64097865-486d-43b3-8f94-74994e0a72e0"
title <- paste("PBMC Ref. Metadata", Sys.Date())

In [18]:
in_list <- as.list(meta_data$file.id)

In [19]:
out_list <- list(out_file)

In [20]:
uploadFiles(
    files = out_list,
    studySpaceId = study_space_uuid,
    title = title,
    inputFileIds = in_list,
    store = "project",
    doPrompt = FALSE
)

[1] "Cannot determine the current notebook."
[1] "1) /home/jupyter/scRNA-Reference-IH-A/00-R_Sample_Selection.ipynb"
[1] "2) /home/jupyter/examples/Visualization_apps/dash/save_visualization_app_example.ipynb"
[1] "3) /home/jupyter/examples/R/XX-R Tips and tricks.ipynb"


Please select (1-3)  1


In [21]:
sessionInfo()

R version 4.3.2 (2023-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.25.so;  LAPACK version 3.11.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] purrr_1.0.2 dplyr_1.1.4 hise_2.16.0

loaded via a namespace (and not attached):
 [1] crayon_1.5.2     vctrs_0.6.5      httr_1.4.7       cli_3.6.2       
 [5] rlang_1.1.3      stringi_1.8.3    generics_0.1.3   assertthat_0.2.1
 [9] jsonlite_1.8.8   glue_1.7.0       RCurl_1.98-1.14  plyr_1.8.9      
[13] htmlt