################################################################################ ### Input: SBcounts.h5 (output of spatial-count.jl) ### Input: cb_whitelist.txt (newline-delimited list of cell barcodes to position) ### Output: coords.csv (placements+metadata) ################################################################################ suppressMessages(source("helpers.R")) # Load arguments library(optparse) arguments <- OptionParser( usage = "Usage: Rscript positioning.R SBcounts_path CBwhitelist_path output_path [options]", option_list = list( make_option(c("-k", "--knn"), type="integer", default=51, help = "Number of bead neighbors used to compute eps [default: %default]", metavar="K") ) ) %>% parse_args(positional_arguments=3) sb_path <- arguments$args[[1]] cb_path <- arguments$args[[2]] out_path <- arguments$args[[3]] k <- arguments$options$knn rm(arguments) # Print arguments print(g("sb_path: {sb_path}")) print(g("cb_path: {cb_path}")) print(g("out_path: {out_path}")) print(g("k: {k}")) # Check input arguments if (!dir.exists(out_path)) {dir.create(out_path, recursive=TRUE)} stopifnot(filter(rhdf5::h5ls(sb_path), group=="/matrix")$name == c("cb_index", "reads", "sb_index", "umi")) # Load the spatial barcode count matrix f <- function(p){return(rhdf5::h5read(sb_path, p))} dt <- ReadSpatialMatrix(f) metadata <- ReadSpatialMetadata(f) print(g("{add.commas(sum(dt$reads))} spatial barcode reads loaded")) # load the CB whitelist cb_whitelist <- readLines(cb_path) remap <- determine_remap_10X_CB(cb_whitelist, dt) if (remap) { cb_whitelist %<>% remap_10X_CB } metadata$SB_info$remap_10X_CB <- remap stopifnot(class(cb_whitelist) == "character") stopifnot(!duplicated(cb_whitelist)) stopifnot(uniqueN(nchar(cb_whitelist)) == 1) stopifnot(map_lgl(strsplit(cb_whitelist, ""), ~all(. %in% c("A","C","G","T")))) print(g("{len(cb_whitelist)} cell barcodes loaded")) ### Fuzzy matching ############################################################# print("Performing fuzzy cell-barcode matching") setnames(dt, "cb", "cb_fuzzy") df <- data.table(cb=cb_whitelist)[, .(cb_fuzzy=listHD1neighbors(cb), match="fuzzy"), cb ][!cb_fuzzy %in% cb_whitelist ][cb_fuzzy %in% dt$cb_fuzzy] df[, `:=`(cb = ifelse(.N > 1, NA, cb), match = ifelse(.N > 1, "ambig", match)), cb_fuzzy] df %<>% unique df <- rbindlist(list(df, data.table(cb=cb_whitelist, cb_fuzzy=cb_whitelist, match="exact")[cb_fuzzy %in% dt$cb_fuzzy])) stopifnot(df$cb_fuzzy %in% dt$cb_fuzzy) df[, cb_fuzzy := factor(cb_fuzzy, levels = levels(dt$cb_fuzzy))] stopifnot(na.omit(unique(df$cb)) %in% cb_whitelist) df[, cb := factor(cb, levels = cb_whitelist)] stopifnot(df$match %in% c("exact", "fuzzy", "ambig"), !is.na(df$match)) df[, match := factor(match, levels = c("exact", "fuzzy", "ambig"))] stopifnot(!any(duplicated(df$cb_fuzzy)), !is.na(df$cb_fuzzy)) stopifnot(!xor(df$match=="ambig", is.na(df$cb))) stopifnot(!xor(df$match=="exact", df$cb_fuzzy %in% cb_whitelist)) stopifnot(levels(df$cb_fuzzy) == levels(dt$cb_fuzzy)) stopifnot(is.null(key(dt)), is.null(key(df)), is.null(indices(dt)), is.null(indices(df))) # Perform the match dt <- merge(dt, df, by = "cb_fuzzy", all.x = TRUE, sort = FALSE) stopifnot(is.null(key(dt)), is.null(indices(dt))) # Record metadata metadata$CB_matching <- dt[, .(reads=sum(reads)), match][order(-reads)] %>% {setNames(.$reads, .$match %>% as.character %>% replace_na("none"))} stopifnot(sum(dt$reads) == sum(metadata$CB_matching)) # Clean up dt[, match := NULL] setnames(dt, "cb_fuzzy", "cb_raw") setcolorder(dt, c("cb_raw", "cb", "umi", "sb", "reads")) rm(df) ; invisible(gc()) # Collapse post-matched barcodes dt <- rbindlist(list(dt[is.na(cb)], dt[!is.na(cb), .(reads=sum(reads)), .(cb,umi,sb)]), fill = TRUE) ### Load puck ################################################################## # Load the puck print("Loading the puck") res <- ReadPuck(f) puckdf <- res[[1]] metadata$puck_info %<>% c(res[[2]]) rm(res) ; invisible(gc()) # Compute eps scale using the kth neighbor eps <- RANN::nn2(data = puckdf[, .(x,y)], query = puckdf[sample(.N, 10000), .(x,y)], k = k)$nn.dists[,k] %>% median print(g("eps: {eps}")) # Filter reads with a low-quality spatial barcode print("Removing low-quality spatial barcodes") dt[, m := sb %in% puckdf$sb] metadata$SB_filtering %<>% c(reads_lqsb=dt[m == FALSE, sum(reads)]) dt <- dt[m == TRUE] dt[, m := NULL] # Page 4 plot_SBlibrary(dt, f) %>% make.pdf(file.path(out_path, "SBlibrary.pdf"), 7, 8) # Page 5 plot_SBplot(dt, puckdf) %>% make.pdf(file.path(out_path, "SBplot.pdf"), 7, 8) # Delete cell barcodes for cells that were not called print("Removing non-whitelist cells") metadata$SB_filtering %<>% c(reads_nocb = dt[is.na(cb), sum(reads)]) metadata$SB_info$UMI_pct_in_called_cells <- round(nrow(dt[!is.na(cb)])/nrow(dt)*100, digits=2) %>% paste0("%") metadata$SB_info$sequencing_saturation <- round((1 - nrow(dt) / sum(dt$reads)) * 100, 2) %>% paste0("%") dt <- dt[!is.na(cb)] dt[, cb_raw := NULL] invisible(gc()) stopifnot(nrow(dt) == nrow(unique(dt[,.(cb,umi,sb)]))) # Remove chimeric UMIs print("Removing chimeras") dt[, m := reads == max(reads) & sum(reads == max(reads)) == 1, .(cb, umi)] metadata$SB_filtering %<>% c(reads_chimeric = dt[m==FALSE, sum(reads)]) dt <- dt[m == TRUE] dt[, m := NULL] # Aggregate reads into UMIs print("Counting UMIs") metadata$SB_filtering %<>% c(reads_final = dt[,sum(reads)]) dt <- dt[, .(umi=.N), .(cb, sb)] metadata$SB_filtering %<>% c(UMIs_final = dt[, sum(umi)]) invisible(gc()) # Merge spatial barcode count matrix with puck coordinates print("Joining puck coordinates") stopifnot(dt$sb %in% puckdf$sb) stopifnot(is.factor(dt$sb), is.factor(puckdf$sb)) dt <- merge(dt, puckdf, by = "sb", all = FALSE, sort = FALSE)[order(cb, -umi)] setcolorder(dt, c("cb","umi","sb","x","y")) metadata$puck_info$umi_final <- map2_int(metadata$puck_info$puck_boundaries %>% head(-1), metadata$puck_info$puck_boundaries %>% tail(-1), ~dt[x >= .x & x <= .y, sum(umi)]) invisible(gc()) # Page 6 plot_SBmetrics(metadata) %>% make.pdf(file.path(out_path, "SBmetrics.pdf"), 7, 8) # Write intermediate output print("Writing intermediate matrix") fwrite(dt, file.path(out_path, "matrix.csv.gz"), quote=FALSE, sep=",", row.names=FALSE, col.names=TRUE, compress="gzip") metadata %>% map(as.list) %>% jsonlite::toJSON(pretty=TRUE) %>% writeLines(file.path(out_path, "spatial_metadata.json")) dt[, sb := NULL] stopifnot(dt$cb %in% cb_whitelist) stopifnot(dt$umi > 0) stopifnot(!any(is.na(dt))) stopifnot(sort(names(dt)) == c("cb", "umi", "x", "y")) ### DBSCAN ##################################################################### data.list <- split(dt, by="cb", drop=TRUE, keep.by=FALSE) for (dl in data.list) {stopifnot(nrow(dl) > 0, dl$umi > 0)} rm(dt) ; invisible(gc()) print(g("Running positioning on {len(data.list)} cells")) # prepare for positioning library(future) library(furrr) options(future.globals.maxSize = 1024 * 1024 * 1024) myoptions <- furrr::furrr_options(packages=c("data.table"), seed=TRUE, scheduling=1) future::plan(future::multisession, workers=parallelly::availableCores()) mydbscan <- function(dl, eps, minPts) {dbscan::dbscan(x = dl[,.(x,y)], eps = eps, minPts = minPts, weights = dl$umi, borderPoints = TRUE)$cluster} ### Run DBSCAN optimization ### ms <- 1L # minPts step size mranges <- furrr::future_map(data.list, function(dl) { # base case v2 <- mydbscan(dl, eps, 1L * ms) if (max(v2) == 0) {return(data.table(i2=0L, i1=0L))} if (max(v2) == 1) {i2=0L} # loop i <- 1L while (max(v2) > 0) { i <- i + 1L v1 <- v2 v2 <- mydbscan(dl, eps, i * ms) if (max(v1) > 1 & max(v2) <= 1) {i2=i-1L} } # i2 is the index at the highest minPts that produces DBSCAN=2+ # i1 is the index at the highest minPts that produces DBSCAN=1 return(data.table(i2=i2, i1=i-1L)) }, .options = myoptions) %>% rbindlist stopifnot(nrow(mranges) == len(data.list)) # TODO: Walk it back by filtering nearby DBSCAN=2 ### Global DBSCAN ### # Compute minPts that places the greatest number of cells i_opt <- IRanges::IRanges(start=mranges$i2+1L, end=mranges$i1) %>% IRanges::coverage() %>% as.integer %>% {which(.==max(.))} %>% tail(1) minPts <- i_opt * ms print(g("Global optimum minPts: {minPts}")) # Rerun DBSCAN with the optimal minPts for (i in seq_along(data.list)) { data.list[[i]][, cluster := mydbscan(data.list[[i]], eps, minPts)] } # Assign the centroid via a weighted mean coords_global <- imap(data.list, function(dl, cb){ ret <- dl[,.(cb=cb, umi=sum(umi), beads=.N, max=max(umi), clusters=max(cluster))] # Add DBSCAN=0 data ret0 <- dl[cluster==0, .(umi0=sum(umi), beads0=.N, max0=max(umi, default=0))] ret[, names(ret0) := ret0] # Add DBSCAN=1 data ret1 <- dl[cluster==1, .(x1=weighted.mean(x, umi), y1=weighted.mean(y, umi), rmsd1=sqrt(weighted.mean((x-weighted.mean(x,umi))^2+(y-weighted.mean(y,umi))^2, umi)), umi1=sum(umi), beads1=.N, max1=max(umi, default=0), h1=h_index(umi))] ret[, names(ret1) := ret1] # Add DBSCAN=2 data ret2 <- dl[cluster==2, .(x2=weighted.mean(x, umi), y2=weighted.mean(y, umi), rmsd2=sqrt(weighted.mean((x-weighted.mean(x,umi))^2+(y-weighted.mean(y,umi))^2, umi)), umi2=sum(umi), beads2=.N, max2=max(umi, default=0), h2=h_index(umi))] ret[, names(ret2) := ret2] return(ret) }) %>% rbindlist(use.names=TRUE, fill=TRUE) # ignore.attr=TRUE coords_global[, `:=`(eps=eps, minPts=minPts)] coords_global[clusters==1, `:=`(x=x1, y=y1)] print(g("Placed: {round(coords_global[,sum(clusters==1)/.N]*100, 2)}%")) # Plots plot_gdbscan_opt(coords_global, mranges) %>% make.pdf(file.path(out_path, "GDBSCANopt.pdf"), 7, 8) plot_gdbscan_1(coords_global) %>% make.pdf(file.path(out_path, "GDBSCAN1.pdf"), 7, 8) plot_gdbscan_2(coords_global) %>% make.pdf(file.path(out_path, "GDBSCAN2.pdf"), 7, 8) # Save coords fwrite(coords_global, file.path(out_path, "coords.csv")) ### Dynamic DBSCAN ### # Rerun DBSCAN at the computed minPts values for (i in seq_along(data.list)) { data.list[[i]][, cluster2 := mydbscan(data.list[[i]], eps, mranges[i,i2])] data.list[[i]][, cluster1 := mydbscan(data.list[[i]], eps, mranges[i,i1])] } # Assign the centroid via a weighted mean coords_dynamic <- imap(data.list, function(dl, cb) { ret <- dl[,.(cb=cb, umi=sum(umi), beads=.N)] # Position the cell, using the highest minPts that produces DBSCAN=1 if (max(dl$cluster1) == 1) { s <- dl[cluster1 == 1, .(x=weighted.mean(x, umi), y=weighted.mean(y, umi), umi1s=sum(umi), beads1s=.N, h1s=h_index(umi))] } else { s <- data.table(x=NA_real_, y=NA_real_, umi1s=NA_real_, beads1s=NA_integer_, h1s=NA_integer_) } ret[, names(s) := s] # Compute the score, using the highest minPts that produces DBSCAN=2 if (max(dl$cluster2) >= 2) { s <- dl[, .(x=weighted.mean(x, umi), y=weighted.mean(y, umi), umi=sum(umi), beads=.N, h=h_index(umi) ), cluster2][cluster2 > 0][order(-umi, cluster2)] setcolorder(s, c("x", "y", "umi", "beads", "h", "cluster2")) ret[, c("x1","y1","umi1","beads1","h1","cluster1") := s[1]] # Highest UMI DBSCAN cluster ret[, c("x2","y2","umi2","beads2","h2","cluster2") := s[2]] # Second-highest UMI DBSCAN cluster } else if (max(dl$cluster2) == 1) { s <- dl[cluster2 == 1, .(x1=weighted.mean(x, umi), y1=weighted.mean(y, umi), umi1=sum(umi), beads1=.N, h1=h_index(umi), cluster1=1L)] ret[, names(s) := s] s <- data.table(x2=NA_real_, y2=NA_real_, umi2=NA_real_, beads2=NA_integer_, h2=NA_integer_, cluster2=NA_integer_) ret[, names(s) := s] } else { s <- data.table(x1=NA_real_, y1=NA_real_, umi1=NA_real_, beads1=NA_integer_, h1=NA_integer_, cluster1=NA_integer_) ret[, names(s) := s] s <- data.table(x2=NA_real_, y2=NA_real_, umi2=NA_real_, beads2=NA_integer_, h2=NA_integer_, cluster2=NA_integer_) ret[, names(s) := s] } }) %>% rbindlist coords_dynamic[, c("eps", "minPts2", "minPts1") := data.table(eps=eps, minPts2=mranges$i2*ms, minPts1=mranges$i1*ms)] # Plots plot_ddbscan_xy(coords_dynamic) %>% make.pdf(file.path(out_path, "DDBSCANxy.pdf"), 7, 8) # Save coords fwrite(coords_dynamic, file.path(out_path, "coords2.csv")) ### Cell plots ### plot_global_cellplots(data.list) %>% make.pdf(file.path(out_path, "DBSCAN.pdf"), 7, 8) #dynamic_plots <- plot_dynamic_cellplots(data.list, coords_dynamic) ### Final check ### stopifnot(coords_global$cb_index == coords_dynamic$cb_index) stopifnot(file.exists(file.path(out_path, c("coords.csv", "coords2.csv"))))