---
title: "Preprocess Covid data"
author: "Åsa Björklund  &  Paulo Czarnewski"
date: '`r format(Sys.Date(), "%B %d, %Y")`'
output:
  html_document:
    self_contained: true
    highlight: tango
    df_print: paged
    toc: yes
    toc_float:
      collapsed: false
      smooth_scroll: true
    toc_depth: 3
    keep_md: yes
    fig_caption: true
  html_notebook:
    self_contained: true
    highlight: tango
    df_print: paged
    toc: yes
    toc_float:
      collapsed: false
      smooth_scroll: true
    toc_depth: 3
editor_options:
  chunk_output_type: console
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(message=FALSE, warning=FALSE, result='hold',fig.width=12,tidy=TRUE)
knitr::opts_knit$set(progress=TRUE,verbose=TRUE)
```

## Download data

```{r}
# download to folder full:
# ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE149nnn/GSE149689/matrix/GSE149689_series_matrix.txt.gz
# ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE149nnn/GSE149689/suppl/GSE149689_barcodes.tsv.gz
# ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE149nnn/GSE149689/suppl/GSE149689_features.tsv.gz
# ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE149nnn/GSE149689/suppl/GSE149689_matrix.mtx.gz
```

## Load all data

```{r}
library(Matrix)
library(hdf5r)
library(rhdf5)
```

Read the matrix

```{r}

sm <- as(Matrix::readMM("full/GSE149689_matrix.mtx.gz"),Class = "dgCMatrix")
sm@Dimnames[[1]] <- as.character(read.delim("full/GSE149689_features.tsv.gz",header = F)[,2])
sm@Dimnames[[2]] <- as.character(read.delim("full/GSE149689_barcodes.tsv.gz",header = F)[,1])
dim(sm)


#see sample metadata
meta <- read.delim("full/GSE149689_series_matrix.txt.gz",skip = 54)
t(meta[c(7,8,9,10,11,12),])

#check how many cells per sample.
t = table(sub(".*-","",colnames(sm)))[as.character(1:length(table(sub(".*-","",colnames(sm)))) )]
cbind(t,colnames(meta)[2:21],t(unname(meta[10,2:21])))

```

Have shift in metadata for the Normal/Flu samples.

Severe cases:
1,9,10,15,16,17
M,F,M,F,F,M

Healthy:
5,13,14,19
F,F,F,M

Only 19 is male.


Select severe covid samples with many cells. 15,16 probably best.


Filter for at least 400 umis per cell and recount, have cells with only 15 umis...

```{r}
nC = colSums(sm)
range(nC)
dim(sm)
sm = sm[,nC>400]
dim(sm)
# removes 708 cells. Mainly from samples 8,10,17.

t2 = table(sub(".*-","",colnames(sm)))[as.character(1:length(table(sub(".*-","",colnames(sm)))) )]
data.frame(count=as.vector(t2), name=colnames(meta)[2:21])

```

## Select samples

Select all severe and normal samples and create matrices. Skip sample 9,10, less than 1500 cells. 

Subsample to 1500 cells per sample.

```{r}
samples_use <- c(c(1,15,16,17),c(5,13,14,19))
sum(table(sub(".*-","",colnames(sm)))[as.character(samples_use)])


sel <- unlist(lapply(samples_use,function(x){
  set.seed(1);x <- sample(size = 1500,grep(paste0("-",x,"$"),colnames(sm),value = T) )
}))
sm2 <- sm[,sel]
table(sub(".*-","",colnames(sm2)))[as.character(samples_use)]

dim(sm2)

```


## Write as h5

```{r, eval=FALSE}

# need to add in gene id column as well.
feats = read.delim("full/GSE149689_features.tsv.gz",header = F)


for(i in c(paste0("nCoV_PBMC_",c(1,15,16,17)), paste0("Normal_PBMC_",c(5,13,14,19)) )) {
  message(paste0("PROCESSING SAMPLE:    ",i) )
  spn <- sub(".*_","",i)
  fn <- paste0("sub/",i,".h5")
  group <- grep(paste0("-",spn,"$"),colnames(sm2),value = T)
  sm3 <-  sm2[,group]
  dim(sm3)
  
  # file.remove(fn)
  rhdf5::h5createFile(fn)
  rhdf5::h5createGroup(fn,"matrix")
  
  rhdf5::h5write(sm3@Dimnames[[2]],fn,"matrix/barcodes")
  rhdf5::h5write(sm3@x,fn,"matrix/data")
  rhdf5::h5write(sm3@i,fn,"matrix/indices")
  rhdf5::h5write(sm3@p,fn,"matrix/indptr")
  rhdf5::h5write(sm3@Dim,fn,"matrix/shape")
  
  rhdf5::h5createGroup(fn,"matrix/features")
  rhdf5::h5write(sm3@Dimnames[[1]]
          ,fn,"matrix/features/name")
  rhdf5::h5write(sm3@Dimnames[[1]]
          ,fn,"matrix/features/_all_tag_keys")
  rhdf5::h5write(feats[,3],
          fn,"matrix/features/feature_type")
  rhdf5::h5write(feats[,1],
          fn,"matrix/features/id")
  rhdf5::h5write(rep("GRCh38",nrow(sm3))
          ,fn,"matrix/features/genome")
  
  rhdf5::h5ls(fn)
  
  nd <- Seurat::Read10X_h5(fn)
  print(sum(!nd == sm3))
  message("\n\n")
}
```

## Check quality of all


```{r}
library(Seurat)
sid = sub(".*-","",colnames(sm2))
m = data.frame(sample = paste0("sm",sid), type = ifelse(sid %in% c(5,13,14,19), "Normal","Covid"))
rownames(m) = colnames(sm2)
m$sample2 = paste(m$type, m$sample, sep="_")
m$sex = ifelse(sid %in% c(1,17,19), "M","F")
sdata = CreateSeuratObject(sm2, meta.data = m)


#pdf("sample_qualities.pdf")

VlnPlot(sdata, features = c("nFeature_RNA", "nCount_RNA"), group.by = "sample2", pt.size = 0)
#dev.off()

```

Most samples are females, only males are 1,17,19

Sample 16 too low quality.

Use covid 1,15,17 and normal 13,14,19 for the exercises.

Gives 1 male in Normal, and 2 in covid.


```{r}
sessionInfo()
```