---
title: "01_integration_Seurat.v5_47_samples"
output: html_document
date: "2024-09-18"
---

```{r}
library(Seurat)
```

```{r}
# Locate every per-sample Seurat object stored as an .rds file.
file_paths <- list.files(
  path = "./data/individual_seurat_objs/",
  pattern = "\\.rds$",
  full.names = TRUE
)

# Sort so that the sample order (and the obj<i> numbering) is deterministic.
file_paths <- sort(file_paths)

# Fail early with a clear message instead of a confusing error at merge time.
if (length(file_paths) == 0) {
  stop(
    "No .rds files found in ./data/individual_seurat_objs/",
    call. = FALSE
  )
}

# Read all objects into `ob.list`; the merge step below consumes this list.
# The list is left unnamed on purpose so that merge() sees a plain list.
ob.list <- lapply(file_paths, readRDS)

# Keep the obj1 ... objN bindings that this chunk has always created, so any
# code that refers to them by name keeps working. These bindings point at the
# same objects as `ob.list`; nothing is read from disk twice.
for (i in seq_along(ob.list)) {
  assign(paste0("obj", i), ob.list[[i]])
}
```

```{r}
# Merge the per-sample objects, run the standard (unintegrated) pipeline,
# then integrate the layers with RPCA and re-cluster on the integrated space.

# `ob.list` must hold the per-sample Seurat objects. If it has not been
# created, rebuild it from the obj1 ... objN variables made by the loading
# chunk (one per entry of `file_paths`).
if (!exists("ob.list")) {
  ob.list <- unname(mget(paste0("obj", seq_along(file_paths))))
}

# Merging and integrating need at least two objects; `ob.list[-1]` is used
# instead of `2:length(ob.list)`, which would count backwards for length 1.
stopifnot(length(ob.list) >= 2)

# Make sure the output directories exist before anything is saved.
dir.create("./data/processed", recursive = TRUE, showWarnings = FALSE)
dir.create("./data/integrated", recursive = TRUE, showWarnings = FALSE)

# Merge the objects
obj <- merge(x = ob.list[[1]], y = ob.list[-1])
class(obj[["RNA"]])
# NOTE(review): the file names below say "132" while the notebook title and
# the integrated output say "47" - confirm which sample count is correct.
saveRDS(obj, "./data/processed/obj.merged.132.rds")

# Set the default assay to RNA to avoid SCT-related issues
DefaultAssay(obj) <- "RNA"

# Perform the standard Seurat pipeline on the merged, unintegrated data
obj <- NormalizeData(obj)
obj <- FindVariableFeatures(obj)
obj <- ScaleData(obj)
obj <- RunPCA(obj)
obj <- FindNeighbors(obj)
obj <- FindClusters(
  obj,
  resolution = 0.1,
  cluster.name = "unintegrated_clusters"
)
obj <- RunUMAP(obj, dims = 1:10)
saveRDS(obj, "./data/processed/obj.unintegrated.132.rds")

# Integration
# Raise the size limit for globals exported by the future framework
# (100 * 1024^3 bytes = 100 GiB).
options(future.globals.maxSize = 100 * 1024^3)
obj <- IntegrateLayers(
  object = obj,
  method = RPCAIntegration,
  orig.reduction = "pca",
  new.reduction = "integrated.rpca",
  verbose = FALSE
)

# Collapse the per-sample layers of the RNA assay back into single layers.
obj[["RNA"]] <- JoinLayers(obj[["RNA"]])

# Neighbors, clusters and UMAP are recomputed on the integrated reduction.
obj <- FindNeighbors(obj, reduction = "integrated.rpca", dims = 1:30)
obj <- FindClusters(obj, resolution = c(5.1), cluster.name = "rpca_clusters")
obj <- RunUMAP(
  obj,
  reduction = "integrated.rpca",
  dims = 1:30,
  reduction.name = "umap.rpca"
)

obj
saveRDS(obj, "./data/integrated/47.integrated.rds")
```


```{r, cell type annotation after prediction and cross-validation}
# Annotate every rPCA cluster with a major cell type and copy that label to
# the per-cell metadata column `celltype_C`.

# Use the rPCA clustering as the active identity.
Idents(obj) <- "rpca_clusters"

# Cell type -> cluster IDs. The order matters: labels are applied top to
# bottom, so when a cluster is listed twice the LATER entry wins.
# NOTE(review): clusters 82 (Epithelium / Erythroid), 91 and 102
# (HPSC / Erythroid) and 43 (Treg committed naive CD4 T / memory CD8 T) are
# listed under two labels. As written they end up Erythroid and memory CD8 T,
# and "Treg committed naive CD4 T" is never kept - confirm the intended
# assignment.
# NOTE(review): clusters 77, 81, 104, 105 and 107 are not listed anywhere and
# keep their numeric ID as label - confirm this is intended.
celltype_clusters <- list(
  "CD14hi Mono" = c(10, 19, 75, 76, 60, 97, 30, 24, 79, 42, 44, 20, 85, 11, 99),
  "CD16hi Mono" = c(34),
  "Mφ" = c(37, 101, 51, 67, 36),
  "cDC" = c(96, 21),
  "pDC" = c(94, 45),
  "OC" = c(90),
  "plasma" = c(87, 56),
  "immature B" = c(17, 57),
  "naive B" = c(0),
  "memory B" = c(12, 80, 84),
  "CLP" = c(74, 93),
  "CMP" = c(46, 89, 62),
  "HPSC" = c(73, 32, 91, 102, 106, 71),
  "Epithelium" = c(50, 35, 33, 53, 78, 88, 108, 38, 61, 55, 103, 92, 82, 69),
  "CAR" = c(59, 100),
  "OB" = c(52),
  "Fibroblast" = c(72),
  "Erythroid" = c(70, 82, 91, 102),
  "Pericyte" = c(98, 40),
  "Endothelium" = c(65),
  "CD16hi NK" = c(4),
  "naive CD4 T" = c(15, 18, 22, 26, 7, 83, 9),
  "memory/helper CD4 T" = c(14, 27, 3, 31, 8),
  "exhausting CD8 T" = c(1, 13, 2, 23, 25, 28, 5, 58, 6, 64, 95),
  "CD8 Teff" = c(16, 39, 49),
  "CD56hi NK" = c(48),
  "CD4 Treg" = c(29, 66, 86),
  "CD8 Tex" = c(47),
  "Treg committed naive CD4 T" = c(43),
  "memory CD8 T" = c(41, 43, 68),
  "CD16int CD56int NK" = c(54),
  "naive CD8 T" = c(63)
)

# Lookup table: one row per cluster ID; the label defaults to the ID itself.
celltype_C <- data.frame(ClusterID = 0:108, celltype_C = 0:108)
for (label in names(celltype_clusters)) {
  in_label <- celltype_C$ClusterID %in% celltype_clusters[[label]]
  celltype_C[in_label, 2] <- label
}

# Surface annotation problems instead of overwriting / skipping silently.
listed_ids <- unlist(celltype_clusters, use.names = FALSE)
dup_ids <- sort(unique(listed_ids[duplicated(listed_ids)]))
if (length(dup_ids) > 0) {
  warning(
    "Clusters listed under more than one cell type (last label wins): ",
    paste(dup_ids, collapse = ", "),
    call. = FALSE
  )
}
unlabelled_ids <- setdiff(celltype_C$ClusterID, listed_ids)
if (length(unlabelled_ids) > 0) {
  warning(
    "Clusters without a cell type (numeric ID kept as label): ",
    paste(unlabelled_ids, collapse = ", "),
    call. = FALSE
  )
}

head(celltype_C)
celltype_C
table(celltype_C$celltype_C)

# Transfer the labels to the cells. Cluster IDs are compared as text because
# `rpca_clusters` is matched against the integer IDs of the lookup table;
# cells whose cluster is not in the table get the string "NA".
cell_cluster <- as.character(obj@meta.data$rpca_clusters)
cell_label <- celltype_C$celltype_C[
  match(cell_cluster, as.character(celltype_C$ClusterID))
]
cell_label[is.na(cell_label)] <- "NA"
obj@meta.data$celltype_C <- cell_label
table(obj@meta.data$celltype_C)

# Overwrites the integrated object saved earlier, now including annotation.
saveRDS(obj, "./data/integrated/47.integrated.rds")
```

```{r, convert to .h5ad file format for analysis in Scanpy and Dynamo}
# Export the annotated object as an AnnData (.h5ad) file.
library(sceasy)
library(reticulate)

# Export from the RNA assay.
DefaultAssay(obj) <- "RNA"

# Convert the RNA assay to the "Assay" class before handing it to sceasy.
# NOTE(review): presumably needed because sceasy does not read the Seurat v5
# assay class directly - confirm against the installed sceasy version.
obj[["RNA"]] <- as(obj[["RNA"]], "Assay")

sceasy::convertFormat(
  obj,
  from = "seurat",
  to = "anndata",
  outFile = "./data/integrated/47.integrated.h5ad"
)
```