--- title: "Pre-Processing Seurat" author: "Roger Casals" date: 2020-12-01T21:13:14-05:00 categories: ["R"] tags: ["R Markdown", "single cell RNA", "Seurat"] ---
Primer de tot, carreguem les dades. Del SCC
Carreguem les diferents llibreries
aa <- CreateSeuratObject(counts=expression_data, project="GSE123813", metadata=metadata2)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
bbb <- aa
#Ordenem les dades per tal d'obtenir les identificacions correctes.
metadata2 <- metadata2[order(metadata2$cell.id),]
aa <- aa[, order(colnames(aa))]
aa@meta.data <- aa@meta.data[order(row.names(aa@meta.data)), ]
aa@meta.data$cluster <- metadata2$cluster
aa@meta.data$treatment <- metadata2$treatment
table(aa@meta.data$cluster)
##
## CD8_act CD8_eff CD8_ex CD8_ex_act CD8_mem CD8_naive Naive
## 1272 1032 4149 615 5333 3159 2501
## Tfh Th17 Treg
## 3085 1783 3087
ENS CENTRAREM AMB LES CD4 perquè hem vist a la teoria que les “naive” es poden diferenciar en Treg o Th17 o Tfh
#Obtenim els percentatges de gens mitocontrials i ribosòmics, per saber si les cèl·lules estan estressades.
aa <- PercentageFeatureSet(aa, "^MT-", col.name="percent_mito")
aa <- PercentageFeatureSet(aa, "^RP[SL]", col.name = "percent_ribo")
aa <- PercentageFeatureSet(aa, "^HB[^(P)]", col.name = "percent_hb")
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb")
VlnPlot(aa, group.by = "orig.ident", features = feats, pt.size = 0.1, ncol = 3) +
NoLegend()
FeatureScatter(aa, "nCount_RNA", "nFeature_RNA", group.by = "orig.ident", pt.size = 0.5)
CD4tots <- aa
#Agafem les cèl·lules que són necessàries per l'anàlisi.
CD4tots <- subset(CD4tots, cluster %in% c( "Th17", "Tfh", "Treg", "Naive") & treatment %in% c("post", "pre"))
CD4tots <- subset(CD4tots, subset=nFeature_RNA < 4000 & percent_mito < 8 )
table(CD4tots@meta.data$cluster)
##
## Naive Tfh Th17 Treg
## 2223 2940 1538 2736
#Normalitzem les dades
CD4tots <- NormalizeData(CD4tots)
#Busquem els gens que contribueixen més significativament a la variabilitat d'una matriu d'expressió gènica.
#En seleccionem 2000 per defecte.
CD4tots <- FindVariableFeatures(CD4tots, selection.method = "vst", nfeatures = 2000)
#Escalem les dades mitjançant les funció que ens proporciona Seurat.
CD4tots <- ScaleData(CD4tots)
## Centering and scaling data matrix
#Reduïm dimensió amb el RunPCA (Utilitzem el número 16 ja que l'utilitzen al experiment.)
CD4tots <- RunPCA(CD4tots, npcs = 16)
## PC_ 1
## Positive: HSPA1A, HSPB1, HSP90AA1, HSP90AB1, FKBP4, HSPE1, HSPD1, TNFRSF18, HSPH1, HSPA8
## AHSA1, DNAJA1, CXCL13, HSPA1B, CACYBP, BST2, NINJ1, PKM, PRDX1, NUDC
## CHORDC1, GNA15, TCP1, FABP5, DUSP4, TNFRSF4, DOK2, STIP1, IL32, MTHFD2
## Negative: GPR183, FTH1, S1PR1, ANXA1, IL7R, GLIPR1, KLF2, ZFP36, PIK3R1, CXCR4
## ABLIM1, KLF3, ZFP36L2, CD55, LTB, FAM65B, SELL, SESN3, GIMAP7, MGAT4A
## PLAC8, TC2N, LEF1, RGCC, CMTM8, CCR7, CMSS1, PASK, GABARAPL1, TRAT1
## PC_ 2
## Positive: CTSC, S100A4, LGALS3, RRM2, S100A6, MKI67, UBE2C, BIRC5, CCNA2, RAC2
## JAML, GLO1, CD3G, PFN1, GZMA, RPS27L, CDCA5, CD52, ABRACL, OSTF1
## TYMS, HLA-DQA2, LRRN3, CAPG, TK1, GTSE1, BATF, CTSH, TOP2A, AC092580.4
## Negative: NMB, IFITM1, TUBA4A, CXCL13, HSPD1, DUSP2, CNIH1, HSP90AA1, HSP90AB1, HSPB1
## HSPA1A, FKBP4, GNG4, IGFL2, CD200, G0S2, TNFRSF4, ZNF331, HSPE1, ARID5A
## BAG3, TUBB4B, RGCC, SESN3, EZR, CCR7, STAT3, TRABD2A, CXCR4, JUNB
## PC_ 3
## Positive: FOXP3, LAYN, CD27, IL1R2, LAIR2, TBC1D4, CARD16, IKZF2, SAT1, CCR8
## IL32, TIGIT, RTKN2, ACP5, IL2RA, CTSC, GBP5, TNFRSF1B, CST7, HTATIP2
## CTLA4, ICOS, BATF, IL2RB, FANK1, CD7, MALT1, GCNT1, LTB, FCRL3
## Negative: TRAT1, ANXA1, RRM2, TUBA1B, ALOX5AP, CXCR4, UBE2C, RPLP0, BHLHE40, MKI67
## TOP2A, CDCA5, ZFP36L2, CCNA2, TYMS, BIRC5, GNG4, AC092580.4, GTSE1, CXCL13
## GIMAP7, ASF1B, HIST1H4C, PKMYT1, IGFBP4, UHRF1, KLRB1, FABP5, STMN1, CCR7
## PC_ 4
## Positive: KLRB1, LRRN3, AC092580.4, CTSH, JAML, SMIM3, IL26, GZMB, RBPJ, GPR15
## LGALS3, GLO1, COTL1, RP11-81H14.2, S100A4, SLC1A4, ANKRD28, BHLHE40, ANXA6, ID2
## OSTF1, ARL3, NMRK1, ALOX5AP, LMO4, MSC, S100A6, CD82, CAPG, ARL4C
## Negative: RRM2, BIRC5, MKI67, UBE2C, CDCA5, CCNA2, GTSE1, TYMS, UHRF1, PKMYT1
## ASF1B, AURKB, ZFP36, EZR, TOP2A, ZWINT, TK1, KIFC1, CDT1, CENPA
## ASPM, MYBL2, HIST1H1B, CDK1, SELL, HJURP, CKAP2L, FAM111B, KIF23, CDCA3
## PC_ 5
## Positive: CCR7, BCL2L1, RPLP0, BCL2A1, GRSF1, EBI3, ZFP36L1, REL, XCL1, ECE1
## BCAT1, TIFA, LY75, GPX4, TNFSF4, TRAF1, RPSA, SIAH2, RNF145, PGM2L1
## HTATIP2, PLAGL2, CCR4, NFKB1, CHN1, CHST7, GRAMD1A, LMNA, SLC28A3, THAP2
## Negative: CCL5, LAG3, CST7, PDCD1, IFITM2, IFITM3, NKG7, RALA, SLA2, OASL
## FTH1, SLC7A5, CXCR6, GNLY, CD38, HAVCR2, GZMB, PRF1, CLIC3, CCL4
## CTSW, AC069363.1, ADGRG1, SYTL2, JUN, GZMA, DOK2, VPS37B, ABCG1, FAM46C
#features = VariableFeatures(object = CD4tots))
#Utilitzem aquesta funció per trobar veïns propers mitjançant l'expressió gènica
CD4tots <- FindNeighbors(CD4tots, dims = 1:16)
## Computing nearest neighbor graph
## Computing SNN
#Ens quedem amb el cluster 0.3, ja que és el que millor s'adapta als subgrups que tenimm.
CD4tots <- FindClusters(CD4tots, resolution = c(0.3))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 9437
## Number of edges: 330439
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9106
## Number of communities: 8
## Elapsed time: 1 seconds
CD4tots <- RunUMAP(CD4tots, dims=1:16)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 12:23:55 UMAP embedding parameters a = 0.9922 b = 1.112
## 12:23:55 Read 9437 rows and found 16 numeric columns
## 12:23:55 Using Annoy for neighbor search, n_neighbors = 30
## 12:23:55 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 12:23:56 Writing NN index file to temp file C:\Users\NATLIA~1\AppData\Local\Temp\RtmpETOGXc\file4a7475fd41bd
## 12:23:56 Searching Annoy index using 1 thread, search_k = 3000
## 12:23:59 Annoy recall = 100%
## 12:24:01 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 12:24:03 Initializing from normalized Laplacian + noise (using irlba)
## 12:24:03 Commencing optimization for 500 epochs, with 399390 positive edges
## 12:24:24 Optimization finished
#Realitzem un DimPlot mitjançant per tal de que sigui etiquetat de diverses formes.
DimPlot(CD4tots, reduction = "umap", group.by = "treatment")
#Realitzem el plot per veure com estan ubicades les cèl·lules anotades en l'espai.
DimPlot(CD4tots, reduction="umap", group.by="cluster", label=T)
#Escollim quin serà el millor cluster.
#DimPlot(CD4tots, reduction = "umap", group.by = "RNA_snn_res.0.9", label = TRUE)
#DimPlot(CD4tots, reduction = "umap", group.by = "RNA_snn_res.0.7", label = TRUE)
#DimPlot(CD4tots, reduction = "umap", group.by = "RNA_snn_res.0.5", label = TRUE)
DimPlot(CD4tots, reduction = "umap", group.by = "RNA_snn_res.0.3", label = TRUE)
#DimPlot(CD4tots, reduction = "umap", group.by = "RNA_snn_res.0.1", label = TRUE)
S’ha escollit el cluster 0.3, ja que és el mateix que utilitzen a l’article, i el que engloba millor els diferents subtipus que tenim.
CD4tots <- RenameIdents(CD4tots, "0" = "Naive")
CD4tots <- RenameIdents(CD4tots, "1" = "Tfh")
CD4tots <- RenameIdents(CD4tots, "2" = "Treg")
CD4tots <- RenameIdents(CD4tots, "3" = "Th17")
CD4tots <- RenameIdents(CD4tots, "4" = "Treg")
#CD4tots <- RenameIdents(CD4tots, "5" = "Tfh")
#El número 5, són una barreja, no els identifiquem.
CD4tots <- RenameIdents(CD4tots, "6" = "Tfh")
CD4tots <- RenameIdents(CD4tots, "7" = "Th17")
Creem una nova columna a la metadata amb els “idents” d’aquests clusters
CD4tots@meta.data$idents <- Idents(CD4tots)
Realitzem un subset, identificant només els grups que hem pogut nombrar amb els clusters, i els representem per a veure’ls.
CD4totssubset <- subset(CD4tots, idents %in% c("Naive", "Treg", "Th17", "Tfh"))
plot1 <- DimPlot(CD4totssubset, reduction = "umap", group.by="treatment")
plot2 <- DimPlot(CD4totssubset, reduction = "umap", group.by="idents")
plot1 | plot2
Identifiquem els nous grups segons la seva condició de tractament, és a dir, si són “pre” o “post”, els separem mitjançant un guió baix.
CD4totssubset$celltype.cnd <- paste0(CD4totssubset$treatment, "_", CD4totssubset$idents)
DimPlot(CD4totssubset, reduction = "umap", group.by="celltype.cnd", label=T)
Identifiquem els Idents del cluster, com al que hem definit.
Idents(CD4totssubset) <- CD4totssubset$celltype.cnd
Calculem el percentatge de cada cèl·lula
prop.table(table(CD4totssubset@meta.data$celltype.cnd))*100
##
## post_Naive post_Tfh post_Th17 post_Treg pre_Naive pre_Tfh pre_Th17
## 12.433024 17.823948 2.296337 6.834336 14.991799 10.311646 12.815746
## pre_Treg
## 22.493166
table(CD4totssubset@meta.data$celltype.cnd)
##
## post_Naive post_Tfh post_Th17 post_Treg pre_Naive pre_Tfh pre_Th17
## 1137 1630 210 625 1371 943 1172
## pre_Treg
## 2057
CD4totssubsetpre <- subset(CD4totssubset, celltype.cnd %in% c("pre_Naive", "pre_Treg", "pre_Th17", "pre_Tfh"))
CD4totssubsetpost <- subset(CD4totssubset, celltype.cnd %in% c("post_Naive", "post_Treg", "post_Th17", "post_Tfh"))
library(Seurat)
library(SeuratDisk)
## Registered S3 method overwritten by 'SeuratDisk':
## method from
## as.sparse.H5Group Seurat
library(SeuratData)
#SaveH5Seurat(CD4totssubset, "tots", overwrite=FALSE, verbose=TRUE)
#SaveH5Seurat(CD4totssubsetpre, "pre", overwrite=FALSE, verbose=TRUE)
#SaveH5Seurat(CD4totssubsetpost, "post", overwrite=FALSE, verbose=TRUE)