## Import packages
library("tidyr") # tidy data
library("dplyr") # data wrangling
library("ggplot2") # plotting
library("Seurat") # scRNA-seq analysis
library("ROTS") # differential gene expression analysis
(2-5 min)
For this project we’ll use the data publicly available at www.opentargets.org published by Cano-Gamez et al., 2020. The data comprises bulk- and single-cell RNA data from human naïve and memory CD4+ T cells upon TCR and cytokine stimulation. Then the authors sequence these cells 16h before proliferation and after 5 days when cells acquire an effector phenotype. The differentiated cell types are: Th0, Th1, Th2, Th17, iTreg, IFNβ (apart from the resting state). We’ll only focus in comparing cells sequenced at the 5th day that are common to bulk- and single-cell data: Th0, Th2, Th17 and iTreg.
Read more about it in the Methods section Single-cell RNA-sequencing.
Hopefully, the authors kindly provided the data and code used to produce the analyses in the publication:
data: www.opentargets.org
code: GitHub repository
(5-10 min)
Below we’ll import the table with the information to download.
## Download datasets
# import table with datasets to download
data2download <- read.table("data/CanoGamez_et_al_2020_pub.tsv",
sep="\t", header=TRUE)
Check below the content of the table imported.
# print table
knitr::kable(data2download)
Now we’ll download the metadata, bulk- and single-cell-RNA gene
expression data from the opentargets website. The tables will
be imported as data.frame
s with the exception of the 10X
single-cell gene expression data that will be imported as a
sparse
matrix with the Seurat
function
Read10X()
(it searches for three files:
matrix.mtx
, genes.tsv
,
barcodes.tsv
). We will provide the single-cell data to the
function CreateSeuratObject()
to convert this data into a
Seurat
class object (read more about
it).
The six files that were are downloading are:
NCOMMS-19-7936188_bulk_RNAseq_metadata.txt
: bulk
metadata
NCOMMS-19-7936188_bulk_RNAseq_raw_counts.txt
: bulk
raw counts of human CD4+ T (bulk) cells
NCOMMS-19-7936188_metadata.txt
: single-cell
metadata
10X single-cell gene expression data:
NCOMMS-19-7936188_scRNAseq_raw_UMIs
(renamed to
matrix.mtx
): UMI raw gene expression of human CD4+ T
(single) cells
NCOMMS-19-7936188_scRNAseq_genes.tsv
(renamed to
genes.tsv
): gene names
NCOMMS-19-7936188_scRNAseq_barcodes.tsv
(renamed to
barcodes.tsv
): cell barcodes
For the bulk gene expression data, the data is provided with
ensembl gene ids, whereas the single-cell gene expression data
has gene names. Thus we have to convert ensembl gene
ids into gene names, such as the conversion of the
ensembl gene id ENSG00000010610
to the respective
gene name CD4
. For this purpose, we’ll use the
biomaRt
functions: useEnsembl()
and
getBM()
(see vignette
to know more).
Finally, since we are interested in assessing the accuracy of differential gene expression when applied to single-cell-RNA-seq data using as the ‘ground-truth’ the bulk-RNA-seq data, we’ll keep only genes shared among single-cell- and bulk-RNA data sets.
After running the R
chunk code below, you’ll end up with
two main objects
(i.e., data in R
) called:
seu
(scRNA-seq data - Seurat
class) and
bulk
(bulk-RNA-seq data - list
class with two
data.frames
named counts
and meta
- you can access them by doing bulk$counts
and
bulk$meta
, respectively).
## Download & import data
# create directory to save files
down_dir <- "data/CanoGamez_et_al_2020" # directory to save datasets
if (!dir.exists(down_dir)) dir.create(down_dir, recursive=TRUE) # create folder if doesn't exist
# loop over the table rows & download each dataset
for (f in 1:nrow(data2download)) {
down_filename <- data2download[f,"filename"]
down_filepath <- paste(down_dir, gsub("NCOMMS-19-7936188_scRNAseq_", "", down_filename),
sep="/")# download file to: 'data/CanoGamez_et_al_2020'
if (grepl("raw_UMIs.mtx", down_filepath)) down_filepath <- gsub("raw_UMIs", "matrix", down_filepath)
down_url <- data2download[f,"ftp_url"]
cat("Downloading file", down_filename, "to", down_filepath, "\n")
download.file(url=down_url, destfile=down_filepath)
}
## Downloading file NCOMMS-19-7936188_bulk_RNAseq_metadata.txt to data/CanoGamez_et_al_2020/NCOMMS-19-7936188_bulk_RNAseq_metadata.txt
## Downloading file NCOMMS-19-7936188_bulk_RNAseq_raw_counts.txt to data/CanoGamez_et_al_2020/NCOMMS-19-7936188_bulk_RNAseq_raw_counts.txt
## Downloading file NCOMMS-19-7936188_metadata.txt to data/CanoGamez_et_al_2020/NCOMMS-19-7936188_metadata.txt
## Downloading file NCOMMS-19-7936188_scRNAseq_barcodes.tsv to data/CanoGamez_et_al_2020/barcodes.tsv
## Downloading file NCOMMS-19-7936188_scRNAseq_genes.tsv to data/CanoGamez_et_al_2020/genes.tsv
## Downloading file NCOMMS-19-7936188_scRNAseq_raw_UMIs.mtx to data/CanoGamez_et_al_2020/matrix.mtx
## Bulk-RNA-seq data
bulk <- list()
# raw counts
bulk$counts <- read.table(paste(down_dir, "NCOMMS-19-7936188_bulk_RNAseq_raw_counts.txt",
sep="/"), sep="\t", header=TRUE, row.names=1)
# metadata
bulk$meta <- read.table(paste(down_dir, "NCOMMS-19-7936188_bulk_RNAseq_metadata.txt",
sep="/"), sep="\t", header=TRUE, row.names=1)
# Convert Ensemble ids to gene names w/ biomaRt
if (!"biomaRt" %in% installed.packages()) remotes::install_github("grimbough/biomaRt")
ensembl <- biomaRt::useEnsembl(biomart="genes", dataset="hsapiens_gene_ensembl",
host="https://mar2017.archive.ensembl.org") #v88 - authors: Ensembl v87
genes2symbols <- biomaRt::getBM(attributes = c("ensembl_gene_id", "hgnc_symbol", "external_gene_name"),
filters = "ensembl_gene_id", values=row.names(bulk$counts),
mart=ensembl)
sort.genes2symbols <- data.frame("ensembl_gene_id"=row.names(bulk$counts))
sort.genes2symbols <- left_join(sort.genes2symbols, genes2symbols, by="ensembl_gene_id")
sort.genes2symbols <- sort.genes2symbols[!is.na(sort.genes2symbols$external_gene_name),]
sort.genes2symbols$external_gene_name <- make.unique(sort.genes2symbols$external_gene_name)
## scRNA-seq data
# import gene expression 10X data into Seurat
data10x <- Read10X(data.dir=down_dir, gene.column=1) # import 10x data as sparse matrix
# metadata
sc.meta <- read.table(paste(down_dir, "NCOMMS-19-7936188_metadata.txt", sep="/"),
header=TRUE, row.names=1, sep="\t")
## Filter bulk- & single-cell-RNA to common genes
# Get shared genes between bulk & single-cell
common.genes <- intersect(row.names(data10x), sort.genes2symbols$external_gene_name)
# Filter bulk
bulk$counts <- bulk$counts[sort.genes2symbols$ensembl_gene_id,]
row.names(bulk$counts) <- sort.genes2symbols$external_gene_name
bulk$counts <- bulk$counts[common.genes,]
row.names(bulk$counts) <- gsub("_", "-", row.names(bulk$counts))
# Seurat object
stopifnot(all(row.names(sc.meta)==colnames(data10x)))
seu <- CreateSeuratObject(counts=data10x[common.genes,],
meta.data=sc.meta) # convert gene exp. sparse matrix into Seurat class object
stopifnot(all(row.names(seu)==row.names(bulk$counts)))
Question: How many common genes were found between bulk- and single-cell-RNA-seq data sets and how many samples and cells compose the bulk and single-cell data? (2 min)
Tip: type the object
seu
in the R console and interpret the message printed or provide the objectseu
to the functionsnrow()
andncol()
which count the no. of rows (=genes) and columns (=cells). To check the no. of bulk-related samples provide the bulk counts,bulk$counts
, to the functionncol()
.
The Cano-Gamez et al., 2020 scRNA-seq dataset comprises 20495 genes
and 43112 cells whereas the bulk-RNA-seq comprises
nrow(bulk$counts)
genes and ncol(bulk$counts)
samples.
(5 min)
Get familiar with the metadata fields for the bulk- and sc-RNA-seq data below.
Bulk-RNA-seq metadata
Print below the first 5 rows of the bulk metadata
bulk$meta[1:5,]
and check how many samples (or bulk cell
types) exist by cytokine condition (cytokine_condition
) and
stimulation time (stimulation_time
) using the function
table()
that creates a confusion matrix. This will give you
the number of biological replicates for each cell type upon
differentiation.
The bulk$meta
metadata comprises the following
(self-explanatory) fields: sample_id
,
cell_type
, cytokine_condition
,
stimulation_time
, donor_id
, sex
,
age
, sequencing_batch
,
cell_culture_batch
.
## Print bulk-RNA-seq metadata
bulk$meta[1:5,]
## sample_id cell_type cytokine_condition stimulation_time donor_id sex age
## 1 I0712 CD4_Naive IFNB 16h 257 Male 38
## 2 I0713 CD4_Naive Th17 16h 254 Male 58
## 6 I0717 CD4_Memory Resting 5d 265 Male 59
## 10 I0721 CD4_Naive Th2 5d 257 Male 38
## 15 I0726 CD4_Memory Th17 5d 264 Male 27
## sequencing_batch cell_culture_batch rna_integrity_number
## 1 1 3 9.5
## 2 1 3 9.3
## 6 1 4 9.7
## 10 1 3 10.0
## 15 1 4 9.9
table(paste(bulk$meta$cytokine_condition, bulk$meta$stimulation_time), bulk$meta$cell_type)
##
## CD4_Memory CD4_Naive
## IFNB 16h 3 3
## IFNB 5d 3 2
## iTreg 16h 3 3
## iTreg 5d 3 3
## Resting 16h 3 3
## Resting 5d 3 3
## Th0 16h 6 6
## Th0 5d 6 5
## Th1 16h 3 3
## Th1 5d 3 3
## Th17 16h 2 3
## Th17 5d 3 3
## Th2 16h 3 3
## Th2 5d 3 4
Single-cell-RNA-seq metadata
Do the same as above but this time for scRNA-seq data below.
The seu@meta.data
metadata comprises the following
(self-explanatory) fields: orig.ident
(cell type -
N
or M
- the same as cell.type
),
nCount_RNA
(total no. of UMIs), nFeature_RNA
(no. of different genes expressed), cell.type
(same as
orig.ident
, but with the labels Naive
and
Memory
), cytokine.condition
,
donor.id
, batch.10X
(10X sequencing batch),
nGene
, nUMI
,
percent.mito
,S.Score
(S division score phase),
G2M.Score
(G2/M division score phase), Phase
(division phase), cluster.id
, effectorness
(effectorness score - a concept described in the paper).
## Print single-cell-RNA-seq metadata
seu@meta.data[1:5,]
## orig.ident nCount_RNA nFeature_RNA cell.type
## N_resting_AAACCTGAGCTGTCTA N 4139 1146 Naive
## N_resting_AAACCTGTCACCACCT N 3666 1024 Naive
## N_resting_AAACCTGTCCGTTGTC N 4405 1224 Naive
## N_resting_AAACGGGAGGGTTCCC N 3867 1002 Naive
## N_resting_AAACGGGCAACAACCT N 3539 995 Naive
## cytokine.condition donor.id batch.10X nGene nUMI
## N_resting_AAACCTGAGCTGTCTA UNS D4 2 1163 4172
## N_resting_AAACCTGTCACCACCT UNS D4 2 1037 3690
## N_resting_AAACCTGTCCGTTGTC UNS D2 2 1245 4446
## N_resting_AAACGGGAGGGTTCCC UNS D4 2 1016 3913
## N_resting_AAACGGGCAACAACCT UNS D1 2 1005 3557
## percent.mito S.Score G2M.Score Phase
## N_resting_AAACCTGAGCTGTCTA 0.02349556 -0.13419873 -0.1592109 G1
## N_resting_AAACCTGTCACCACCT 0.02086721 -0.10175611 -0.2037066 G1
## N_resting_AAACCTGTCCGTTGTC 0.02790279 -0.14513081 -0.1642104 G1
## N_resting_AAACGGGAGGGTTCCC 0.01150895 -0.06949151 -0.1908101 G1
## N_resting_AAACGGGCAACAACCT 0.03964015 -0.12400660 -0.1433789 G1
## cluster.id effectorness
## N_resting_AAACCTGAGCTGTCTA TN (resting) 0.15181238
## N_resting_AAACCTGTCACCACCT TN (resting) 0.03176291
## N_resting_AAACCTGTCCGTTGTC TN (resting) 0.11389687
## N_resting_AAACGGGAGGGTTCCC TN (resting) 0.34123968
## N_resting_AAACGGGCAACAACCT TN (resting) 0.01974084
table(seu@meta.data$cytokine.condition, seu@meta.data$cell.type)
##
## Memory Naive
## iTreg 6131 6588
## Th0 4766 2543
## Th17 5267 5615
## Th2 2893 4040
## UNS 3110 2159
Use the samples after 5 days of stimulation for differential gene expression!
(5 min)
Get acquainted below with the two different types of data: bulk- and single-cell-RNA. Try to understand the main differences by looking into the first entries of these three matrices and plotting the distribution of a few samples (which in this study represent cells).
Bulk-RNA
## Distribution of bulk-RNA
bulk$counts[1:20,1:10]
## I0712 I0713 I0717 I0721 I0726 I0731 I0732 I0734 I0735 I0736
## RP11-34P13.7 1 0 14 0 0 0 0 0 0 1
## AP006222.2 6 0 39 27 5 10 17 4 10 4
## RP4-669L17.10 11 8 25 11 11 15 2 13 45 7
## RP11-206L10.9 71 111 194 94 107 89 115 77 210 107
## LINC00115 10 15 44 44 47 22 39 15 47 42
## FAM41C 0 0 1 2 0 0 0 2 2 0
## NOC2L 13449 15055 2187 3800 4253 13852 3179 8236 2940 3978
## KLHL17 643 569 852 905 747 742 747 525 968 451
## PLEKHN1 17 10 133 218 805 6 210 161 26 184
## ISG15 4811 181 553 7222 16804 207 5812 1996 442 5192
## AGRN 2583 2917 934 3907 1840 2654 3313 1851 296 1305
## C1orf159 1143 1313 554 676 687 1287 570 913 735 539
## TNFRSF18 1169 1861 259 1220 1668 2718 1023 6479 3 1781
## TNFRSF4 6091 8179 553 2680 8595 9017 2213 14386 28 8660
## SDF4 7979 9199 5329 5463 7872 8449 4652 8257 4660 7855
## B3GALT6 2599 2905 989 1537 1368 2674 1201 2120 1049 1118
## UBE2J2 3237 3627 2072 2454 2370 3592 2050 2483 2314 2223
## SCNN1D 46 62 298 212 123 64 180 44 532 114
## ACAP3 767 859 2653 2401 1262 959 2127 1012 3203 1041
## PUSL1 698 570 363 510 536 732 425 510 461 489
## Plot the distribution of 5 random samples
set.seed(1024)
bulk_dist_plot <- bulk$counts[,sample(1:ncol(bulk$counts), 5)] %>%
mutate("Genes"=row.names(bulk$counts)) %>%
pivot_longer(., cols=starts_with("I"), names_to="Samples", values_to="Expression") %>%
mutate("Expression"=log2(Expression+1)) %>%
ggplot(data=., mapping=aes(x=Expression, colour=Samples)) +
geom_density() +
xlab("Log2(Expression+1)") +
theme_bw()
# Create directories to save results
dirs2save <- paste("results/CanoGamez_et_al_2020", c("plots", "tables", "objects"), sep="/")
for (d in dirs2save) if (!dir.exists(d)) dir.create(d, recursive=TRUE)
# Save
pdf(paste(dirs2save[1], "bulk_data_distribution_plot.pdf", sep="/"))
print(bulk_dist_plot)
dev.off()
## png
## 2
# Print
print(bulk_dist_plot)
Single-cell-RNA
## Distribution of sc-RNA
seu@assays$RNA@counts[1:20,1:10]
## 20 x 10 sparse Matrix of class "dgCMatrix"
##
## RP11-34P13.7 . . . . . . . . . .
## AP006222.2 . . . . . . . . . .
## RP4-669L17.10 . . . . . . . . . .
## RP11-206L10.9 . . . . . . . . . .
## LINC00115 . . . . . . . . . .
## FAM41C . . . . . . . . . .
## NOC2L . . . . . . 1 . . 1
## KLHL17 . . . . . . . . . .
## PLEKHN1 . . . . . . . . . .
## ISG15 . . . 1 . . . . . .
## AGRN . . . . . . . . . .
## C1orf159 . . . . . . . . 1 .
## TNFRSF18 . . . . . . . . . .
## TNFRSF4 . . . . . . . . . .
## SDF4 . . . 1 . . . 1 . 1
## B3GALT6 . . . . . . . . . .
## UBE2J2 1 . . . . . . 1 . 1
## SCNN1D . . . . . . . . . .
## ACAP3 . . . . . . . . . .
## PUSL1 . . . 1 . . . . . .
## Plot the distribution of 5 random cells
set.seed(1024)
sc_dist_plot <- seu@assays$RNA@counts[,sample(1:ncol(seu), 5)] %>%
as.data.frame(.) %>%
mutate("Genes"=row.names(seu)) %>%
pivot_longer(., cols=contains("_"), names_to="Samples", values_to="Expression") %>%
mutate("Expression"=log2(Expression+1)) %>%
ggplot(data=., mapping=aes(x=Expression, colour=Samples)) +
geom_density() +
xlab("Log2(Expression+1)") +
theme_bw()
# Save
pdf(paste(dirs2save[1], "sc_data_distribution_plot.pdf", sep="/"))
print(sc_dist_plot)
dev.off()
## png
## 2
# Print
print(sc_dist_plot)
TASK 1: Discuss within your group which are the main differences between the bulk- and single-cell-RNA gene expression matrices and distributions.
The first step in any gene expression data analysis is normalization. Normalization aims to reduce the sequencing coverage effect across samples/cells, i.e., the differences that we observe are not because one of the samples/cells were sequenced (for example) twice as much the others. There are many types of normalization that aim to reduce other technicalities but we’ll not go into those.
Normalize below the bulk- and single-cell-data.
(5 min)
The bulk data will be normalized using the TMM (Trimmed Mean of
M-values) normalization method from edgeR
(watch the StatQuest:
edgeR part 1, Library Normalization video for more details).
Run the code below to apply edgeR
normalization to the
bulk$counts
. The resulting object log-normalized is
bulk$logcpm
.
## Normalization
bulk$DGEList <- edgeR::DGEList(counts=bulk$counts, remove.zeros=T) # create object for edgeR
bulk$DGEList <- edgeR::calcNormFactors(bulk$DGEList) # calculate normalizing factors
bulk$logcpm <- edgeR::cpm(bulk$DGEList, normalized.lib.sizes=T, prior.count=1, log=T) # normalize
(5 min)
Here, we’ll focus on the most common used method for single-cell
data: log-normalization
(NormalizeData(..., normalization.method="LogNormalize")
).
This method consists in dividing the expression values by the library
size (=total no. UMIs or nCount_RNA
) for each respective
cell. In other words getting the relative abundance of gene expression
by cell. Then, the values obtained are multiplied by a factor, usually
10,000 (scale.factor=10000
). Finally this result is log1p
transformed.
Run the code below to log-normalize the counts in the
seu
Seurat
object.
## Normalization
seu <- NormalizeData(seu, normalization.method="LogNormalize", scale.factor=10000)
The first step in every analysis is: Quality-Control (QC). In this case, the QC that we’ll perform is reduced to PCA because the data was already processed.
Principal Component Analysis (PCA) is a deterministic and linear dimensional reduction method that aims to reduce the dimensionality of high-dimensional data, as it is the case of bulk- and scRNA-seq, maintaining most of the variation present in the data across a few dozens of Principal Components. It is an ideal preliminary QC data analysis for assessing (visually) which variables/factors contribute to the variance observed in the data.
(5 min)
Run the code below to perform a PCA with the bulk data. We’ll try to
highlight three variables in the plot, cytokine_condition
(dot colour), cell_type
(shape),
stimulation_time
(size). Spend some time inspecting the
result.
## PCA
bulk$pca <- prcomp(t(bulk$logcpm), center=TRUE, scale.=TRUE)
# Plot
bulk_pca <- bulk$pca$x %>%
as.data.frame(.) %>%
dplyr::select(PC1, PC2) %>%
mutate("Samples"=row.names(bulk$pca$x)) %>%
cbind(., bulk$meta) %>%
ggplot(data=., mapping=aes(x=PC1, y=PC2, colour=cytokine_condition,
shape=cell_type, size=stimulation_time)) +
geom_point() +
theme_bw()
# Save
pdf(paste(dirs2save[1], "bulk_pca_plot.pdf", sep="/"))
print(bulk_pca)
dev.off()
## png
## 2
# Print
print(bulk_pca)
(5 min)
Run the code below to perform a PCA with the single-cell data. We’ll
try to highlight two variables in the plot,
cytokine.condition
(dot shape), cell.type
(colour). Spend some time inspecting the result.
## PCA
set.seed(1024)
# HVG
seu <- FindVariableFeatures(seu, selection.method="vst",nfeatures=2000)
# Scaling
seu <- ScaleData(seu)
# PCA
seu <- RunPCA(seu, features=row.names(seu))
# Plot
sc_pca <- DimPlot(seu, reduction="pca", group.by="cell.type",
shape.by="cytokine.condition", pt.size=0.5)
# Save
pdf(paste(dirs2save[1], "sc_pca_plot.pdf", sep="/"))
print(sc_pca)
dev.off()
## png
## 2
# Print
print(sc_pca)
TASK 2: Discuss within your group which/how the variables highlighted in each case - bulk- & sc - explain the variance observed in both data types. Some tips: check the paper - Cano-Gamez et al., 2020.
(5-10 min)
Differential gene expression (DGE) consists in a statistical data analysis that aims to find the genes that are significantly different expressed between two (or more) conditions, e.g., between disease vs health conditions. In this scenario, the analyst is interested in finding the genes that are up- (more expressed in disease than health) and down-regulated (less expressed in disease compared with health).
There are several statistical methods and tools to perform DGE. The choice of the method/tool may depend on the data type.
Bulk-RNA data arose first (compared with single-cell data). Therefore, it is a far more mature technique/method with well-established tools and guidelines for data analysis. This is one of the reasons why we’ll use this data as the ‘ground-truth’. Thus we’ll compare the differentially expressed genes found in single-cell data against the significant differentially expressed genes found in the bulk data. Among the plethora of bulk DGE methods that one could use to perform DGE on bulk gene expression data is ROTS (Reproducibility Optimized Test Statistic; Suomi et al., 2017). ROTS is a very simple but accurate tool to perform differential abundance tests from count-based data, such as bulk data. It uses a modified version of the t-statistic and it has the ability to adjust to the proprieties of the data rather than making strong assumptions about the data.
Single-cell data is a most more recent technique with less defined guidelines and standards. We’ll use the most widely used and simple single-cell DGE method: Wilcox (a non-parametric test - wiki; Fig. 1b - Squair et al., 2021).
Bulk DGE methods, such as ROTS
among many others (e.g., DESeq2
, edgeR
), were
not designed to work directly with single-cell data.
However, single-cell data can be aggregated into
pseudobulks. Pseudobulk data consists in aggregating
(summing or averaging) the gene expression values across cell
types/clusters by biological replicates (see the example below from the
Fig.
2a - Squair et al., 2021). Since pseudobulks
resemble bulk gene expression data, bulk DGE
methods can be applied. Therefore, we’ll first convert the
single-cell data into pseudobulks and, then, perform DGE with
ROTS. This result will be compared against the
‘ground-truth’ bulk data and the differences to the results obtained
from the comparison with single-cell data directly will be explored.
Below we’ll perform the following pairwise comparisons for cells
sequenced at the 5th day for each type of data (bulk, single-cell,
pseudobulk) by each type of cell (i.e., CD4_Naive
,
CD4_Memory
) resulting in 24 results of DGE:
Th0 vs Resting
Th2 vs Resting
Th17 vs Resting
iTreg vs Resting
(10 min)
For bulk gene expression data we’ll use the bulk DGE method: ROTS.
ROTS()
function requires the following parameters (we’re
using the defaults for the omitted parameters):
data
: log-normalized data - bulk$logcpm
(obtained above - selected only the samples required for each two-group
pairwise comparison).
groups
: the class labels that each one of the
samples belong too matching the samples order given in data
(e.g., if the samples order in data
is -
trt1, trt2, trt3, ctr1, ctr2, ctr3
- then
groups
will be: 0, 0, 0, 1, 1, 1
- where
0
represents trt
group samples and
1
ctr
).
B
: the number of bootstrapping - we’ll set to 100
due to time constraints, but a higher number should be used.
seed
: to keep the analysis reproducible. Give any
integer number. Here we’ll use 1024.
Run the R
chunk code below to perform bulk DGE with
ROTS
. First we’ll select the samples names for a given type
of cell and cytokine condition. Then we’ll perform DGE with
ROTS
and parsing the output (a list) to a
data.frame
with gene
, logfc
,
pvalue
, FDR
. The results will be saved into
the list dge.bulks
(actually it is a list of lists). Do not
call this object directly because it contains 8 data frames with
thousands of rows. You can inspect this result by using three functions:
names()
, length()
(to see the sub lists),
head()
(only after you know the name of the sub lists,
inspect the individual data frames, e.g.,
head(dge.bulks$CD4_Naive$Th0_vs_Resting)
).
## DGE
# Samples per condition
cell.types <- c("CD4_Memory", "CD4_Naive")
stim.groups <- c("Resting", "Th0", "Th2", "Th17", "iTreg")
time.select <- "5d"
bulk.cell.comp <- list()
for (cell in cell.types) {
bulk.cell.comp[[cell]] <- list()
for (stim in stim.groups) {
bulk.cell.comp[[cell]][[stim]] <- bulk$meta %>%
filter(cell_type==cell &
cytokine_condition==stim &
stimulation_time==time.select) %>%
dplyr::select(sample_id) %>% pull(.)
}
}
# DGE: ROTS
dge.bulks <- list()
for (cell in cell.types) {
dge.bulks[[cell]] <- list()
ctrl.samps <- bulk.cell.comp[[cell]]$Resting
for (stim in stim.groups[stim.groups!="Resting"]) {
cat("\nPerforming DGE for", cell, "cell with ROTS:", stim, "vs Resting...\n")
comp <- paste0(stim, "_vs_", "Resting")
trt.samps <- bulk.cell.comp[[cell]][[stim]]
group.comp <- c(rep(0, length(trt.samps)), rep(1, length(ctrl.samps)))
dge.res <- ROTS(data=bulk$logcpm[,c(trt.samps, ctrl.samps)],
groups=group.comp, B=100, seed=1024)
dge.bulks[[cell]][[comp]] <- data.frame("gene"=names(dge.res$logfc),
"logfc"=dge.res$logfc,
"pvalue"=dge.res$pvalue,
"FDR"=dge.res$FDR)
cat("The no. of significant genes (FDR<0.05) found was:", sum(dge.res$FDR<0.05), "\n")
}
}
##
## Performing DGE for CD4_Memory cell with ROTS: Th0 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 8324
##
## Performing DGE for CD4_Memory cell with ROTS: Th2 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 7435
##
## Performing DGE for CD4_Memory cell with ROTS: Th17 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 7623
##
## Performing DGE for CD4_Memory cell with ROTS: iTreg vs Resting...
## The no. of significant genes (FDR<0.05) found was: 7581
##
## Performing DGE for CD4_Naive cell with ROTS: Th0 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 10495
##
## Performing DGE for CD4_Naive cell with ROTS: Th2 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 7469
##
## Performing DGE for CD4_Naive cell with ROTS: Th17 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 6953
##
## Performing DGE for CD4_Naive cell with ROTS: iTreg vs Resting...
## The no. of significant genes (FDR<0.05) found was: 7655
(104 min)
For single-cell gene expression data we’ll use the
single-cell DGE method: Wilcox (implemented in
Seurat
).
FindMarkers()
function requires the following parameters
(we’re using the defaults for the omitted parameters):
object
: Seurat
object -
seu
- with seu@data
data
log-normalized.
ident.1
: identity of the first group of cells. To be
compared against the cells belonging to ident.2
.
ident.2
: identity of the second group of
cells.
logfc.threshold
: log fold change threshold. Set to
0
.
min.pct
: minimum fraction that a gene needs to be
expressed in ident.1
or ident.2
to be
considered for DGE (use 0.1
- 10%)
test.use
: DGE test to use -
wilcox
ident.1=trt.cells, ident.2=ctrl.cells, logfc.threshold=0, min.pct=0.1, test.use=“wilcox”
Run the R
chunk code below to perform single-cell DGE
with Wilcox
. First we’ll set the groups of cells to be
compared given type of cell and cytokine condition. Then we’ll perform
DGE with Wilcox
(using the Seurat
function
FindMarkers()
) and parsing the output which looks slightly
different than the previous one, but it is equivalent
(gene
, p_val
, avg_log2FC
,
pct.1
, pct.2
, p_val_adj
). The
results will be saved into the list dge.sc
(actually it is
a list of lists). Do not call this object directly because it contains 8
data frames with thousands of rows. You can inspect this result by using
three functions: names()
, length()
(to see the
sub lists), head()
(only after you know the name of the sub
lists, inspect the individual data frames, e.g.,
head(dge.sc$Naive$Th0_vs_Resting)
).
WARNING: this R chunk code takes around 1 hour and 44 minutes to run. You may prefer to run only one cell type, i.e., naive or memory. It’s up to you. In that case you need to adapt the code!
# DGE: Wilcox
seu@meta.data[["pair_comp"]] <- paste(seu@meta.data$cell.type, seu@meta.data$cytokine.condition, sep="_")
Idents(seu) <- "pair_comp"
cell.types <- c("Naive", "Memory")
stim.groups <- c("UNS", "Th0", "Th2", "Th17", "iTreg")
dge.sc <- list()
for (cell in cell.types) {
dge.sc[[cell]] <- list()
ctrl.cells <- paste(cell, "UNS", sep="_")
for (stim in stim.groups[stim.groups!="UNS"]) {
cat("\nPerforming single-cell DGE for", cell, "cell with Wilcox:", stim, "vs Resting...\n")
comp <- paste0(stim, "_vs_", "Resting")
trt.cells <- paste(cell, stim, sep="_")
dge.res <- FindMarkers(seu, ident.1=trt.cells, ident.2=ctrl.cells,
logfc.threshold=0, min.pct=0.1, test.use="wilcox")
dge.sc[[cell]][[comp]] <- dge.res %>% mutate("gene"=row.names(dge.res)) %>%
dplyr::select(all_of(c("gene", "p_val", "avg_log2FC", "pct.1", "pct.2", "p_val_adj")))
cat("The no. of significant genes (FDR<0.05) found was:", sum(dge.sc[[cell]][[comp]]$p_val_adj<0.05), "\n")
}
}
##
## Performing single-cell DGE for Naive cell with Wilcox: Th0 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 8141
##
## Performing single-cell DGE for Naive cell with Wilcox: Th2 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 8000
##
## Performing single-cell DGE for Naive cell with Wilcox: Th17 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 8021
##
## Performing single-cell DGE for Naive cell with Wilcox: iTreg vs Resting...
## The no. of significant genes (FDR<0.05) found was: 7426
##
## Performing single-cell DGE for Memory cell with Wilcox: Th0 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 8128
##
## Performing single-cell DGE for Memory cell with Wilcox: Th2 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 8453
##
## Performing single-cell DGE for Memory cell with Wilcox: Th17 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 8348
##
## Performing single-cell DGE for Memory cell with Wilcox: iTreg vs Resting...
## The no. of significant genes (FDR<0.05) found was: 8463
(10 min)
For pseudobulk gene expression data we’ll use the bulk DGE method: ROTS.
First, we need to aggregate single-cell data into
pseudobulks by summing the gene expression values by
cell type, cytokine condition and biological replicate (with the
muscat
function aggregateData()
) obtaining the
object ps$counts
. Then, this pseudobulk
data is log-normalized using edgeR
(as it was done for the
bulk data above) - ps$logcpm
. Finally, the
ROTS()
function is called in a similar manner as it was
done above. The final result is a list of data.frame
s with
the DGE results called dge.ps
(following the structure:
head(dge.ps$Naive$Th0_vs_Resting)
).
## Pseudobulks
seu@meta.data[["pair_comp_rep"]] <- paste(seu@meta.data[["pair_comp"]], seu@meta.data[["donor.id"]], sep="_")
sce <- as.SingleCellExperiment(seu)
ps <- list()
ps$counts <- muscat::aggregateData(sce, assay="counts", by="pair_comp_rep", fun="sum")
ps$counts <- SummarizedExperiment::assay(ps$counts)
# Normalization
ps$DGEList <- edgeR::DGEList(ps$counts, remove.zeros=T)
ps$DGEList <- edgeR::calcNormFactors(ps$DGEList)
ps$logcpm <- edgeR::cpm(ps$DGEList, normalized.lib.sizes=T, prior.count=1, log=T)
# DGE: ROTS
cell.types <- c("Naive", "Memory")
stim.groups <- c("UNS", "Th0", "Th2", "Th17", "iTreg")
dge.ps <- list()
for (cell in cell.types) {
dge.ps[[cell]] <- list()
ctrl.samps <- grep(paste(cell, "UNS", sep="_"), colnames(ps$logcpm), value=TRUE)
for (stim in stim.groups[stim.groups!="UNS"]) {
cat("\nPerforming single-cell DGE for", cell, "cell with Wilcox:", stim, "vs Resting...\n")
comp <- paste0(stim, "_vs_", "Resting")
trt.samps <- grep(paste(cell, stim, sep="_"), colnames(ps$logcpm), value=TRUE)
group.comp <- c(rep(0, length(trt.samps)), rep(1, length(ctrl.samps)))
dge.res <- ROTS(data=ps$logcpm[,c(trt.samps, ctrl.samps)],
groups=group.comp, B=100, seed=1024)
dge.ps[[cell]][[comp]] <- data.frame("gene"=names(dge.res$logfc),
"logfc"=dge.res$logfc,
"pvalue"=dge.res$pvalue,
"FDR"=dge.res$FDR)
cat("The no. of significant genes (FDR<0.05) found was:", sum(dge.res$FDR<0.05), "\n")
}
}
##
## Performing single-cell DGE for Naive cell with Wilcox: Th0 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 3785
##
## Performing single-cell DGE for Naive cell with Wilcox: Th2 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 4766
##
## Performing single-cell DGE for Naive cell with Wilcox: Th17 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 4767
##
## Performing single-cell DGE for Naive cell with Wilcox: iTreg vs Resting...
## The no. of significant genes (FDR<0.05) found was: 3004
##
## Performing single-cell DGE for Memory cell with Wilcox: Th0 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 5617
##
## Performing single-cell DGE for Memory cell with Wilcox: Th2 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 5107
##
## Performing single-cell DGE for Memory cell with Wilcox: Th17 vs Resting...
## The no. of significant genes (FDR<0.05) found was: 6345
##
## Performing single-cell DGE for Memory cell with Wilcox: iTreg vs Resting...
## The no. of significant genes (FDR<0.05) found was: 6521
Question: How the number of differentially expressed genes differ between the three methods applied above? (2 min)
Tip: look into the messages printed for each one of the methods.
The method ROTS with bulk-RNA data and the method Wilcox with the single-cell data obtained more differentially expressed genes (assuming a FDR/adjusted p-value<0.05) than the method ROTS with the pseudobulk data (aggregated from single-cell data).
(5-10 min)
Now we aim to compare the lists of differentially expressed genes
obtained with single-cell Wilcox and pseudobulk
ROTS methods against our ground-truth
, i.e., the
bulk data (obtained also with ROTS
).
There are several ways to compare these results. Here we’ll focus in the metric Area Under the Concordance Curve (AUCC, Soneson & Robinson (2018)). The AUCC compares two lists of differentially expressed genes by looking into the top K ranked shared genes. First, both list of differentially expressed genes (i.e., FDR/adjusted p-value<0.05 or other alpha value) are ranked from the lowest to hightest FDR/adj, i.e., from the most significant to lowest. Then, for the top K value choosen, let’s say 500, the first 500 genes are compared between both lists. It starts by checking if the first gene is the same between both lists. If it is, it means that you’ve one shared gene. Then, it checks how many genes are shared for the top 2 genes, and so on, until it checks how many genes are shared between the top 500 genes regardless of the order. Finally, the sum of top K shared genes is divided by the number corresponding to a 100% agreement between both lists. See the example below to have a concrete example how it is calculated.
## Run AUCC
## List of DEGs ordered by increasing significance & log2FC per cell per pairwise comparison
top500.genes <- list()
cell.types <- c("CD4_Naive", "CD4_Memory")
pair.comp <- names(dge.bulks$CD4_Naive)
type.comp <- c("bulk", "sc", "ps")
for (cell in cell.types) {
top500.genes[[cell]] <- list()
for (pair in pair.comp) {
top500.genes[[cell]][[pair]] <- list()
for (type in type.comp) {
cell.new <- cell
cell.new <- ifelse(type %in% c("sc", "ps"), gsub("CD4_", "", cell), cell)
if (type %in% "bulk") { # 'ROTS' methods - same DGE table structure
top500.genes[[cell]][[pair]][[type]] <- dge.bulks[[cell]][[pair]] %>%
filter(FDR<0.05) %>% arrange(FDR, desc(abs(logfc))) %>%
pull(gene)
} else if (type=="ps") {
top500.genes[[cell]][[pair]][[type]] <- dge.ps[[cell.new]][[pair]] %>%
filter(FDR<0.05) %>% arrange(FDR, desc(abs(logfc))) %>%
pull(gene)
} else { # type=='sc'
top500.genes[[cell]][[pair]][[type]] <- dge.sc[[cell.new]][[pair]] %>%
filter(p_val_adj<0.05) %>% arrange(p_val_adj, desc(abs(avg_log2FC))) %>%
pull(gene)
}
}
}
}
## Create function to perform AUCC
aucc <- function(x_deg, y_deg, k=500) {
rank_shared <- lapply(seq_len(k), function(x) length(intersect(x_deg[1:x], y_deg[1:x])))
rank_shared <- unlist(rank_shared)
K <- (k*(k+1)/2)
AUCC <- sum(rank_shared) / K
return(AUCC)
}
## Run AUCC function
aucc.results <- data.frame("Cell_type"=rep(cell.types, each=length(pair.comp)*2),
"Pair_comp"=rep(pair.comp, length(cell.types)*2),
"Type"=rep(c("sc", "ps", "sc", "ps"), each=4),
"AUCC"=rep(NA, length(pair.comp)*length(cell.types)*2))
for (cell in cell.types) {
for (pair in pair.comp) {
aucc.results[aucc.results$Cell_type==cell &
aucc.results$Pair_comp==pair &
aucc.results$Type=="sc", "AUCC"] <-
aucc(x_deg=top500.genes[[cell]][[pair]][["bulk"]],
y_deg=top500.genes[[cell]][[pair]][["sc"]],
k=500)
aucc.results[aucc.results$Cell_type==cell &
aucc.results$Pair_comp==pair &
aucc.results$Type=="ps", "AUCC"] <-
aucc(x_deg=top500.genes[[cell]][[pair]][["bulk"]],
y_deg=top500.genes[[cell]][[pair]][["ps"]],
k=500)
}
}
# Plot data
aucc_plot <- aucc.results %>%
ggplot(data=., mapping=aes(x=Type, y=AUCC, fill=Type)) +
geom_bar(stat="identity") +
facet_grid(Cell_type~Pair_comp) +
theme_bw()
# Save
pdf(paste(dirs2save[1], "aucc_comparison_barplot.pdf", sep="/"))
print(aucc_plot)
dev.off()
## png
## 2
# Print
print(aucc_plot)
Question: Which of the two data types - single-cell vs pseudobolks - seems to resemble ‘ground-truth’ of differentially expressed genes obtained from the bulk data based on the AUCC scores? (2 min)
Pseudobulk data (which corresponds to the aggregation of single-cell
data by summing gene expression by cell types by donors) provides much
higher AUCC scores than single-cell data. This means that the list of
differentially expressed genes obtained with pseudobulks is much more
similar to the list of significant genes found with single-cell data
when comparing against the ground-truth
of bulk-RNA
differentially expressed genes.
TASK 3: Discuss within your group why the methods are performing so differently and why one of them is outperforming the other one. Some tips: check the distribution of pseudobulk samples as we did above for the single-cell and bulk data; inspect the top 10 differentially expressed genes by method; and, check the Squair et al., 2021.
(10-15 min)
In this final section, you’ll have the freedom to explore the gene
expression differences that were triggered upon CD4+ T cell activation.
To this end you can do a plethora of analyses and visualizations. For
instance, you can select the top genes differentially expressed for each
pairwise comparison and plot them in a heatmap
(DoHeatmap(seu, features = c("IL2RA",...,"CCL3"))
), boxplot
(VlnPlot(seu, features = c("MS4A1", "CD79A"))
) and/or UMAP
(FeaturePlot(seu, features = c("MS4A1", "GNLY", "CD8A"))
).
(30-35 min)
Ultimately, for each list of genes up- and down-regulated from each
pairwise comparison made above (for each cell type by cytokine
stimulation by data type) we’ll perform functional enrichment
analysis with gprofiler2
(webserver - Kolberg et al.,
2020). You can read more about the main function called below
gost()
by typing ?gprofiler2::gost
. Since this
is just an API (Application Programming Interface), we’ll not go through
every option (you could just copy/paste your gene list of interest and
get the same results in the webserver). Here, the
main result will be saved into the list func.list
.
Run the R
chunk code below.
## Functional enrichment analysis
## DGE list
deg.list <- list()
for (cell in cell.types) {
deg.list[[cell]] <- list()
new.cell <- gsub("CD4_", "", cell)
for (pair in pair.comp) {
deg.list[[cell]][[pair]][["bulk"]] <- deg.list[[cell]][[pair]][["sc"]] <-
deg.list[[cell]][[pair]][["ps"]] <- deg.list[[cell]][[pair]] <- list()
## Bulk
deg.list[[cell]][[pair]][["bulk"]][["up"]] <- dge.bulks[[cell]][[pair]] %>%
filter(FDR<0.05) %>% arrange(FDR, desc(abs(logfc))) %>%
filter(logfc>0) %>% pull(gene)
deg.list[[cell]][[pair]][["bulk"]][["down"]] <- dge.bulks[[cell]][[pair]] %>%
filter(FDR<0.05) %>% arrange(FDR, desc(abs(logfc))) %>%
filter(logfc<0) %>% pull(gene)
## sc
deg.list[[cell]][[pair]][["sc"]][["up"]] <- dge.sc[[new.cell]][[pair]] %>%
filter(p_val_adj<0.05) %>% arrange(p_val_adj, desc(abs(avg_log2FC))) %>%
filter(avg_log2FC>0) %>% pull(gene)
deg.list[[cell]][[pair]][["sc"]][["down"]] <- dge.sc[[new.cell]][[pair]] %>%
filter(p_val_adj<0.05) %>% arrange(p_val_adj, desc(abs(avg_log2FC))) %>%
filter(avg_log2FC<0) %>% pull(gene)
## sc
deg.list[[cell]][[pair]][["ps"]][["up"]] <- dge.ps[[new.cell]][[pair]] %>%
filter(FDR<0.05) %>% arrange(FDR, desc(abs(logfc))) %>%
filter(logfc>0) %>% pull(gene)
deg.list[[cell]][[pair]][["ps"]][["down"]] <- dge.ps[[new.cell]][[pair]] %>%
filter(FDR<0.05) %>% arrange(FDR, desc(abs(logfc))) %>%
filter(logfc<0) %>% pull(gene)
}
}
## Run functional analysis with gprofiler2
if (!"gprofiler2" %in% installed.packages()) install.packages("gprofiler2", repos="https://cloud.r-project.org")
library("gprofiler2")
func.list <- list()
for (cell in cell.types) {
func.list[[cell]] <- list()
for (pair in pair.comp) {
func.list[[cell]][[pair]] <- list()
for (type in c("bulk", "ps", "sc")) {
func.list[[cell]][[pair]][[type]] <- list()
for (reg in c("up", "down")) {
cat("Performing functional enrichment analysis for:", paste(cell, pair, type, reg, sep=" - "), "\n")
set.seed(1024)
func.list[[cell]][[pair]][[type]][[reg]] <- gost(query=deg.list[[cell]][[pair]][[type]][[reg]],
organism = "hsapiens", ordered_query = TRUE,
multi_query = FALSE, significant = TRUE,
exclude_iea = FALSE, measure_underrepresentation = FALSE,
evcodes = FALSE, user_threshold = 0.05,
correction_method = "g_SCS", domain_scope = "annotated",
custom_bg = NULL, numeric_ns = "", sources = NULL,
as_short_link = FALSE)
}
}
}
}
## Performing functional enrichment analysis for: CD4_Naive - Th0_vs_Resting - bulk - up
## Performing functional enrichment analysis for: CD4_Naive - Th0_vs_Resting - bulk - down
## Performing functional enrichment analysis for: CD4_Naive - Th0_vs_Resting - ps - up
## Performing functional enrichment analysis for: CD4_Naive - Th0_vs_Resting - ps - down
## Performing functional enrichment analysis for: CD4_Naive - Th0_vs_Resting - sc - up
## Performing functional enrichment analysis for: CD4_Naive - Th0_vs_Resting - sc - down
## Performing functional enrichment analysis for: CD4_Naive - Th2_vs_Resting - bulk - up
## Performing functional enrichment analysis for: CD4_Naive - Th2_vs_Resting - bulk - down
## Performing functional enrichment analysis for: CD4_Naive - Th2_vs_Resting - ps - up
## Performing functional enrichment analysis for: CD4_Naive - Th2_vs_Resting - ps - down
## Performing functional enrichment analysis for: CD4_Naive - Th2_vs_Resting - sc - up
## Performing functional enrichment analysis for: CD4_Naive - Th2_vs_Resting - sc - down
## Performing functional enrichment analysis for: CD4_Naive - Th17_vs_Resting - bulk - up
## Performing functional enrichment analysis for: CD4_Naive - Th17_vs_Resting - bulk - down
## Performing functional enrichment analysis for: CD4_Naive - Th17_vs_Resting - ps - up
## Performing functional enrichment analysis for: CD4_Naive - Th17_vs_Resting - ps - down
## Performing functional enrichment analysis for: CD4_Naive - Th17_vs_Resting - sc - up
## Performing functional enrichment analysis for: CD4_Naive - Th17_vs_Resting - sc - down
## Performing functional enrichment analysis for: CD4_Naive - iTreg_vs_Resting - bulk - up
## Performing functional enrichment analysis for: CD4_Naive - iTreg_vs_Resting - bulk - down
## Performing functional enrichment analysis for: CD4_Naive - iTreg_vs_Resting - ps - up
## Performing functional enrichment analysis for: CD4_Naive - iTreg_vs_Resting - ps - down
## Performing functional enrichment analysis for: CD4_Naive - iTreg_vs_Resting - sc - up
## Performing functional enrichment analysis for: CD4_Naive - iTreg_vs_Resting - sc - down
## Performing functional enrichment analysis for: CD4_Memory - Th0_vs_Resting - bulk - up
## Performing functional enrichment analysis for: CD4_Memory - Th0_vs_Resting - bulk - down
## Performing functional enrichment analysis for: CD4_Memory - Th0_vs_Resting - ps - up
## Performing functional enrichment analysis for: CD4_Memory - Th0_vs_Resting - ps - down
## Performing functional enrichment analysis for: CD4_Memory - Th0_vs_Resting - sc - up
## Performing functional enrichment analysis for: CD4_Memory - Th0_vs_Resting - sc - down
## Performing functional enrichment analysis for: CD4_Memory - Th2_vs_Resting - bulk - up
## Performing functional enrichment analysis for: CD4_Memory - Th2_vs_Resting - bulk - down
## Performing functional enrichment analysis for: CD4_Memory - Th2_vs_Resting - ps - up
## Performing functional enrichment analysis for: CD4_Memory - Th2_vs_Resting - ps - down
## Performing functional enrichment analysis for: CD4_Memory - Th2_vs_Resting - sc - up
## Performing functional enrichment analysis for: CD4_Memory - Th2_vs_Resting - sc - down
## Performing functional enrichment analysis for: CD4_Memory - Th17_vs_Resting - bulk - up
## Performing functional enrichment analysis for: CD4_Memory - Th17_vs_Resting - bulk - down
## Performing functional enrichment analysis for: CD4_Memory - Th17_vs_Resting - ps - up
## Performing functional enrichment analysis for: CD4_Memory - Th17_vs_Resting - ps - down
## Performing functional enrichment analysis for: CD4_Memory - Th17_vs_Resting - sc - up
## Performing functional enrichment analysis for: CD4_Memory - Th17_vs_Resting - sc - down
## Performing functional enrichment analysis for: CD4_Memory - iTreg_vs_Resting - bulk - up
## Performing functional enrichment analysis for: CD4_Memory - iTreg_vs_Resting - bulk - down
## Performing functional enrichment analysis for: CD4_Memory - iTreg_vs_Resting - ps - up
## Performing functional enrichment analysis for: CD4_Memory - iTreg_vs_Resting - ps - down
## Performing functional enrichment analysis for: CD4_Memory - iTreg_vs_Resting - sc - up
## Performing functional enrichment analysis for: CD4_Memory - iTreg_vs_Resting - sc - down
Below we’ll plot the functional enrichment result obtained above for
CD4 Naive T cells for the upregulated genes found in the pairwise
comparison Th0 versus Resting across bulk-, single-cell- and
pseudobulk-RNA data. We know that we expect that pathways/terms related
with response to cytokine
should be among the top
enriched/significant terms. Try to find it and inspect if this term is
among the top ones for the three data sets and discuss with your group
the meaning of that. Consider the top ones, terms with an adjusted
p-value<1e-16. Use this information to support your
discussion when assessing which of the two methods - single-cell
Wilcox vs pseudobulk ROTS - provide more functionally
important information. You may need to look into the table rather than
the plots below by calling (just an example):
head(func.list$CD4_Naive$Th0_vs_Resting$bulk$up)
(for the
upregulated terms in CD4 naive cells for the comparison Th0 vs
Resting).
# Bulk
cat("Functional enrichment analysis for bulk data:\n")
## Functional enrichment analysis for bulk data:
gostplot(func.list$CD4_Naive$Th0_vs_Resting$bulk$up, capped = TRUE, interactive = TRUE)
# sc
cat("Functional enrichment analysis for single-cell data:\n")
## Functional enrichment analysis for single-cell data:
gostplot(func.list$CD4_Naive$Th0_vs_Resting$sc$up, capped = TRUE, interactive = TRUE)
# ps
cat("Functional enrichment analysis for pseudobulk data:\n")
## Functional enrichment analysis for pseudobulk data:
gostplot(func.list$CD4_Naive$Th0_vs_Resting$ps$up, capped = TRUE, interactive = TRUE)
TASK 4 (if you’ve time): Discuss within your group which are the most important genes/pathways for CD4+ T cell activation using the DGE results as well as the functional enrichment analysis. Some tips: check the original publication where the data was published - Cano-Gamez et al., 2020.
# Save objects
saveRDS(dge.bulks, paste(dirs2save[3], "dge.bulks.rds", sep="/"))
saveRDS(dge.sc, paste(dirs2save[3], "dge.sc.rds", sep="/"))
saveRDS(dge.ps, paste(dirs2save[3], "dge.ps.rds", sep="/"))
saveRDS(top500.genes, paste(dirs2save[3], "top500.genes.rds", sep="/"))
saveRDS(aucc, paste(dirs2save[3], "aucc.results", sep="/"))
saveRDS(seu, paste(dirs2save[3], "seu.rds", sep="/"))
saveRDS(ps, paste(dirs2save[3], "ps.rds", sep="/"))
saveRDS(bulk, paste(dirs2save[3], "bulk.rds", sep="/"))
saveRDS(func.list, paste(dirs2save[3], "func.list.rds", sep="/"))
## R packages and versions used in these analyses
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /wrk/local2/B21027_scRNAseq_signatures/070622_embl_ebi_tcell_course/EMBL_EBI_scRNA_bioinformatics_Tcell_course_2022/workflow/.snakemake/conda/2b532403d31becf1b5412328473ca837/lib/libopenblasp-r0.3.20.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=fi_FI.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=fi_FI.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=fi_FI.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=fi_FI.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gprofiler2_0.2.1 ROTS_1.22.0 sp_1.5-0 SeuratObject_4.1.0
## [5] Seurat_4.1.1 ggplot2_3.3.6 dplyr_1.0.9 tidyr_1.2.0
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 scattermore_0.8
## [3] bit64_4.0.5 knitr_1.39
## [5] irlba_2.3.5 DelayedArray_0.20.0
## [7] data.table_1.14.2 rpart_4.1.16
## [9] KEGGREST_1.34.0 RCurl_1.98-1.7
## [11] doParallel_1.0.17 generics_0.1.2
## [13] BiocGenerics_0.40.0 ScaledMatrix_1.2.0
## [15] cowplot_1.1.1 RSQLite_2.2.14
## [17] RANN_2.6.1 future_1.26.1
## [19] bit_4.0.4 spatstat.data_2.2-0
## [21] xml2_1.3.3 httpuv_1.6.5
## [23] SummarizedExperiment_1.24.0 assertthat_0.2.1
## [25] viridis_0.6.2 xfun_0.31
## [27] hms_1.1.1 jquerylib_0.1.4
## [29] evaluate_0.15 promises_1.2.0.1
## [31] fansi_1.0.3 progress_1.2.2
## [33] caTools_1.18.2 dbplyr_2.2.0
## [35] igraph_1.3.2 DBI_1.1.3
## [37] geneplotter_1.72.0 htmlwidgets_1.5.4
## [39] spatstat.geom_2.4-0 stats4_4.1.1
## [41] purrr_0.3.4 ellipsis_0.3.2
## [43] crosstalk_1.2.0 backports_1.4.1
## [45] annotate_1.72.0 biomaRt_2.53.2
## [47] deldir_1.0-6 sparseMatrixStats_1.6.0
## [49] MatrixGenerics_1.6.0 vctrs_0.4.1
## [51] SingleCellExperiment_1.16.0 Biobase_2.54.0
## [53] ROCR_1.0-11 abind_1.4-5
## [55] cachem_1.0.6 withr_2.5.0
## [57] progressr_0.10.1 sctransform_0.3.3
## [59] prettyunits_1.1.1 goftest_1.2-3
## [61] cluster_2.1.3 lazyeval_0.2.2
## [63] crayon_1.5.1 genefilter_1.76.0
## [65] edgeR_3.36.0 pkgconfig_2.0.3
## [67] labeling_0.4.2 GenomeInfoDb_1.30.0
## [69] nlme_3.1-157 vipor_0.4.5
## [71] blme_1.0-5 rlang_1.0.2
## [73] globals_0.15.0 lifecycle_1.0.1
## [75] miniUI_0.1.1.1 filelock_1.0.2
## [77] klippy_0.0.0.9500 BiocFileCache_2.0.0
## [79] rsvd_1.0.5 polyclip_1.10-0
## [81] matrixStats_0.62.0 lmtest_0.9-40
## [83] Matrix_1.4-1 boot_1.3-28
## [85] zoo_1.8-10 beeswarm_0.4.0
## [87] ggridges_0.5.3 GlobalOptions_0.1.2
## [89] png_0.1-7 viridisLite_0.4.0
## [91] rjson_0.2.21 bitops_1.0-7
## [93] KernSmooth_2.23-20 Biostrings_2.62.0
## [95] blob_1.2.3 DelayedMatrixStats_1.16.0
## [97] shape_1.4.6 stringr_1.4.0
## [99] parallelly_1.32.0 spatstat.random_2.2-0
## [101] S4Vectors_0.32.3 beachmat_2.10.0
## [103] scales_1.2.0 memoise_2.0.1
## [105] magrittr_2.0.3 plyr_1.8.7
## [107] ica_1.0-2 gplots_3.1.3
## [109] zlibbioc_1.40.0 compiler_4.1.1
## [111] RColorBrewer_1.1-3 clue_0.3-60
## [113] lme4_1.1-29 DESeq2_1.34.0
## [115] fitdistrplus_1.1-8 cli_3.3.0
## [117] XVector_0.34.0 lmerTest_3.1-3
## [119] listenv_0.8.0 patchwork_1.1.1
## [121] pbapply_1.5-0 TMB_1.9.0
## [123] MASS_7.3-57 mgcv_1.8-40
## [125] tidyselect_1.1.2 stringi_1.7.6
## [127] highr_0.9 yaml_2.3.5
## [129] BiocSingular_1.10.0 locfit_1.5-9.5
## [131] ggrepel_0.9.1 muscat_1.8.0
## [133] grid_4.1.1 sass_0.4.1
## [135] tools_4.1.1 future.apply_1.9.0
## [137] parallel_4.1.1 circlize_0.4.15
## [139] foreach_1.5.2 gridExtra_2.3
## [141] farver_2.1.0 Rtsne_0.16
## [143] digest_0.6.29 rgeos_0.5-9
## [145] shiny_1.7.1 Rcpp_1.0.8.3
## [147] GenomicRanges_1.46.1 broom_0.8.0
## [149] scuttle_1.4.0 later_1.3.0
## [151] RcppAnnoy_0.0.19 httr_1.4.3
## [153] AnnotationDbi_1.56.1 ComplexHeatmap_2.10.0
## [155] colorspace_2.0-3 XML_3.99-0.10
## [157] tensor_1.5 reticulate_1.25
## [159] IRanges_2.28.0 splines_4.1.1
## [161] uwot_0.1.11 spatstat.utils_2.3-1
## [163] scater_1.22.0 plotly_4.10.0
## [165] xtable_1.8-4 jsonlite_1.8.0
## [167] nloptr_2.0.3 R6_2.5.1
## [169] pillar_1.7.0 htmltools_0.5.2
## [171] mime_0.12 glue_1.6.2
## [173] fastmap_1.1.0 minqa_1.2.4
## [175] BiocParallel_1.28.3 BiocNeighbors_1.12.0
## [177] codetools_0.2-18 utf8_1.2.2
## [179] lattice_0.20-45 bslib_0.3.1
## [181] spatstat.sparse_2.1-1 tibble_3.1.7
## [183] numDeriv_2016.8-1.1 pbkrtest_0.5.1
## [185] curl_4.3.2 ggbeeswarm_0.6.0
## [187] leiden_0.4.2 gtools_3.9.2.2
## [189] survival_3.3-1 limma_3.50.1
## [191] glmmTMB_1.1.3 rmarkdown_2.11
## [193] munsell_0.5.0 GetoptLong_1.0.5
## [195] GenomeInfoDbData_1.2.7 iterators_1.0.14
## [197] variancePartition_1.24.0 reshape2_1.4.4
## [199] gtable_0.3.0 spatstat.core_2.4-4