Notebook for metabolomics analysis using publicly available dataΒΆ
Metabolomics
Christina Schmidt
12 May 2021
#load the needed:
library(tidyverse) # used for data manipulation
library(rmarkdown) # used for paged_table function
Samples
The paper
βAn Integrated Metabolic Atlas of Clear Cell Renal Cell Carcinomaβ
includes metabolomic profiling on 138 matched clear cell renal cell
carcinoma (ccRCC)/normal tissue pairs. Metabolomics was done using The
company Metabolon, so this is untargeted metabolomics.
Here we use the median normalised data from the supplementary table 2 of
the paper. The median normalisation was done by metaboanalyst.
The Data
First we removed metabolites that where not identifiable from the
dataset.
#Load the data:
SampleInfo <- read.csv("Input_MetabolomicsResults/SampleInformation_Supplementarytable2_mmc2.csv", check.names=FALSE)
rownames(SampleInfo) <- SampleInfo[,1]#Metabolite Names as rownames
SampleInfo <- as.data.frame(t(SampleInfo[,-c(1)]))#Transpose SampleInfoframe
SampleInfo<- cbind(rownames(SampleInfo), data.frame(SampleInfo, row.names=NULL, check.names=FALSE))
names(SampleInfo)[names(SampleInfo) == "rownames(SampleInfo)"] <- "SampleID"
Data <- read.csv("Input_MetabolomicsResults/MedianNormalisedData_Supplementarytable2_mmc2.csv", check.names=FALSE)
PathwayInfo <- Data[,1:12]
rownames(Data) <- Data[,2]#Metabolite Names as rownames
Data <- as.data.frame(t(Data[,-c(1:12)]))#Transpose dataframe
Data<- cbind(rownames(Data), data.frame(Data, row.names=NULL, check.names=FALSE))
names(Data)[names(Data) == "rownames(Data)"] <- "SampleID"
#Add Demographic parameters for comparison:
Data <- merge(SampleInfo, Data, by="SampleID", all=TRUE, remove=FALSE)%>%
mutate(Stage = case_when(`TYPE-STAGE`=="TUMOR-STAGE I" ~ 'EARLY-STAGE',
`TYPE-STAGE`=="TUMOR-STAGE II" ~ 'EARLY-STAGE',
`TYPE-STAGE`=="TUMOR-STAGE III" ~ 'LATE-STAGE',
`TYPE-STAGE`=="TUMOR-STAGE IV" ~ 'LATE-STAGE',
`TYPE-STAGE`=="NORMAL-STAGE I" ~ 'EARLY-STAGE',
`TYPE-STAGE`=="NORMAL-STAGE II" ~ 'EARLY-STAGE',
`TYPE-STAGE`=="NORMAL-STAGE III" ~ 'LATE-STAGE',
`TYPE-STAGE`=="NORMAL-STAGE IV" ~ 'LATE-STAGE',
TRUE ~ 'X'))%>%
mutate(Age = case_when(`AGE AT SURGERY`<42 ~ 'Young',
`AGE AT SURGERY`>58 ~ 'Old',
TRUE ~ 'Middle'))%>%
mutate(Gender = case_when(`GENDER`=="Male" ~ 'Male',
`GENDER`=="male" ~ 'Male',
`GENDER`=="male " ~ 'Male',
`GENDER`=="Female" ~ 'Female',
`GENDER`=="female" ~ 'Female',
TRUE ~ 'x'))
Data <-Data[,c(1:4,6:9,890:892,10:889)]
#Remove not identified metabolites:
Data <-Data[,-c(585, 592:891)]
Now we check the clustering via PCA plot:
In detail, we want to check for outliers, how the patients samples and
normal tissue samples cluster and if the different demographics
separate.
library(devtools)
## Loading required package: usethis
library(ggfortify)
library(ggplot2)
library(RColorBrewer)
library(viridisLite)
library(viridis)#https://cran.r-project.org/web/packages/viridis/vignettes/intro-to-viridis.html
PCA_NoLoadings <- function(InputMatrix,OutputPlotName, DF, Color, Shape){
PCA <- autoplot (prcomp(InputMatrix),
data = DF,
colour = Color, #colour = row including the sample information to colour code
label=T,
label.size=1,
label.repel = TRUE,
#loadings=T, #draws Eigenvectors
#loadings.label = TRUE,
#loadings.label.vjust = 1.2,
#loadings.label.size=2,
#loadings.colour="grey10",
#loadings.label.colour="grey10",
fill = Color,#fill colour of the dots ("cyan4")
color = "black",#outline colour
alpha = 0.8,#controls the transparency: 1 = 100% opaque; 0 = 100% transparent.
shape = Shape,#https://rpkgs.datanovia.com/ggpubr/reference/show_point_shapes.html, 21
size = 3.5#size of the dot
)+
labs(col=Color, size=1)+
scale_shape_manual(values=c(22,21,24,23,25,7,8,11,12))+ #needed if more than 6 shapes are in place
theme_classic()+
geom_hline(yintercept=0, linetype="dashed", color = "black", alpha=0.6, size=0.75)+
geom_vline(xintercept = 0, linetype="dashed", color = "black", alpha=0.6, size=0.75)+
ggtitle(paste(OutputPlotName))
ggsave(file=paste("Figures/PCA_Color", OutputPlotName, ".pdf", sep="_"), plot=PCA, width=10, height=10)
plot(PCA)
}
#PLOTs
PCA_Data_m<- Data[,15:590]
PCA_Data_m <- apply(as.matrix(PCA_Data_m), 2, as.numeric)
row.names(PCA_Data_m) <- Data$SampleID
PCA_NoLoadings(InputMatrix= PCA_Data_m, OutputPlotName= "TISSUE_TYPE_Outlier",DF=Data, Color = "TISSUE_TYPE", Shape = "TISSUE_TYPE")
DIAG16267 (Normal tissue) has been removed from the dataset. We also
removed the matching tumour sample DIAG-16266
Data <-Data[-c(187:188),]#DIAG16267 is an outlier
#Safe the Dataframe as our input data:
write.csv(Data, "OutputData/Sheet1_InputData-with-PatientInformation.csv", row.names = FALSE )
After removing the outlier we colour code for the different demographics
to check if there are any patterns visible:
PCA_Data_m<- Data[,15:590]
PCA_Data_m <- apply(as.matrix(PCA_Data_m), 2, as.numeric)
row.names(PCA_Data_m) <- Data$SampleID
PCA_NoLoadings(InputMatrix= PCA_Data_m, OutputPlotName= "TISSUE_TYPE",DF=Data, Color = "TISSUE_TYPE", Shape = "TISSUE_TYPE")
PCA_NoLoadings(InputMatrix= PCA_Data_m, OutputPlotName= "Patients_Stages",DF=Data, Color = "Stage", Shape = "TISSUE_TYPE")
PCA_NoLoadings(InputMatrix= PCA_Data_m, OutputPlotName= "Age",DF=Data, Color = "Age", Shape = "TISSUE_TYPE")
Tumour versus Normal
Here we compare the tumour samples to the normal samples calculating the Log2FoldChange (Log2FC) and p.value using Mann-Whitney U test.
#Single replicates for STAT:
Stat_Normal <- subset(`Data`, `TISSUE_TYPE` == "NORMAL", select=c(15:590))
Stat_Tumour <- subset(`Data`, `TISSUE_TYPE` == "TUMOR", select=c(15:590))
#Means of the replicates
Data_Means <- (Data[,15:590]) %>%
group_by(Data$TISSUE_TYPE) %>%
summarise_all(funs(mean))%>%
rename("TISSUE_TYPE"="Data$TISSUE_TYPE")
Mean_Normal <- subset(Data_Means, TISSUE_TYPE == "NORMAL", select=c(2:577))
Mean_Tumour <- subset(Data_Means, TISSUE_TYPE == "TUMOR", select=c(2:577))
#Function to calculate the log2FC and statistics
library(gtools)#used for "foldchange"
DMA <-function(Log2FC_Condition1, Log2FC_Condition2,Stat_Condition1, Stat_Condition2, Output){
#Log2FC
FC_C1vC2 <- mapply(foldchange,Log2FC_Condition1,Log2FC_Condition2)
Log2FC_C1vC2 <- as.data.frame(foldchange2logratio(FC_C1vC2, base=2))
Log2FC_C1vC2 <- cbind(rownames(Log2FC_C1vC2), data.frame(Log2FC_C1vC2, row.names=NULL))
names(Log2FC_C1vC2)[names(Log2FC_C1vC2) == "rownames(Log2FC_C1vC2)"] <- "Metabolite"
names(Log2FC_C1vC2)[names(Log2FC_C1vC2) == "foldchange2logratio.FC_C1vC2..base...2."] <- paste("Log2FC_", Output)
#t-test
T_C1vC2 <-mapply(wilcox.test, x= as.data.frame(Stat_Condition2), y = as.data.frame(Stat_Condition1), SIMPLIFY = F)
VecPVAL_C1vC2 <- c()
for(i in 1:length(T_C1vC2)){
p_value <- unlist(T_C1vC2[[i]][3])
VecPVAL_C1vC2[i] <- p_value
}
Metabolite <- colnames(Stat_Condition2)
PVal_C1vC2 <- data.frame(Metabolite, VecPVAL_C1vC2)
#Bonferroni p-adjusted
VecPADJ_C1vC2 <- p.adjust((PVal_C1vC2[,2]),method = "BH", n = length((PVal_C1vC2[,2])))
PADJ_C1vC2 <- data.frame(Metabolite, VecPADJ_C1vC2)
STAT_C1vC2 <- merge(PVal_C1vC2,PADJ_C1vC2, by="Metabolite")
STAT_C1vC2 <- merge(Log2FC_C1vC2,STAT_C1vC2, by="Metabolite")
names(STAT_C1vC2)[names(STAT_C1vC2) == "VecPVAL_C1vC2"] <- paste("p.val_", Output)
names(STAT_C1vC2)[names(STAT_C1vC2) == "VecPADJ_C1vC2"] <- paste("p.adj_", Output)
#write.csv(STAT_C1vC2, paste("DMA_",Output, ".csv"), row.names= TRUE)
Output <- STAT_C1vC2
}
#DMA:
DMA_TvN<- DMA(Log2FC_Condition1=Mean_Tumour,
Log2FC_Condition2=Mean_Normal,
Stat_Condition1=Stat_Tumour,
Stat_Condition2=Stat_Normal ,
Output="TvN")
#Add Pathways as assigned by metabolon:
DMA_TvN <- merge (x=DMA_TvN, y=PathwayInfo[,2:4], by.x="Metabolite", by.y="BIOCHEMICAL NAME", all.x=TRUE)
#Safe:
write.csv(DMA_TvN,"OutputData/Sheet2_DMA_TvN.csv", row.names = FALSE)
Late Stage versus Early Stage
Each patient has a βmatching indexβ that assigns a normal sample to a tumour sample. First we calculate the Log2FC comparing tumour versus normal of early stage (stage I/II) and late Stage (stage III/IV). Mann-Whitney U tests were used to identify metabolites significantly higher or lower in stage III/IV tumors compared with stage I/II tumors (Benjamini Hochberg corrected p value <0.05, absolute Log2FC >0.5).
#Single replicates for STAT:
Stat_Normal_Early <- subset(`Data`, `TYPE-STAGE` == "NORMAL-STAGE I" | `TYPE-STAGE` == "NORMAL-STAGE II" , select=c(15:590))
Stat_Tumour_Early <- subset(`Data`, `TYPE-STAGE` == "TUMOR-STAGE I" | `TYPE-STAGE` == "TUMOR-STAGE II" , select=c(15:590))
Stat_Normal_Late <- subset(`Data`, `TYPE-STAGE` == "NORMAL-STAGE III" | `TYPE-STAGE` == "NORMAL-STAGE IV" , select=c(15:590))
Stat_Tumour_Late <- subset(`Data`, `TYPE-STAGE` == "TUMOR-STAGE III" | `TYPE-STAGE` == "TUMOR-STAGE IV" , select=c(15:590))
#Means of the replicates
Data <- unite(Data, col="Mean", c("TISSUE_TYPE","Stage"), sep="_",remove=FALSE)#Merge Stage and Tissue Type
Data_Means1 <- (`Data` [,16:591]) %>%
group_by(`Data`$Mean) %>%
summarise_all(funs(mean))%>%
rename("Mean"="Data$Mean")
Mean_Normal_Early <- subset(Data_Means1, Mean == "NORMAL_EARLY-STAGE", select=c(2:577))
Mean_Tumour_Early <- subset(Data_Means1, Mean == "TUMOR_EARLY-STAGE" , select=c(2:577))
Mean_Normal_Late <- subset(Data_Means1, Mean == "NORMAL_LATE-STAGE" , select=c(2:577))
Mean_Tumour_Late <- subset(Data_Means1, Mean == "TUMOR_LATE-STAGE" , select=c(2:577))
#DMA
DMA_TvN_Early <- DMA(Log2FC_Condition1=Mean_Tumour_Early,
Log2FC_Condition2=Mean_Normal_Early,
Stat_Condition1=Stat_Tumour_Early,
Stat_Condition2=Stat_Normal_Early ,
Output="TvN_Early")
DMA_TvN_Late <- DMA(Log2FC_Condition1=Mean_Tumour_Late,
Log2FC_Condition2=Mean_Normal_Late,
Stat_Condition1=Stat_Tumour_Late,
Stat_Condition2=Stat_Normal_Late ,
Output="TvN_Late")
# Merge the two DF's
DMA_TvN_Stages <- merge(DMA_TvN_Early, DMA_TvN_Late, by="Metabolite", all=TRUE)
# calculate the distance between Early and late stage. Meaning Early Stage is made 0.
DMA_TvN_Stages$`Log2FC_S1-S4` <- ifelse(DMA_TvN_Stages$`Log2FC_ TvN_Early`>=0,
(DMA_TvN_Stages$`Log2FC_ TvN_Late`)-(DMA_TvN_Stages$`Log2FC_ TvN_Early`),
(DMA_TvN_Stages$`Log2FC_ TvN_Late`)+ (abs(DMA_TvN_Stages$`Log2FC_ TvN_Early`)))
#Add Pathways as assigned by metabolon:
DMA_TvN_Stages <- merge (x=DMA_TvN_Stages, y=PathwayInfo[,2:4], by.x="Metabolite", by.y="BIOCHEMICAL NAME", all.x=TRUE)
write.csv(DMA_TvN_Stages,"OutputData/Sheet3_DMA_TvN_Stages.csv", row.names = FALSE)
Volcano Plots Pathways
Here we removed metabolites that had a Log2FC of 0.
#Establish function:
library(ggrepel)
library(EnhancedVolcano)
VolcanoPlot <- function(DF_Condition1,Condition1, DF_Condition2,Condition2, PathwayInfo){
DMA_TvN_Condition1<- DF_Condition1%>%
rename("Log2FC"=paste("Log2FC_ TvN", Condition1,sep="_"),
"p.adj"=paste("p.adj_ TvN", Condition1,sep="_"),
"p.val"= paste("p.val_ TvN", Condition1,sep="_"))
DMA_TvN_Condition1$Condition<-paste(Condition1)
DMA_TvN_Condition2<- DF_Condition2%>%
rename("Log2FC"=paste("Log2FC_ TvN", Condition2,sep="_"),
"p.adj"=paste("p.adj_ TvN", Condition2,sep="_"),
"p.val"= paste("p.val_ TvN", Condition2,sep="_"))
DMA_TvN_Condition2$Condition<-paste(Condition2)
Plot <- rbind(DMA_TvN_Condition1, DMA_TvN_Condition2)
Plot<- (na.omit(Plot))#remove metabolites that do have Log2FC=0 and stats=NA
Plot <- merge (x=Plot, y=PathwayInfo[,2:4], by.x="Metabolite", by.y="BIOCHEMICAL NAME", all.x=TRUE)
Plot$`SUB PATHWAY_NoComma` <- (gsub(",","",Plot$`SUB PATHWAY`))
Plot$`SUB PATHWAY_NoComma` <- (gsub("/","",Plot$`SUB PATHWAY_NoComma`))
Pathway_Names <- Plot[!duplicated(Plot$`SUB PATHWAY_NoComma`),]
Pathway_Names <- Pathway_Names$`SUB PATHWAY_NoComma`
for (i in Pathway_Names){
Volcano1 <- subset(Plot, `SUB PATHWAY_NoComma` == paste(i))%>%
na.omit()
#Prepare the colour scheme:
keyvals <- ifelse(
Volcano1$Condition == paste(Condition1), "blue",
ifelse(Volcano1$Condition == paste(Condition2), "red",
"black"))
names(keyvals)[keyvals == 'blue'] <- paste(Condition1)
names(keyvals)[keyvals == 'red'] <- paste(Condition2)
#Prepare the symbols:
keyvals.shape <- ifelse(
Volcano1$Condition == paste(Condition2), 19,
ifelse(Volcano1$Condition == paste(Condition1), 18,
3))
keyvals.shape[is.na(keyvals.shape)] <- 3
names(keyvals.shape)[keyvals.shape == 3] <- 'NA'
names(keyvals.shape)[keyvals.shape == 19] <- paste(Condition2)
names(keyvals.shape)[keyvals.shape == 18] <- paste(Condition1)
#plot
VolcanoPlot <- EnhancedVolcano (Volcano1,
lab = Volcano1$Metabolite,#Metabolite name
x = "Log2FC",#Log2FC
y = "p.adj",#p-value or q-value
xlab = "Log2FC",
ylab = bquote(~-Log[10]~`p.adj`),#(~-Log[10]~adjusted~italic(P))
pCutoff = 0.05,
FCcutoff = 0.5,#Cut off Log2FC, automatically 2
pointSize = 4,
labSize = 1,
shapeCustom = keyvals.shape,
colCustom = keyvals,
titleLabSize = 16,
#col=c("black", "grey", "grey", "purple"),#if you want to change colors
colAlpha = 0.5,
title=paste(i),
subtitle = bquote(italic("Metabolic Pathway")),
caption = paste0("total = ", nrow(Volcano1)/2, " Metabolites"),
xlim = c((((Reduce(min,Volcano1$Log2FC))))-0.5,(((Reduce(max,Volcano1$Log2FC))))+0.5),
ylim = c(0,((-log10(Reduce(min,Volcano1$p.adj))))+0.1),
#drawConnectors = TRUE,
#widthConnectors = 0.5,
#colConnectors = "black",
cutoffLineType = "dashed",
cutoffLineCol = "black",
cutoffLineWidth = 0.5,
legendPosition = 'right',
legendLabSize = 12,
legendIconSize = 5.0
)
ggsave(file=paste("Figures/VolcanoPlots/", i,"_",Condition2,"vs",Condition1, ".pdf", sep=""), plot=VolcanoPlot, width=10, height=8)
plot(VolcanoPlot)
}
}
#Plot
VolcanoPlot(DF_Condition1=DMA_TvN_Early,
DF_Condition2=DMA_TvN_Late,
Condition1= "Early",
Condition2="Late",
PathwayInfo=PathwayInfo)
Volcano plots group changes
Here, we want to find metabolites that change in the opposite direction when comparing late stage with early stage patients.
Plot_A <- DMA_TvN_Stages%>%
mutate(MetaboliteChange_Significant_Late = case_when(`Log2FC_ TvN_Late` >= 0.5 & `p.adj_ TvN_Late` < 0.05 ~ 'UP',
`Log2FC_ TvN_Late` <= -0.5 & `p.adj_ TvN_Late` < 0.05 ~ 'DOWN',
TRUE ~ 'No_Change')) %>%
mutate(MetaboliteChange_Significant_Early = case_when(`Log2FC_ TvN_Early` >= 0.5 & `p.adj_ TvN_Early` < 0.05 ~ 'UP',
`Log2FC_ TvN_Early` <= -0.5 & `p.adj_ TvN_Early` < 0.05 ~ 'DOWN',
TRUE ~ 'No_Change'))%>%
mutate(MetaboliteChange_Significant_Comparison = case_when(MetaboliteChange_Significant_Late == "UP" & MetaboliteChange_Significant_Early == "UP" ~ 'UP_All-Stages',
MetaboliteChange_Significant_Late == "DOWN" & MetaboliteChange_Significant_Early == "DOWN" ~ 'DOWN_All-Stages',
MetaboliteChange_Significant_Late == "UP" & MetaboliteChange_Significant_Early == "DOWN" ~ 'UP(Late-Stage)_DOWN(Early-Stage)',
MetaboliteChange_Significant_Late == "DOWN" & MetaboliteChange_Significant_Early == "UP" ~ 'DOWN(Late-Stage)_UP(Early-Stage)',
MetaboliteChange_Significant_Late == "No_Change" & MetaboliteChange_Significant_Early == "No_Change" ~ 'NoChange_All-Stages',
MetaboliteChange_Significant_Late == "UP" & MetaboliteChange_Significant_Early == "No_Change" ~ 'UP(Late-Stage)_NoChange(Early-Stage)',
MetaboliteChange_Significant_Late == "DOWN" & MetaboliteChange_Significant_Early == "No_Change" ~ 'DOWN(Late-Stage)_NoChange(Early-Stage)',
MetaboliteChange_Significant_Late == "No_Change" & MetaboliteChange_Significant_Early == "UP" ~ 'No_Change(Late-Stage)_UP(Early-Stage)',
MetaboliteChange_Significant_Late == "No_Change" & MetaboliteChange_Significant_Early == "DOWN" ~ 'No_Change(Late-Stage)_DOWN(Early-Stage)',
TRUE ~ 'FALSE'))
# Make DF's based on groups of change:
## Change only in late Stage:
LateStage_Change <- Plot_A%>%
subset(MetaboliteChange_Significant_Comparison=="UP(Late-Stage)_NoChange(Early-Stage)" | MetaboliteChange_Significant_Comparison=="DOWN(Late-Stage)_NoChange(Early-Stage)")
LateStage_Change_Early <-LateStage_Change[,c(1:4,9:13)]%>%
rename("Log2FC"=`Log2FC_ TvN_Early`,
"p.adj"=`p.adj_ TvN_Early`,
"p.val"= `p.val_ TvN_Early`)
LateStage_Change_Early$Stage<-"Early"
LateStage_Change_Late <- LateStage_Change[,c(1,5:7,9:13)]%>%
rename("Log2FC"=`Log2FC_ TvN_Late`,
"p.adj"=`p.adj_ TvN_Late`,
"p.val"= `p.val_ TvN_Late`)
LateStage_Change_Late$Stage <-"Late"
LateStage_Change <- rbind(LateStage_Change_Early, LateStage_Change_Late)
## Change only in late Stage
EarlyStage_Change <- Plot_A%>%
subset(MetaboliteChange_Significant_Comparison=="No_Change(Late-Stage)_UP(Early-Stage)" | MetaboliteChange_Significant_Comparison=="No_Change(Late-Stage)_DOWN(Early-Stage)")
EarlyStage_Change_Early <-EarlyStage_Change[,c(1:4,9:13)]%>%
rename("Log2FC"=`Log2FC_ TvN_Early`,
"p.adj"=`p.adj_ TvN_Early`,
"p.val"= `p.val_ TvN_Early`)
EarlyStage_Change_Early$Stage <- "Early"
EarlyStage_Change_Late <- EarlyStage_Change[,c(1,5:7,9:13)]%>%
rename("Log2FC"=`Log2FC_ TvN_Late`,
"p.adj"=`p.adj_ TvN_Late`,
"p.val"= `p.val_ TvN_Late`)
EarlyStage_Change_Late$Stage <-"Late"
EarlyStage_Change <- rbind(EarlyStage_Change_Early, EarlyStage_Change_Late)
##Change in the same direction:
AllStage_Change <- Plot_A%>%
subset(MetaboliteChange_Significant_Comparison=="UP_All-Stages" | MetaboliteChange_Significant_Comparison=="DOWN_All-Stages"| MetaboliteChange_Significant_Comparison=="NoChange_All-Stages")
AllStage_Change_Early <-AllStage_Change[,c(1:4,9:13)]%>%
rename("Log2FC"=`Log2FC_ TvN_Early`,
"p.adj"=`p.adj_ TvN_Early`,
"p.val"= `p.val_ TvN_Early`)
AllStage_Change_Early$Stage <- "Early"
AllStage_Change_Late <- AllStage_Change[,c(1,5:7,9:13)]%>%
rename("Log2FC"=`Log2FC_ TvN_Late`,
"p.adj"=`p.adj_ TvN_Late`,
"p.val"= `p.val_ TvN_Late`)
AllStage_Change_Late$Stage <-"Late"
AllStage_Change <- rbind(AllStage_Change_Early, AllStage_Change_Late)
VolcanoPlot_General <- function(Input, OutputFileName){
Volcano1 <- Input
#Prepare the colour scheme:
keyvals <- ifelse(
Volcano1$Stage == "Early", "blue",
ifelse(Volcano1$Stage == "Late", "red",
"black"))
names(keyvals)[keyvals == 'blue'] <- "Early"
names(keyvals)[keyvals == 'red'] <- "Late"
#Prepare the symbols:
keyvals.shape <- ifelse(
Volcano1$Stage == "Late", 19,
ifelse(Volcano1$Stage == "Early", 18,
3))
keyvals.shape[is.na(keyvals.shape)] <- 3
names(keyvals.shape)[keyvals.shape == 3] <- 'NA'
names(keyvals.shape)[keyvals.shape == 19] <- 'Late'
names(keyvals.shape)[keyvals.shape == 18] <- 'Early'
#plot
VolcanoPlot <- EnhancedVolcano (Volcano1,
lab = Volcano1$Metabolite,#Metabolite name
x = "Log2FC",#Log2FC
y = "p.adj",#p-value or q-value
xlab = "Log2FC",
ylab = bquote(~-Log[10]~`p.adj`),#(~-Log[10]~adjusted~italic(P))
pCutoff = 0.05,
FCcutoff = 0.5,#Cut off Log2FC, automatically 2
pointSize = 4,
labSize = 1,
shapeCustom = keyvals.shape,
colCustom = keyvals,
titleLabSize = 16,
#col=c("black", "grey", "grey", "purple"),#if you want to change colors
colAlpha = 0.5,
title=paste(OutputFileName),
subtitle = bquote(italic("Metabolites that Change")),
caption = paste0("total = ", nrow(Volcano1)/2, " Metabolites"),
xlim = c((((Reduce(min,Volcano1$Log2FC))))-0.5,(((Reduce(max,Volcano1$Log2FC))))+0.5),
ylim = c(0,((-log10(Reduce(min,Volcano1$p.adj))))+0.1),
#drawConnectors = TRUE,
#widthConnectors = 0.5,
#colConnectors = "black",
cutoffLineType = "dashed",
cutoffLineCol = "black",
cutoffLineWidth = 0.5,
legendPosition = 'right',
legendLabSize = 12,
legendIconSize = 5.0
)
ggsave(file=paste("Figures/VolcanoPlot_OverallChange_LateStage-vs-EarlyStage_", OutputFileName, ".pdf", sep=""), plot=VolcanoPlot, width=10, height=8)
plot(VolcanoPlot)
}
VolcanoPlot_General(Input=LateStage_Change,
OutputFileName="MetaboliteChange only in late stage patients")
VolcanoPlot_General(Input=EarlyStage_Change,
OutputFileName="MetaboliteChange only in early stage patients")
VolcanoPlot_General(Input=AllStage_Change,
OutputFileName="MetaboliteChange in all patients independent of stage")
It becomes clear that the majority of metabolites (418) do behave in the
same way independent of the tumour stage when compared to their normal
counterpart. Yet, it becomes also clear that early stage tumours are
more variable and the p.adj values are higher than in late stage
tumours.
Moreover, late stage tumours have more metabolites that change
significantly (114) compared to early stage tumours where the same
metabolites do not change. In turn, only 43 metabolites that do not
change in late stage tumours change significantly in early stage
tumours.
Diverging lollipop graphs
this representation is helpful to visualise the change of a metabolite between early stage and late stage.
#Establish the function:
LollipopGraph_p.adj <- function(DF_Condition1,Condition1, DF_Condition2,Condition2, PathwayInfo){
DMA_TvN_Condition1<- DF_Condition1%>%
rename("Log2FC"=paste("Log2FC_ TvN", Condition1,sep="_"),
"p.adj"=paste("p.adj_ TvN", Condition1,sep="_"),
"p.val"= paste("p.val_ TvN", Condition1,sep="_"))
DMA_TvN_Condition1$Condition<-paste(Condition1)
DMA_TvN_Condition2<- DF_Condition2%>%
rename("Log2FC"=paste("Log2FC_ TvN", Condition2,sep="_"),
"p.adj"=paste("p.adj_ TvN", Condition2,sep="_"),
"p.val"= paste("p.val_ TvN", Condition2,sep="_"))
DMA_TvN_Condition2$Condition<-paste(Condition2)
Plot <- rbind(DMA_TvN_Condition1, DMA_TvN_Condition2)
Plot<- (na.omit(Plot))#remove metabolites that do have Log2FC=0 and stats=NA
Plot <- merge (x=Plot, y=PathwayInfo[,2:4], by.x="Metabolite", by.y="BIOCHEMICAL NAME", all.x=TRUE)
Plot$`SUB PATHWAY_NoComma` <- (gsub(",","",Plot$`SUB PATHWAY`))
Plot$`SUB PATHWAY_NoComma` <- (gsub("/","",Plot$`SUB PATHWAY_NoComma`))
Pathway_Names <- Plot[!duplicated(Plot$`SUB PATHWAY_NoComma`),]
Pathway_Names <- Pathway_Names$`SUB PATHWAY_NoComma`
for (i in Pathway_Names){
Graph <- subset(Plot, `SUB PATHWAY_NoComma` == paste(i))%>%
na.omit()
#if(dim(Graph)>=1){
Graph$p.adj_Round <- round(Graph$p.adj, digits = 2)
Graph$p.adj_Round <- replace(Graph$p.adj_Round, Graph$p.adj_Round==0,"<0.01")
Dotplot1 <-ggplot(Graph, aes(x=reorder(Metabolite, + Log2FC), y=Log2FC, label=p.adj_Round)) +
geom_point(stat='identity', aes(size =p.adj, col=Condition)) +
geom_segment(aes(y =(Reduce(max,Graph$Log2FC)),
x = Metabolite,
yend = Log2FC,
xend = Metabolite),
color = "black") +
scale_size(name="p.adj",range = c(2,16), trans = 'reverse', breaks=c(0.01, 0.05, 0.1, 0.5, 0.9))+
geom_text(color="black", size=1.5) +
labs(title=paste(i),
subtitle=paste(Condition2,"vs",Condition1)) +
ylim(((Reduce(min,Graph$Log2FC))-0.5),((Reduce(max,Graph$Log2FC))+0.5)) +
theme_bw() +
coord_flip()+
theme(plot.title = element_text(color = "black", size = 12, face = "bold"),
plot.subtitle = element_text(color = "black", size=10),
plot.caption = element_text(color = "black",size=9, face = "italic", hjust = 2.5))+
labs(y="Log2FC (TvN)", x="")
ggsave(file=paste("Figures/LollipopGraphs/", i,"_",Condition2,"vs",Condition1, ".pdf", sep=""), plot=Dotplot1, width=10, height=12)
plot(Dotplot1)
#}
}
}
#Plot
LollipopGraph_p.adj(DF_Condition1=DMA_TvN_Early,
DF_Condition2=DMA_TvN_Late,
Condition1= "Early",
Condition2="Late",
PathwayInfo=PathwayInfo)
Old versus Young
Each patient has a βmatching indexβ that assigns a normal sample to a
tumour sample. First we calculate the Log2FC comparing tumour versus
normal of young (Age at surgery <42) and old (Age at surgery >58).
Mann-Whitney U tests were used to identify metabolites significantly
higher or lower (Benjamini Hochberg corrected p value <0.05, absolute
Log2FC >0.5).
Noteworthy: We only have 8 young patients, whilst we have 88 old
patients.
#Single replicates for STAT:
Stat_Normal_Young <- subset(`Data`, Age == "Young" & TISSUE_TYPE=="NORMAL", select=c(16:591))
Stat_Tumour_Young <- subset(`Data`, Age == "Young" & TISSUE_TYPE=="TUMOR", select=c(16:591))
Stat_Normal_Old <- subset(`Data`, Age == "Old" & TISSUE_TYPE=="NORMAL", select=c(16:591))
Stat_Tumour_Old <- subset(`Data`, Age == "Old" & TISSUE_TYPE=="TUMOR", select=c(16:591))
#Means of the replicates
Data <- unite(Data, col="Mean", c("TISSUE_TYPE","Age"), sep="_",remove=FALSE)#Merge Stage and Tissue Type
Data_Means1 <- (`Data` [,16:591]) %>%
group_by(`Data`$Mean) %>%
summarise_all(funs(mean))%>%
rename("Mean"="Data$Mean")
Mean_Normal_Young <- subset(Data_Means1, Mean == "NORMAL_Young", select=c(2:577))
Mean_Tumour_Young <- subset(Data_Means1, Mean == "TUMOR_Young" , select=c(2:577))
Mean_Normal_Old <- subset(Data_Means1, Mean == "NORMAL_Old" , select=c(2:577))
Mean_Tumour_Old <- subset(Data_Means1, Mean == "TUMOR_Old" , select=c(2:577))
#DMA
DMA_TvN_Young <- DMA(Log2FC_Condition1=Mean_Tumour_Young,
Log2FC_Condition2=Mean_Normal_Young,
Stat_Condition1=Stat_Tumour_Young,
Stat_Condition2=Stat_Normal_Young ,
Output="TvN_Young")
DMA_TvN_Old <- DMA(Log2FC_Condition1=Mean_Tumour_Old,
Log2FC_Condition2=Mean_Normal_Old,
Stat_Condition1=Stat_Tumour_Old,
Stat_Condition2=Stat_Normal_Old ,
Output="TvN_Old")
# Merge the two DF's
DMA_TvN_Age <- merge(DMA_TvN_Young, DMA_TvN_Old, by="Metabolite", all=TRUE)
# calculate the distance between Early and late stage. Meaning Early Stage is made 0.
DMA_TvN_Age$`Log2FC_Young-Old` <- ifelse(DMA_TvN_Age$`Log2FC_ TvN_Young`>=0,
(DMA_TvN_Age$`Log2FC_ TvN_Old`)-(DMA_TvN_Age$`Log2FC_ TvN_Young`),
(DMA_TvN_Age$`Log2FC_ TvN_Old`)+ (abs(DMA_TvN_Age$`Log2FC_ TvN_Young`)))
#Add Pathways as assigned by metabolon:
DMA_TvN_Age <- merge (x=DMA_TvN_Age, y=PathwayInfo[,2:4], by.x="Metabolite", by.y="BIOCHEMICAL NAME", all.x=TRUE)
write.csv(DMA_TvN_Age,"OutputData/Sheet4_DMA_TvN_Age.csv", row.names = FALSE)
Volcano Plots Pathways
#Plot
VolcanoPlot(DF_Condition1=DMA_TvN_Young,
DF_Condition2=DMA_TvN_Old,
Condition1= "Young",
Condition2="Old",
PathwayInfo=PathwayInfo)
Diverging lollipop graphs
#Plot
LollipopGraph_p.adj(DF_Condition1=DMA_TvN_Young,
DF_Condition2=DMA_TvN_Old,
Condition1= "Young",
Condition2="Old",
PathwayInfo=PathwayInfo)
Information about packge used and versions
sessionInfo()
## R version 4.2.0 (2022-04-22 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252
## [2] LC_CTYPE=English_United Kingdom.1252
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] EnhancedVolcano_1.14.0 ggrepel_0.9.1 gtools_3.9.2.2
## [4] viridis_0.6.2 viridisLite_0.4.0 RColorBrewer_1.1-3
## [7] ggfortify_0.4.14 devtools_2.4.3 usethis_2.1.6
## [10] rmarkdown_2.14 forcats_0.5.1 stringr_1.4.0
## [13] dplyr_1.0.9 purrr_0.3.4 readr_2.1.2
## [16] tidyr_1.2.0 tibble_3.1.7 ggplot2_3.3.6
## [19] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] fs_1.5.2 lubridate_1.8.0 httr_1.4.3 rprojroot_2.0.3
## [5] tools_4.2.0 backports_1.4.1 bslib_0.3.1 utf8_1.2.2
## [9] R6_2.5.1 DBI_1.1.3 colorspace_2.0-3 withr_2.5.0
## [13] tidyselect_1.1.2 gridExtra_2.3 prettyunits_1.1.1 processx_3.6.1
## [17] compiler_4.2.0 cli_3.3.0 rvest_1.0.2 xml2_1.3.3
## [21] desc_1.4.1 labeling_0.4.2 sass_0.4.1 scales_1.2.0
## [25] callr_3.7.0 digest_0.6.29 pkgconfig_2.0.3 htmltools_0.5.2
## [29] sessioninfo_1.2.2 dbplyr_2.2.0 fastmap_1.1.0 highr_0.9
## [33] rlang_1.0.2 readxl_1.4.0 rstudioapi_0.13 jquerylib_0.1.4
## [37] generics_0.1.2 farver_2.1.0 jsonlite_1.8.0 magrittr_2.0.3
## [41] Rcpp_1.0.8.3 munsell_0.5.0 fansi_1.0.3 lifecycle_1.0.1
## [45] stringi_1.7.6 yaml_2.3.5 brio_1.1.3 pkgbuild_1.3.1
## [49] grid_4.2.0 crayon_1.5.1 haven_2.5.0 hms_1.1.1
## [53] knitr_1.39 ps_1.7.1 pillar_1.7.0 pkgload_1.2.4
## [57] reprex_2.0.1 glue_1.6.2 evaluate_0.15 remotes_2.4.2
## [61] modelr_0.1.8 vctrs_0.4.1 tzdb_0.3.0 testthat_3.1.4
## [65] cellranger_1.1.0 gtable_0.3.0 assertthat_0.2.1 cachem_1.0.6
## [69] xfun_0.31 broom_0.8.0 memoise_2.0.1 ellipsis_0.3.2