Notebook RCM Part 1 ccRCC Figure 1
Plots for figure 1¶
- Correlation between RNA and DNA methylation
- Correlation between RNA and protein
- ORA overlap between protein DNA methylation and RNA
We also want to calculate the DNA methylation Beta value as opposed to just the M value change since we want to mainly use this!
In [1]:
# Need to add in entrez gene ID and also make labels for genes for ORA
# Imports
from scircm import * # Note if you have a mac M1 use from sircle import * and you won't be able to do 7,8
import seaborn as sns
from sciutil import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import os
# Setup file locations and label of the cancer
u = SciUtil()
cancer = 'ClearCellRenalCellCarcinoma'
input_dir = 'Input_RCM'
output_dir = 'Output_Data'
supp_dir = 'Required_Refs'
fig_dir = 'Output_Figures'
regLabel = 'RG2_Changes_filtered'
# Use same files as in RCM
DNA_methylation_file = os.path.join('Input_Methylation', f'{cancer}_Filtered_DCpG.csv')
RNA_file = os.path.join('Input_RNAseq', f'{cancer}_filtered_DE_RNA.csv')
protein_file = os.path.join('Input_Protein', f'{cancer}_filtered_DA_Protein.csv')
# Names of columns in each of the above files
logFC_RNA_column = "logFC_rna"
padj_RNA_column = "padj_rna"
betadiff_DNA_methylation_column = "beta_diff"
padj_DNA_methylation_column = 'adj.P.Val'
logFC_protein_column = "logFC_protein"
padj_protein_column = "padj_protein"
# NOTE: all of the above files MUST have this column i.e. they must all have the samely named gene ID column
gene_id_column = "gene_name"
# Use same cutoffs as in RCM
rna_padj_cutoff = 0.05
prot_padj_cutoff = 0.05
meth_padj_cutoff = 0.05
rna_logfc_cutoff = 1.0
prot_logfc_cutoff = 0.5
meth_diff_cutoff = 0.1
# Join the DCpG file with the epic manifest and then filter the file
# Annotate the gene names to entrez gene IDs using annotation file from HG38
annot = pd.read_csv(os.path.join(supp_dir, 'hsapiens_gene_ensembl-GRCh38.p13.csv'))
annot = annot.dropna(subset=['external_gene_name', 'entrezgene_id'])
annot = annot.drop_duplicates(subset='external_gene_name')
name_to_entrez = dict(zip(annot.external_gene_name, annot.entrezgene_id))
# m_df = pd.read_csv(DNA_methylation_file)
# m_df.rename(columns={'logFC': 'M_diff'}, inplace=True)
# # Then also calculate beta value change for each summarised CpG.
# # Use the information in the sample files.
# methylation_sample_file = pd.read_csv(os.path.join('Input_Methylation', f'{cancer}_filtered_samples_CpG.csv'))
# tumour_samples = list(methylation_sample_file[methylation_sample_file['CondID'] == 1]['Sample'].values)
# normal_samples = list(methylation_sample_file[methylation_sample_file['CondID'] == 0]['Sample'].values)
# beta_diff = np.mean(m_df[tumour_samples].values, axis=1) - np.mean(m_df[normal_samples].values, axis=1)
# m_df['beta_diff'] = beta_diff
# m_df.to_csv(DNA_methylation_file, index=False)
2023-12-22 06:30:18.820110: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. /var/folders/gq/6ljhmvm1713fykdjqbl188pm0000gn/T/ipykernel_45165/1950953522.py:49: DtypeWarning: Columns (2) have mixed types. Specify dtype option on import or set low_memory=False. annot = pd.read_csv(os.path.join(supp_dir, 'hsapiens_gene_ensembl-GRCh38.p13.csv'))
Figure 1A¶
Plot and calculate the DNA methylation and protein correlation
In [8]:
# RNA and Methylation correlation
from sciviso import *
from scipy.stats import pearsonr, spearmanr
from sciutil import *
import matplotlib.pyplot as plt
u = SciUtil()
m_df = pd.read_csv(DNA_methylation_file)
df = pd.read_csv(RNA_file)
rg = df[abs(df[logFC_RNA_column]) > rna_logfc_cutoff]
rg = rg[rg[padj_RNA_column] < rna_padj_cutoff][gene_id_column].values
mg = m_df[abs(m_df[betadiff_DNA_methylation_column]) > meth_diff_cutoff]
mg = mg[mg[padj_DNA_methylation_column] < rna_padj_cutoff][gene_id_column].values
u.warn_p(['RNA', 'Methylation', 'Overlap', len(set(mg) & set(rg))])
m_df.set_index(gene_id_column, inplace=True)
df.set_index(gene_id_column, inplace=True)
# Join on gene name
joined_df = m_df.join(df, how='inner')
# Correlation
u.dp(['Pearsons R:', pearsonr(joined_df[betadiff_DNA_methylation_column].values, joined_df[logFC_RNA_column].values)])
u.dp(['Spearmans rho:', spearmanr(joined_df[betadiff_DNA_methylation_column].values, joined_df[logFC_RNA_column].values)])
# plot and print
sc = Scatterplot(joined_df, betadiff_DNA_methylation_column, logFC_RNA_column,
'DNA & RNA', 'DNA M. diff', 'RNA logFC',
config={'figsize': (1.5, 1.5), 'opacity': 0.2},
colour='black')
sc.plot()
plt.savefig(os.path.join(fig_dir, f'Figure1A_MethylationRNA.svg'))
plt.show()
# Do the same with RNA and protein
p_df = pd.read_csv(protein_file)
df = pd.read_csv(RNA_file)
pg = p_df[abs(p_df[logFC_protein_column]) > prot_logfc_cutoff]
pg = pg[pg[padj_protein_column] < prot_padj_cutoff][gene_id_column].values
u.warn_p(['RNA', 'Protein', 'Overlap', len(set(pg) & set(rg))])
p_df.set_index(gene_id_column, inplace=True)
df.set_index(gene_id_column, inplace=True)
# Join on gene name
joined_df = p_df.join(df, how='inner')
# Correlation
u.dp(['Pearsons R:', pearsonr(joined_df[logFC_protein_column].values, joined_df[logFC_RNA_column].values)])
u.dp(['Spearmans rho:', spearmanr(joined_df[logFC_protein_column].values, joined_df[logFC_RNA_column].values)])
# plot and print
sc = Scatterplot(joined_df, logFC_protein_column, logFC_RNA_column, 'Protein & RNA',
'Protein logFC', 'RNA logFC',
config={'figsize': (1.5, 1.5), 'opacity': 0.2},
colour='black')
sc.plot()
plt.savefig(os.path.join(fig_dir, f'Figure1A_ProteinRNA.svg'))
/var/folders/gq/6ljhmvm1713fykdjqbl188pm0000gn/T/ipykernel_45165/2314390714.py:9: DtypeWarning: Columns (212) have mixed types. Specify dtype option on import or set low_memory=False. m_df = pd.read_csv(DNA_methylation_file)
-------------------------------------------------------------------------------- RNA Methylation Overlap 3083 -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Pearsons R: PearsonRResult(statistic=-0.17708982942476623, pvalue=2.5749622958861697e-84) -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Spearmans rho: SignificanceResult(statistic=-0.21294219973619416, pvalue=6.531248483400743e-122) --------------------------------------------------------------------------------
/Users/ariane/opt/miniconda3/envs/roundround/lib/python3.10/site-packages/sciviso/scatterplot.py:114: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored scatter = ax.scatter(vis_df[x].values, vis_df[y].values, c=self.colour, alpha=self.opacity,
-------------------------------------------------------------------------------- RNA Protein Overlap 1284 -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Pearsons R: PearsonRResult(statistic=0.6841598501434772, pvalue=0.0) -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Spearmans rho: SignificanceResult(statistic=0.6921363331677398, pvalue=0.0) --------------------------------------------------------------------------------
/Users/ariane/opt/miniconda3/envs/roundround/lib/python3.10/site-packages/sciviso/scatterplot.py:114: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored scatter = ax.scatter(vis_df[x].values, vis_df[y].values, c=self.colour, alpha=self.opacity,
In [9]:
len(set(pg) & set(rg) & set(mg))
Out[9]:
796
In [10]:
from matplotlib_venn import venn3
venn3([set(pg), set(rg), set(mg)], set_labels=['Proteomics', 'RNAseq', 'DNA Methylation'],
set_colors=('#0101d7', '#e68e25', '#6aaf44'),)
plt.rcParams['svg.fonttype'] = 'none'
plt.title('Overlap of all sig genes')
plt.savefig(os.path.join(fig_dir, f'Figure1_geneoverlap.svg'))
plt.show()
Figure 1B¶
Plot and visualise the overlap of ORA for the top terms, first we need to make "groups" for the different ORAs and top genes.
We do this for each of the above files (i.e. DNA methylation, gene expression and protein)
In [8]:
# -------------------------------------------------------------------
# Make ORA groups for DNA methylation
# -------------------------------------------------------------------
shared_genes = []
# Add in entrez gene ID and a label for each gene for DNA methylation
m_df = pd.read_csv(DNA_methylation_file)
entrez, labels = [], []
padjs, logfcs = m_df[padj_DNA_methylation_column].values, m_df[betadiff_DNA_methylation_column].values
for i, gene in enumerate(m_df[gene_id_column].values):
label = 'NA'
entrez.append(name_to_entrez.get(gene))
if padjs[i] < meth_padj_cutoff:
if logfcs[i] < (-1 * meth_diff_cutoff):
label = 'Hypo' # Hypo methylation
elif logfcs[i] > meth_diff_cutoff:
label = 'Hyper' # Hyper methylation i.e. "maybe" driving down reg.
labels.append(label)
m_df['ORA_label'] = labels
m_df['entrezgene_id'] = entrez
m_df = m_df[[gene_id_column, 'entrezgene_id', 'ORA_label', padj_DNA_methylation_column,
betadiff_DNA_methylation_column]]
m_df = m_df.dropna()
u.dp(["methylation"])
m_genes = list(m_df[m_df.ORA_label != 'NA'][gene_id_column].values)
print(m_df.ORA_label.value_counts())
m_df.to_csv(os.path.join(output_dir, f'DNAMethylation_{cancer}_ORA.csv'), index=False)
# -------------------------------------------------------------------
# Make ORA groups for Gene expression
# -------------------------------------------------------------------
df = pd.read_csv(RNA_file)
entrez, labels = [], []
padjs, logfcs = df[padj_RNA_column].values, df[logFC_RNA_column].values
for i, gene in enumerate(df[gene_id_column].values):
label = 'NA'
entrez.append(name_to_entrez.get(gene))
if padjs[i] < rna_padj_cutoff:
if logfcs[i] < (-1 * rna_logfc_cutoff):
label = 'DOWN' # Repressed
elif logfcs[i] > rna_logfc_cutoff:
label = 'UP' # Increased Expression
labels.append(label)
df['ORA_label'] = labels
df['entrezgene_id'] = entrez
df = df[[gene_id_column, 'entrezgene_id', 'ORA_label', padj_RNA_column, logFC_RNA_column]]
df = df.dropna()
u.dp(["RNA"])
r_genes = list(df[df.ORA_label != 'NA'][gene_id_column].values)
print(df.ORA_label.value_counts())
df.to_csv(os.path.join(output_dir, f'RNA_{cancer}_ORA.csv'), index=False)
# -------------------------------------------------------------------
# Make ORA groups for Protein
# -------------------------------------------------------------------
df = pd.read_csv(protein_file)
entrez, labels = [], []
padjs, logfcs = df[padj_protein_column].values, df[logFC_protein_column].values
for i, gene in enumerate(df[gene_id_column].values):
label = 'NA'
entrez.append(name_to_entrez.get(gene))
if padjs[i] < prot_padj_cutoff:
if logfcs[i] < (-1 * prot_logfc_cutoff):
label = 'DOWN' # Repressed
elif logfcs[i] > prot_logfc_cutoff:
label = 'UP' # Increased expression
labels.append(label)
df['ORA_label'] = labels
df['entrezgene_id'] = entrez
df = df[[gene_id_column, 'entrezgene_id', 'ORA_label', padj_protein_column, logFC_protein_column]]
df = df.dropna()
u.dp(["Protein"])
print(df.ORA_label.value_counts())
df.to_csv(os.path.join(output_dir, f'Protein_{cancer}_ORA.csv'), index=False)
p_genes = list(df[df.ORA_label != 'NA'][gene_id_column].values)
/var/folders/gq/6ljhmvm1713fykdjqbl188pm0000gn/T/ipykernel_92972/256051197.py:7: DtypeWarning: Columns (212) have mixed types. Specify dtype option on import or set low_memory=False. m_df = pd.read_csv(DNA_methylation_file)
-------------------------------------------------------------------------------- methylation -------------------------------------------------------------------------------- ORA_label Hypo 7300 NA 4416 Hyper 3080 Name: count, dtype: int64 -------------------------------------------------------------------------------- RNA -------------------------------------------------------------------------------- ORA_label NA 12794 UP 3022 DOWN 2384 Name: count, dtype: int64 -------------------------------------------------------------------------------- Protein -------------------------------------------------------------------------------- ORA_label NA 5903 DOWN 1665 UP 1034 Name: count, dtype: int64
In [9]:
print(len(set(m_genes) & set(r_genes) & set(p_genes)))
823
Look at overlap now (after going to R and running SiRCle ORA)¶
Code to run ORA on each of the above files:
{r}
library(EnhancedVolcano)
library(org.Hs.eg.db)#For Human load:
library(clusterProfiler)#To run ORA
library(enrichplot)#For emapplot, dotplot,...
library(ggplot2)#For saving and legends
library(tidyverse) # used for data manipulation
#Establish a couple of functions needed to perform the GSEA (made by Aurelien)
library(fgsea)
library(GSEABase)
# ----------------------------------------------------------------------------------------------
# File and naming setup
# ----------------------------------------------------------------------------------------------
GSEA_file_dir <- 'Required_Refs/GSEA/' # Will need to change the path if you're on windows
figure_dir <- 'Output_Figures'
input_data_dir <- 'Output_Data'
output_data_dir <- 'Output_Data/ORA/'
cancer <- 'Renal_cell_carcinoma__NOS'
sircleFileName <- file.path(input_data_dir, paste0('sircle_PorMandR_', cancer, '.csv'))
# Run the ORA
#Import the pathways:
KEGG <- read.csv(file.path(GSEA_file_dir, "c2.cp.kegg.v6.2.symbols.csv"))
Reactome <- read.csv(file.path(GSEA_file_dir, "c2.cp.reactome.v6.2.symbols.csv"))
Biocarta <- read.csv(file.path(GSEA_file_dir, "c2.cp.biocarta.v6.2.symbols.csv"))
Hallmarks <- read.csv(file.path(GSEA_file_dir, "h.all.v6.2.symbols.csv"))
GO_BP <- read.csv(file.path(GSEA_file_dir, "c5.go.bp.v7.2.symbols.csv"))
GO_CC <- read.csv(file.path(GSEA_file_dir, "c5.go.cc.v7.2.symbols.csv"))
GO_MF <- read.csv(file.path(GSEA_file_dir, "c5.go.mf.v7.2.symbols.csv"))
Metabolic <- read.csv(file.path(GSEA_file_dir, "41467_2016_BFncomms13041_MOESM340_ESM.csv"))
Correction <- read.csv(file.path(GSEA_file_dir, "41467_2016_BFncomms13041_MOESM340_ESM.csv"))
#Run the GSEA analysis
##1."KEGG", "Reactome", "Biocarta", "Hallmarks"
pathways <- rbind(KEGG, Reactome, Biocarta, Hallmarks)
pathway_list <- list()
for (pathway in unique(pathways$term)) {
pathway_list[[pathway]] <- as.character(pathways[pathways$term == pathway, 1])
}
sircleORAHuman <- function(filename, entrezId, title, regLabels="RegulatoryLabels", emptyRegLabel="", fileType="pdf",
minGSSize=10, qvalueCutoff=0.2, pvalueCutoff=0.05, showCatagory=30, outputFolder='') {
## ------------ Setup and installs ----------- ##
packages <- c("org.Hs.eg.db", "clusterProfiler", "svglite", "enrichplot")
install.packages(setdiff(packages, rownames(installed.packages())))
library(org.Hs.eg.db)
library(clusterProfiler)
library(svglite)
library(enrichplot)
## ------------ Run ----------- ##
# open the data
df <- read.csv(filename)
allGenes <- as.character(df[[entrezId]]) #
clusterGenes <- subset(df, ! df[[regLabels]] == emptyRegLabel)
clusterGenes <- subset(clusterGenes, ! clusterGenes[[regLabels]] == "Not-Background")
grps_labels <- unlist(unique(clusterGenes[regLabels]))
for(g in grps_labels) {
grpGenes <- subset(df, df[[regLabels]] == g)
print(g)
print(dim(grpGenes))
clusterGo <- enrichGO(gene = as.character(grpGenes[[entrezId]]),
universe = allGenes,
keyType = "ENTREZID",
OrgDb = org.Hs.eg.db,
ont = "ALL",
pAdjustMethod = "BH",
qvalueCutoff = 1.0,
pvalueCutoff = 1.0,
minGSSize = minGSSize,
readable = TRUE)
# We have a cutoff of all of them, and then only visualise the ones that the user wants...
clusterGoSummary <- data.frame(clusterGo)
write.csv(clusterGoSummary, paste(outputFolder, 'ClusterGoSummary_', g, title, '.csv', sep=""))#Export the ORA results as .csv
if (!(dim(clusterGoSummary)[1] == 0)) {#exclude df's that have no observations
Dotplot <- dotplot(clusterGo, showCategory=showCatagory) +
ggtitle(paste("Dotplot ", g, sep=""))
ggsave(file=paste(outputFolder, "SiRCle-ORA_Dotplot_Human_", g, title, ".", fileType, sep=""), plot=Dotplot, width=10, height=8)
x2 <- pairwise_termsim(clusterGo)
Emapplot <- emapplot(x2, pie_scale=1.5, layout = "nicely")+
ggtitle(paste("Emapplot ", g, sep=""))
ggsave(file=paste(outputFolder, "SiRCle-ORA_Emapplot_Human_", g, title, ".", fileType, sep="" ), plot=Emapplot, width=10, height=8)
Heatplot <- heatplot(clusterGo,showCategory=showCatagory) +
theme(axis.text.x =element_text(size=5), axis.text.y =element_text(size=8,face="bold"), axis.title=element_text(size=12,face="bold"))+
ggtitle(paste("Heatplot ", g, sep=""))
ggsave(file=paste(outputFolder, "SiRCle-ORA_Heatplot_Human_", g,title, ".", fileType, sep="" ), plot=Heatplot, width=10, height=8)
}
}
}
sircleFileName <- file.path(input_data_dir, paste0('Protein_', cancer, '_ORA.csv'))
test_title <- '_protein'
sircleORAHuman(sircleFileName, "entrezgene_id", test_title, 'ORA_label', emptyRegLabel="None", minGSSize=10, qvalueCutoff=0.1, outputFolder=output_data_dir)
sircleFileName <- file.path(input_data_dir, paste0('RNA_', cancer, '_ORA.csv'))
test_title <- '_RNA'
sircleORAHuman(sircleFileName, "entrezgene_id", test_title, 'ORA_label', emptyRegLabel="None", minGSSize=10, qvalueCutoff=0.1, outputFolder=output_data_dir)
sircleFileName <- file.path(input_data_dir, paste0('DNAMethylation_', cancer, '_ORA.csv'))
test_title <- '_Methylation'
sircleORAHuman(sircleFileName, "entrezgene_id", test_title, 'ORA_label', emptyRegLabel="None", minGSSize=10, qvalueCutoff=0.1, outputFolder=output_data_dir)
In [4]:
from matplotlib_venn import venn3
ora_dir = os.path.join(output_dir, 'ORA')
# Get the overlapping GO terms that were significant
p_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_DOWN_protein.csv'))
p_go = p_go[p_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
m_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_Hyper_Methylation.csv'))
m_go = m_go[m_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
r_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_DOWN_RNA.csv'))
r_go = r_go[r_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
venn3([set(p_go), set(r_go), set(m_go)], set_labels=['Proteomics', 'RNAseq', 'DNA Methylation'],
set_colors=('#0101d7', '#e68e25', '#6aaf44'),)
plt.rcParams['svg.fonttype'] = 'none'
plt.title('Down')
plt.savefig(os.path.join(fig_dir, f'Figure1B_DownGO.svg'))
plt.show()
# Get the overlapping GO terms that were significant
p_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_UP_protein.csv'))
p_go = p_go[p_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
m_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_Hypo_Methylation.csv'))
m_go = m_go[m_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
r_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_UP_RNA.csv'))
r_go = r_go[r_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
venn3([set(p_go), set(r_go), set(m_go)], set_labels=['Proteomics', 'RNAseq', 'DNA Methylation'],
set_colors=('#0101d7', '#e68e25', '#6aaf44'),)
plt.rcParams['svg.fonttype'] = 'none'
plt.title('UP')
plt.savefig(os.path.join(fig_dir, 'Figure1B_UPGO.svg'))
In [5]:
import pandas as pd
from sciutil import SciUtil
from matplotlib_venn import venn3
import matplotlib.pyplot as plt
u = SciUtil()
test_title = 'PanCan'
comparisons = {'Down regulated': ['ClusterGoSummary_DOWN_RNA.csv',
'ClusterGoSummary_DOWN_protein.csv',
'ClusterGoSummary_Hyper_Methylation.csv'
],
'Up regulated': ['ClusterGoSummary_UP_RNA.csv',
'ClusterGoSummary_UP_protein.csv',
'ClusterGoSummary_Hypo_Methylation.csv'
]
}
for c in comparisons:
rna_df = pd.read_csv(os.path.join(ora_dir, f'{comparisons[c][0]}'))
protein_df = pd.read_csv(os.path.join(ora_dir, f'{comparisons[c][1]}'))
cpg_df = pd.read_csv(os.path.join(ora_dir, f'{comparisons[c][2]}'))
# Look at significant terms
rna_df = rna_df[rna_df['p.adjust'] < 0.05]
protein_df = protein_df[protein_df['p.adjust'] < 0.05]
cpg_df = cpg_df[cpg_df['p.adjust'] < 0.05]
u.dp(['RNA sig:', len(rna_df), 'Protein sig:', len(protein_df), 'CpG:', len(cpg_df)])
venn3([set(protein_df.ID.values), set(rna_df.ID.values), set(cpg_df.ID.values)], set_labels=['Proteomics GO', 'RNAseq GO',
'DNA Methylation GO'],
set_colors=('#0101d7', '#e68e25', '#6aaf44'),)
plt.rcParams['svg.fonttype'] = 'none'
plt.title(c)
plt.show()
plt.savefig(f'{output_dir}{test_title}_venn.svg')
-------------------------------------------------------------------------------- RNA sig: 650 Protein sig: 558 CpG: 445 --------------------------------------------------------------------------------
-------------------------------------------------------------------------------- RNA sig: 1256 Protein sig: 902 CpG: 846 --------------------------------------------------------------------------------
<Figure size 150x150 with 0 Axes>
In [6]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.cm as cm
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (3,10)
sns.set_theme(style="whitegrid")
cmaps = {'Down regulated': {'CpG': 'darkorange', 'Protein': 'royalblue', 'RNA': 'teal'},
'Up regulated': {'CpG': 'limegreen', 'Protein': 'darkred', 'RNA': 'lightcoral'},
}
for c in comparisons:
combined_df = pd.DataFrame()
rna_df = pd.read_csv(os.path.join(ora_dir, f'{comparisons[c][0]}'))
rna_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in rna_df['GeneRatio'].values]
rna_df['Data'] = 'RNA'
rna_df = rna_df[rna_df['p.adjust'] < 0.05]
rna_df.sort_values('p.adjust', inplace=True)
protein_df = pd.read_csv(os.path.join(ora_dir, f'{comparisons[c][1]}'))
protein_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in protein_df['GeneRatio'].values]
protein_df['Data'] = 'Protein'
protein_df = protein_df[protein_df['p.adjust'] < 0.05]
protein_df.sort_values('p.adjust', inplace=True)
cpg_df = pd.read_csv(os.path.join(ora_dir, f'{comparisons[c][2]}'))
cpg_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in cpg_df['GeneRatio'].values]
cpg_df['Data'] = 'CpG'
cpg_df = cpg_df[cpg_df['p.adjust'] < 0.05]
cpg_df.sort_values('p.adjust', inplace=True)
combined_df = pd.concat([combined_df, rna_df])
combined_df = pd.concat([combined_df, cpg_df])
combined_df = pd.concat([combined_df, protein_df])
# Concat all together
num = 10
rna_df = pd.concat([rna_df.head(num), cpg_df.head(num), protein_df.head(num)])
rna_df.sort_values('p.adjust', inplace=True)
max_c = max(rna_df['Count'].values)
min_c = min(rna_df['Count'].values)
size = [int(1 + ((int(c) - min_c)/(max_c - min_c))*300) for c in rna_df['Count'].values]
norm = mpl.colors.Normalize(vmin=np.min(rna_df['p.adjust'].values), vmax=np.max(rna_df['p.adjust'].values))
cmap = cm.RdBu
m = cm.ScalarMappable(norm=norm, cmap=cmap)
c_map = cmaps[c]
colours = [c_map.get(v) for v in rna_df['Data'].values]
plt.scatter(rna_df['GeneRatio'].values, rna_df['Description'].values, s=size, c=colours)#c=m.to_rgba(rna_df['p.adjust'].values))
plt.title(c)
combined_df.to_csv(f'{output_dir}{c}_StageIV-StageI-Tumour.csv', index=False)
gmin = plt.scatter([], [], s=10 + int(300*(min_c/(max_c - min_c))), marker='o', color='#222')
#gmid = plt.scatter([], [], s=int(10 + 300*(((max_c - min_c)/2)/max_c)), marker='o', color='#222')
gmax = plt.scatter([], [], s=10 + int(300*(1)), marker='o', color='#222')
legend = plt.legend((gmin, gmax),
(str(min_c), str(max_c)),
scatterpoints=1,
loc='lower left',
ncol=1,
fontsize=8, bbox_to_anchor=(0, -0.1))
legend.set_title("No. Genes")
plt.gca().add_artist(legend)
plt.savefig(f'{output_dir}DotPlot_GO_{c}.svg')
plt.show()
#sns.stripplot(data=rna_df.head(30), y='Description', x='GeneRatio', size=size) #, color='p.adjust', size='Count')
In [ ]: