Notebook RCM Part 1 ccRCC Figure 5b
Plots for S.figure 4 Early and Late independently¶
- Correlation between RNA and DNA methylation
- Correlation between RNA and protein
- ORA overlap between protein DNA methylation and RNA
We also want to calculate the DNA methylation Beta value as opposed to just the M value change since we want to mainly use this!
In [1]:
# Need to add in entrez gene ID and also make labels for genes for ORA
# Imports
from scircm import * # Note if you have a mac M1 use from sircle import * and you won't be able to do 7,8
import seaborn as sns
from sciutil import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import os
# Setup file locations and label of the cancer
u = SciUtil()
cancer = 'ClearCellRenalCellCarcinoma'
input_dir = 'Input_RCM'
output_dir = 'Output_Data'
supp_dir = 'Required_Refs'
fig_dir = 'Output_Figures'
regLabel = 'RG2_Changes_filtered'
# Use same files as in RCM
DNA_methylation_file = os.path.join('Input_Methylation', f'{cancer}_filtered_DCpG_Early.csv')
RNA_file = os.path.join('Input_RNAseq', f'{cancer}_filtered_DE_RNA_Early.csv')
protein_file = os.path.join('Input_Protein', f'{cancer}_filtered_DA_Protein_Early.csv')
# Names of columns in each of the above files
logFC_RNA_column = "logFC_rna"
padj_RNA_column = "padj_rna"
betadiff_DNA_methylation_column = "beta_diff"
padj_DNA_methylation_column = 'adj.P.Val'
logFC_protein_column = "logFC_protein"
padj_protein_column = "padj_protein"
# NOTE: all of the above files MUST have this column i.e. they must all have the samely named gene ID column
gene_id_column = "gene_name"
# Use same cutoffs as in RCM
rna_padj_cutoff = 0.05
prot_padj_cutoff = 0.05
meth_padj_cutoff = 0.05
rna_logfc_cutoff = 1.0
prot_logfc_cutoff = 0.5
meth_diff_cutoff = 0.1
# Join the DCpG file with the epic manifest and then filter the file
# Annotate the gene names to entrez gene IDs using annotation file from HG38
annot = pd.read_csv(os.path.join(supp_dir, 'hsapiens_gene_ensembl-GRCh38.p13.csv'))
annot = annot.dropna(subset=['external_gene_name', 'entrezgene_id'])
annot = annot.drop_duplicates(subset='external_gene_name')
name_to_entrez = dict(zip(annot.external_gene_name, annot.entrezgene_id))
2023-12-22 06:45:20.800281: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. /var/folders/gq/6ljhmvm1713fykdjqbl188pm0000gn/T/ipykernel_45752/3248147200.py:49: DtypeWarning: Columns (2) have mixed types. Specify dtype option on import or set low_memory=False. annot = pd.read_csv(os.path.join(supp_dir, 'hsapiens_gene_ensembl-GRCh38.p13.csv'))
Figure 1A¶
Plot and calculate the DNA methylation and protein correlation
In [2]:
# RNA and Methylation correlation
from sciviso import *
from scipy.stats import pearsonr, spearmanr
from sciutil import *
import matplotlib.pyplot as plt
u = SciUtil()
m_df = pd.read_csv(DNA_methylation_file)
df = pd.read_csv(RNA_file)
m_df.set_index(gene_id_column, inplace=True)
df.set_index(gene_id_column, inplace=True)
# Join on gene name
joined_df = m_df.join(df, how='inner')
# Correlation
u.dp(['Pearsons R:', pearsonr(joined_df[betadiff_DNA_methylation_column].values, joined_df[logFC_RNA_column].values)])
u.dp(['Spearmans rho:', spearmanr(joined_df[betadiff_DNA_methylation_column].values, joined_df[logFC_RNA_column].values)])
# plot and print
sc = Scatterplot(joined_df, betadiff_DNA_methylation_column, logFC_RNA_column,
'DNA & RNA', 'DNA M. diff', 'RNA logFC',
config={'figsize': (1.5, 1.5), 'opacity': 0.2},
colour='black')
sc.plot()
plt.savefig(os.path.join(fig_dir, f'Figure4A_MethylationRNA_Early.svg'))
plt.show()
# Do the same with RNA and protein
p_df = pd.read_csv(protein_file)
df = pd.read_csv(RNA_file)
p_df.set_index(gene_id_column, inplace=True)
df.set_index(gene_id_column, inplace=True)
# Join on gene name
joined_df = p_df.join(df, how='inner')
# Correlation
u.dp(['Pearsons R:', pearsonr(joined_df[logFC_protein_column].values, joined_df[logFC_RNA_column].values)])
u.dp(['Spearmans rho:', spearmanr(joined_df[logFC_protein_column].values, joined_df[logFC_RNA_column].values)])
# plot and print
sc = Scatterplot(joined_df, logFC_protein_column, logFC_RNA_column, 'Protein & RNA',
'Protein logFC', 'RNA logFC',
config={'figsize': (1.5, 1.5), 'opacity': 0.2},
colour='black')
sc.plot()
plt.savefig(os.path.join(fig_dir, f'Figure4A_ProteinRNA_Early.svg'))
/var/folders/gq/6ljhmvm1713fykdjqbl188pm0000gn/T/ipykernel_45752/1990506602.py:9: DtypeWarning: Columns (212) have mixed types. Specify dtype option on import or set low_memory=False. m_df = pd.read_csv(DNA_methylation_file)
-------------------------------------------------------------------------------- Pearsons R: PearsonRResult(statistic=-0.1785504058856992, pvalue=2.428784126803424e-88) -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Spearmans rho: SignificanceResult(statistic=-0.2158514429521915, pvalue=3.581998789357984e-129) --------------------------------------------------------------------------------
/Users/ariane/opt/miniconda3/envs/roundround/lib/python3.10/site-packages/sciviso/scatterplot.py:114: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored scatter = ax.scatter(vis_df[x].values, vis_df[y].values, c=self.colour, alpha=self.opacity,
-------------------------------------------------------------------------------- Pearsons R: PearsonRResult(statistic=0.6823952035953487, pvalue=0.0) -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Spearmans rho: SignificanceResult(statistic=0.6861706655622867, pvalue=0.0) --------------------------------------------------------------------------------
/Users/ariane/opt/miniconda3/envs/roundround/lib/python3.10/site-packages/sciviso/scatterplot.py:114: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored scatter = ax.scatter(vis_df[x].values, vis_df[y].values, c=self.colour, alpha=self.opacity,
Figure 1B¶
Plot and visualise the overlap of ORA for the top terms, first we need to make "groups" for the different ORAs and top genes.
We do this for each of the above files (i.e. DNA methylation, gene expression and protein)
In [3]:
# -------------------------------------------------------------------
# Make ORA groups for DNA methylation
# -------------------------------------------------------------------
# Add in entrez gene ID and a label for each gene for DNA methylation
m_df = pd.read_csv(DNA_methylation_file)
entrez, labels = [], []
padjs, logfcs = m_df[padj_DNA_methylation_column].values, m_df[betadiff_DNA_methylation_column].values
for i, gene in enumerate(m_df[gene_id_column].values):
label = 'NA'
entrez.append(name_to_entrez.get(gene))
if padjs[i] < meth_padj_cutoff:
if logfcs[i] < (-1 * meth_diff_cutoff):
label = 'Hypo' # Hypo methylation
elif logfcs[i] > meth_diff_cutoff:
label = 'Hyper' # Hyper methylation i.e. "maybe" driving down reg.
labels.append(label)
m_df['ORA_label'] = labels
m_df['entrezgene_id'] = entrez
m_df = m_df[[gene_id_column, 'entrezgene_id', 'ORA_label', padj_DNA_methylation_column,
betadiff_DNA_methylation_column]]
m_df = m_df.dropna()
m_df.to_csv(os.path.join(output_dir, f'DNAMethylation_{cancer}_ORA_Early.csv'), index=False)
# -------------------------------------------------------------------
# Make ORA groups for Gene expression
# -------------------------------------------------------------------
df = pd.read_csv(RNA_file)
entrez, labels = [], []
padjs, logfcs = df[padj_RNA_column].values, df[logFC_RNA_column].values
for i, gene in enumerate(df[gene_id_column].values):
label = 'NA'
entrez.append(name_to_entrez.get(gene))
if padjs[i] < rna_padj_cutoff:
if logfcs[i] < (-1 * rna_logfc_cutoff):
label = 'DOWN' # Repressed
elif logfcs[i] > rna_logfc_cutoff:
label = 'UP' # Increased Expression
labels.append(label)
df['ORA_label'] = labels
df['entrezgene_id'] = entrez
df = df[[gene_id_column, 'entrezgene_id', 'ORA_label', padj_RNA_column, logFC_RNA_column]]
df = df.dropna()
df.to_csv(os.path.join(output_dir, f'RNA_{cancer}_ORA_Early.csv'), index=False)
# -------------------------------------------------------------------
# Make ORA groups for Protein
# -------------------------------------------------------------------
df = pd.read_csv(protein_file)
entrez, labels = [], []
padjs, logfcs = df[padj_protein_column].values, df[logFC_protein_column].values
for i, gene in enumerate(df[gene_id_column].values):
label = 'NA'
entrez.append(name_to_entrez.get(gene))
if padjs[i] < prot_padj_cutoff:
if logfcs[i] < (-1 * prot_logfc_cutoff):
label = 'DOWN' # Repressed
elif logfcs[i] > prot_logfc_cutoff:
label = 'UP' # Increased expression
labels.append(label)
df['ORA_label'] = labels
df['entrezgene_id'] = entrez
df = df[[gene_id_column, 'entrezgene_id', 'ORA_label', padj_protein_column, logFC_protein_column]]
df = df.dropna()
df.to_csv(os.path.join(output_dir, f'Protein_{cancer}_ORA_Early.csv'), index=False)
/var/folders/gq/6ljhmvm1713fykdjqbl188pm0000gn/T/ipykernel_45752/2784331757.py:6: DtypeWarning: Columns (212) have mixed types. Specify dtype option on import or set low_memory=False. m_df = pd.read_csv(DNA_methylation_file)
Look at overlap now (after going to R and running SiRCle ORA)¶
Code to run ORA on each of the above files:
{r}
library(EnhancedVolcano)
library(org.Hs.eg.db)#For Human load:
library(clusterProfiler)#To run ORA
library(enrichplot)#For emapplot, dotplot,...
library(ggplot2)#For saving and legends
library(tidyverse) # used for data manipulation
#Establish a couple of functions needed to perform the GSEA (made by Aurelien)
library(fgsea)
library(GSEABase)
# ----------------------------------------------------------------------------------------------
# File and naming setup
# ----------------------------------------------------------------------------------------------
GSEA_file_dir <- 'Required_Refs/GSEA/' # Will need to change the path if you're on windows
figure_dir <- 'Output_Figures'
input_data_dir <- 'Output_Data'
output_data_dir <- 'Output_Data/ORA/'
cancer <- 'Renal_cell_carcinoma__NOS'
sircleFileName <- file.path(input_data_dir, paste0('sircle_PorMandR_', cancer, '.csv'))
# Run the ORA
#Import the pathways:
KEGG <- read.csv(file.path(GSEA_file_dir, "c2.cp.kegg.v6.2.symbols.csv"))
Reactome <- read.csv(file.path(GSEA_file_dir, "c2.cp.reactome.v6.2.symbols.csv"))
Biocarta <- read.csv(file.path(GSEA_file_dir, "c2.cp.biocarta.v6.2.symbols.csv"))
Hallmarks <- read.csv(file.path(GSEA_file_dir, "h.all.v6.2.symbols.csv"))
GO_BP <- read.csv(file.path(GSEA_file_dir, "c5.go.bp.v7.2.symbols.csv"))
GO_CC <- read.csv(file.path(GSEA_file_dir, "c5.go.cc.v7.2.symbols.csv"))
GO_MF <- read.csv(file.path(GSEA_file_dir, "c5.go.mf.v7.2.symbols.csv"))
Metabolic <- read.csv(file.path(GSEA_file_dir, "41467_2016_BFncomms13041_MOESM340_ESM.csv"))
Correction <- read.csv(file.path(GSEA_file_dir, "41467_2016_BFncomms13041_MOESM340_ESM.csv"))
#Run the GSEA analysis
##1."KEGG", "Reactome", "Biocarta", "Hallmarks"
pathways <- rbind(KEGG, Reactome, Biocarta, Hallmarks)
pathway_list <- list()
for (pathway in unique(pathways$term)) {
pathway_list[[pathway]] <- as.character(pathways[pathways$term == pathway, 1])
}
sircleORAHuman <- function(filename, entrezId, title, regLabels="RegulatoryLabels", emptyRegLabel="", fileType="pdf",
minGSSize=10, qvalueCutoff=0.2, pvalueCutoff=0.05, showCatagory=30, outputFolder='') {
## ------------ Setup and installs ----------- ##
packages <- c("org.Hs.eg.db", "clusterProfiler", "svglite", "enrichplot")
install.packages(setdiff(packages, rownames(installed.packages())))
library(org.Hs.eg.db)
library(clusterProfiler)
library(svglite)
library(enrichplot)
## ------------ Run ----------- ##
# open the data
df <- read.csv(filename)
allGenes <- as.character(df[[entrezId]]) #
clusterGenes <- subset(df, ! df[[regLabels]] == emptyRegLabel)
clusterGenes <- subset(clusterGenes, ! clusterGenes[[regLabels]] == "Not-Background")
grps_labels <- unlist(unique(clusterGenes[regLabels]))
for(g in grps_labels) {
grpGenes <- subset(df, df[[regLabels]] == g)
print(g)
print(dim(grpGenes))
clusterGo <- enrichGO(gene = as.character(grpGenes[[entrezId]]),
universe = allGenes,
keyType = "ENTREZID",
OrgDb = org.Hs.eg.db,
ont = "ALL",
pAdjustMethod = "BH",
qvalueCutoff = 1.0,
pvalueCutoff = 1.0,
minGSSize = minGSSize,
readable = TRUE)
# We have a cutoff of all of them, and then only visualise the ones that the user wants...
clusterGoSummary <- data.frame(clusterGo)
write.csv(clusterGoSummary, paste(outputFolder, 'ClusterGoSummary_', g, title, '.csv', sep=""))#Export the ORA results as .csv
if (!(dim(clusterGoSummary)[1] == 0)) {#exclude df's that have no observations
Dotplot <- dotplot(clusterGo, showCategory=showCatagory) +
ggtitle(paste("Dotplot ", g, sep=""))
ggsave(file=paste(outputFolder, "SiRCle-ORA_Dotplot_Human_", g, title, ".", fileType, sep=""), plot=Dotplot, width=10, height=8)
x2 <- pairwise_termsim(clusterGo)
Emapplot <- emapplot(x2, pie_scale=1.5, layout = "nicely")+
ggtitle(paste("Emapplot ", g, sep=""))
ggsave(file=paste(outputFolder, "SiRCle-ORA_Emapplot_Human_", g, title, ".", fileType, sep="" ), plot=Emapplot, width=10, height=8)
Heatplot <- heatplot(clusterGo,showCategory=showCatagory) +
theme(axis.text.x =element_text(size=5), axis.text.y =element_text(size=8,face="bold"), axis.title=element_text(size=12,face="bold"))+
ggtitle(paste("Heatplot ", g, sep=""))
ggsave(file=paste(outputFolder, "SiRCle-ORA_Heatplot_Human_", g,title, ".", fileType, sep="" ), plot=Heatplot, width=10, height=8)
}
}
}
sircleFileName <- file.path(input_data_dir, paste0('Protein_', cancer, '_ORA.csv'))
test_title <- '_protein'
sircleORAHuman(sircleFileName, "entrezgene_id", test_title, 'ORA_label', emptyRegLabel="None", minGSSize=10, qvalueCutoff=0.1, outputFolder=output_data_dir)
sircleFileName <- file.path(input_data_dir, paste0('RNA_', cancer, '_ORA.csv'))
test_title <- '_RNA'
sircleORAHuman(sircleFileName, "entrezgene_id", test_title, 'ORA_label', emptyRegLabel="None", minGSSize=10, qvalueCutoff=0.1, outputFolder=output_data_dir)
sircleFileName <- file.path(input_data_dir, paste0('DNAMethylation_', cancer, '_ORA.csv'))
test_title <- '_Methylation'
sircleORAHuman(sircleFileName, "entrezgene_id", test_title, 'ORA_label', emptyRegLabel="None", minGSSize=10, qvalueCutoff=0.1, outputFolder=output_data_dir)
In [4]:
from matplotlib_venn import venn3
ora_dir = os.path.join(output_dir, 'ORA_Early')
# Get the overlapping GO terms that were significant
p_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_DOWN_protein.csv'))
p_go = p_go[p_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
m_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_Hyper_Methylation.csv'))
m_go = m_go[m_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
r_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_DOWN_RNA.csv'))
r_go = r_go[r_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
venn3([set(p_go), set(r_go), set(m_go)], set_labels=['Proteomics', 'RNAseq', 'DNA Methylation'],
set_colors=('#0101d7', '#e68e25', '#6aaf44'),)
plt.rcParams['svg.fonttype'] = 'none'
plt.title('Down')
plt.savefig(os.path.join(fig_dir, f'Figure4B_DownGO_Early.svg'))
plt.show()
# Get the overlapping GO terms that were significant
p_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_UP_protein.csv'))
p_go = p_go[p_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
m_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_Hypo_Methylation.csv'))
m_go = m_go[m_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
r_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_UP_RNA.csv'))
r_go = r_go[r_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
venn3([set(p_go), set(r_go), set(m_go)], set_labels=['Proteomics', 'RNAseq', 'DNA Methylation'],
set_colors=('#0101d7', '#e68e25', '#6aaf44'),)
plt.rcParams['svg.fonttype'] = 'none'
plt.title('UP')
plt.savefig(os.path.join(fig_dir, 'Figure4B_UPGO_Early.svg'))
In [6]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.cm as cm
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (3,10)
sns.set_theme(style="whitegrid")
cmaps = {'Down regulated': {'CpG': 'darkorange', 'Protein': 'royalblue', 'RNA': 'teal'},
'Up regulated': {'CpG': 'limegreen', 'Protein': 'darkred', 'RNA': 'lightcoral'},
}
comparisons = {'Down regulated': ['ClusterGoSummary_DOWN_RNA.csv',
'ClusterGoSummary_DOWN_protein.csv',
'ClusterGoSummary_Hyper_Methylation.csv'
],
'Up regulated': ['ClusterGoSummary_UP_RNA.csv',
'ClusterGoSummary_UP_protein.csv',
'ClusterGoSummary_Hypo_Methylation.csv'
]
}
ora_late_dir = os.path.join(output_dir, 'ORA_Late')
for c in comparisons:
combined_df = pd.DataFrame()
rna_df = pd.read_csv(os.path.join(ora_dir, f'{comparisons[c][0]}'))
rna_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in rna_df['GeneRatio'].values]
rna_df['Data'] = 'RNA'
rna_df = rna_df[rna_df['p.adjust'] < 0.05]
rna_df.sort_values('p.adjust', inplace=True)
protein_df = pd.read_csv(os.path.join(ora_dir, f'{comparisons[c][1]}'))
protein_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in protein_df['GeneRatio'].values]
protein_df['Data'] = 'Protein'
protein_df = protein_df[protein_df['p.adjust'] < 0.05]
protein_df.sort_values('p.adjust', inplace=True)
cpg_df = pd.read_csv(os.path.join(ora_dir, f'{comparisons[c][2]}'))
cpg_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in cpg_df['GeneRatio'].values]
cpg_df['Data'] = 'CpG'
cpg_df = cpg_df[cpg_df['p.adjust'] < 0.05]
cpg_df.sort_values('p.adjust', inplace=True)
combined_df = pd.concat([combined_df, rna_df])
combined_df = pd.concat([combined_df, cpg_df])
combined_df = pd.concat([combined_df, protein_df])
# Concat all together
num = 10
rna_df = pd.concat([rna_df.head(num), cpg_df.head(num), protein_df.head(num)])
rna_df.sort_values('p.adjust', inplace=True)
max_c = max(rna_df['Count'].values)
min_c = min(rna_df['Count'].values)
size = [int(1 + ((int(c) - min_c)/(max_c - min_c))*300) for c in rna_df['Count'].values]
norm = mpl.colors.Normalize(vmin=np.min(rna_df['p.adjust'].values), vmax=np.max(rna_df['p.adjust'].values))
cmap = cm.RdBu
m = cm.ScalarMappable(norm=norm, cmap=cmap)
c_map = cmaps[c]
colours = [c_map.get(v) for v in rna_df['Data'].values]
plt.scatter(rna_df['GeneRatio'].values, rna_df['Description'].values, s=size, c=colours)#c=m.to_rgba(rna_df['p.adjust'].values))
plt.title(c)
combined_df.to_csv(f'{output_dir}{c}_EarlyTvN.csv', index=False)
gmin = plt.scatter([], [], s=10 + int(300*(min_c/(max_c - min_c))), marker='o', color='#222')
#gmid = plt.scatter([], [], s=int(10 + 300*(((max_c - min_c)/2)/max_c)), marker='o', color='#222')
gmax = plt.scatter([], [], s=10 + int(300*(1)), marker='o', color='#222')
legend = plt.legend((gmin, gmax),
(str(min_c), str(max_c)),
scatterpoints=1,
loc='lower left',
ncol=1,
fontsize=8, bbox_to_anchor=(0, -0.1))
legend.set_title("No. Genes")
plt.gca().add_artist(legend)
plt.savefig(f'{output_dir}DotPlot_GO_{c}_Early.svg')
plt.show()
#sns.stripplot(data=rna_df.head(30), y='Description', x='GeneRatio', size=size) #, color='p.adjust', size='Count')
In [7]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.cm as cm
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (3,10)
sns.set_theme(style="whitegrid")
cmaps = {'Down regulated': {'CpG': 'darkorange', 'Protein': 'royalblue', 'RNA': 'teal'},
'Up regulated': {'CpG': 'limegreen', 'Protein': 'darkred', 'RNA': 'lightcoral'},
}
comparisons = {'Down regulated': ['ClusterGoSummary_DOWN_RNA.csv',
'ClusterGoSummary_DOWN_protein.csv',
'ClusterGoSummary_Hyper_Methylation.csv'
],
'Up regulated': ['ClusterGoSummary_UP_RNA.csv',
'ClusterGoSummary_UP_protein.csv',
'ClusterGoSummary_Hypo_Methylation.csv'
]
}
ora_late_dir = os.path.join(output_dir, 'ORA_Late')
for c in comparisons:
combined_df = pd.DataFrame()
rna_df = pd.read_csv(os.path.join(ora_dir, f'{comparisons[c][0]}'))
rna_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in rna_df['GeneRatio'].values]
rna_df['Data'] = 'RNA'
rna_df['marker'] = 'o'
rna_df = rna_df[rna_df['p.adjust'] < 0.05]
rna_df.sort_values('p.adjust', inplace=True)
protein_df = pd.read_csv(os.path.join(ora_dir, f'{comparisons[c][1]}'))
protein_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in protein_df['GeneRatio'].values]
protein_df['Data'] = 'Protein'
protein_df = protein_df[protein_df['p.adjust'] < 0.05]
protein_df['marker'] = 'o'
protein_df.sort_values('p.adjust', inplace=True)
cpg_df = pd.read_csv(os.path.join(ora_dir, f'{comparisons[c][2]}'))
cpg_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in cpg_df['GeneRatio'].values]
cpg_df['Data'] = 'CpG'
cpg_df['marker'] = 'o'
cpg_df = cpg_df[cpg_df['p.adjust'] < 0.05]
cpg_df.sort_values('p.adjust', inplace=True)
# Concat all together
num = 10
rna_df = pd.concat([rna_df.head(num), cpg_df.head(num), protein_df.head(num)])
combined_df = rna_df.copy()
# Do the same with late
rna_df = pd.read_csv(os.path.join(ora_late_dir, f'{comparisons[c][0]}'))
rna_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in rna_df['GeneRatio'].values]
rna_df['Data'] = 'RNA'
rna_df['marker'] = '*'
rna_df = rna_df[rna_df['p.adjust'] < 0.05]
rna_df.sort_values('p.adjust', inplace=True)
protein_df = pd.read_csv(os.path.join(ora_late_dir, f'{comparisons[c][1]}'))
protein_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in protein_df['GeneRatio'].values]
protein_df['Data'] = 'Protein'
protein_df = protein_df[protein_df['p.adjust'] < 0.05]
protein_df['marker'] = '*'
protein_df.sort_values('p.adjust', inplace=True)
cpg_df = pd.read_csv(os.path.join(ora_late_dir, f'{comparisons[c][2]}'))
cpg_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in cpg_df['GeneRatio'].values]
cpg_df['Data'] = 'CpG'
cpg_df = cpg_df[cpg_df['p.adjust'] < 0.05]
cpg_df['marker'] = '*'
cpg_df.sort_values('p.adjust', inplace=True)
rna_df = pd.concat([combined_df, rna_df.head(num), cpg_df.head(num), protein_df.head(num)])
rna_df.sort_values('p.adjust', inplace=True)
max_c = max(rna_df['Count'].values)
min_c = min(rna_df['Count'].values)
norm = mpl.colors.Normalize(vmin=np.min(rna_df['p.adjust'].values), vmax=np.max(rna_df['p.adjust'].values))
cmap = cm.RdBu
m = cm.ScalarMappable(norm=norm, cmap=cmap)
c_map = cmaps[c]
# plot the markers sep
early = rna_df[rna_df['marker'] == 'o']
size = [int(1 + ((int(c) - min_c)/(max_c - min_c))*300) for c in early['Count'].values]
colours = [c_map.get(v) for v in early['Data'].values]
plt.scatter(early['GeneRatio'].values, early['Description'].values, s=size, c=colours, alpha=0.5,
edgecolor='black', marker='s')
late = rna_df[rna_df['marker'] == '*']
size = [int(1 + ((int(c) - min_c)/(max_c - min_c))*300) for c in late['Count'].values]
colours = [c_map.get(v) for v in late['Data'].values]
plt.scatter(late['GeneRatio'].values, late['Description'].values, s=size, c=colours, alpha=0.5,
edgecolor='black',
marker='*')
plt.title(c)
combined_df.to_csv(f'{output_dir}{c}_EarlyvLateTvN.csv', index=False)
gmin = plt.scatter([], [], s=10 + int(300*(min_c/(max_c - min_c))), marker='o', color='#222')
gmax = plt.scatter([], [], s=10 + int(300*(1)), marker='o', color='#222')
legend = plt.legend((gmin, gmax),
(str(min_c), str(max_c)),
scatterpoints=1,
loc='lower left',
ncol=1,
fontsize=8, bbox_to_anchor=(0, -0.1))
legend.set_title("No. Genes")
plt.gca().add_artist(legend)
plt.savefig(f'{fig_dir}DotPlot_GO_{c}_EarlyLateTvN.svg')
plt.show()
#sns.stripplot(data=rna_df.head(30), y='Description', x='GeneRatio', size=size) #, color='p.adjust', size='Count')
In [15]:
from matplotlib_venn import venn2
import os
import pandas as pd
import matplotlib.pyplot as plt
ora_dir = os.path.join(output_dir, 'ORA_Early')
# Get the overlapping GO terms that were significant
p_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_DOWN_protein.csv'))
p_go = p_go[p_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
ora_dir = os.path.join(output_dir, 'ORA_Late')
# Get the overlapping GO terms that were significant
p_go_late = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_DOWN_protein.csv'))
p_go_late = p_go_late[p_go_late['p.adjust'] < 0.05]['Description'].values # Get sig terms
venn2([set(p_go), set(p_go_late)], set_labels=['Early', 'Late'],
set_colors=('grey', 'blue'),)
plt.rcParams['svg.fonttype'] = 'none'
plt.title('Down')
plt.savefig(os.path.join(fig_dir, f'Figure4B_protein_Down_EarlyLate.svg'))
plt.show()
In [16]:
# Print out which ones weren't overlapping
p_go_late.sort()
[g for g in p_go_late if g not in p_go]
Out[16]:
['I band', 'L-phenylalanine catabolic process', 'L-phenylalanine metabolic process', 'Z disc', 'actin filament bundle', 'actinin binding', 'alpha-actinin binding', 'angiotensin maturation', 'basement membrane', 'bile acid biosynthetic process', 'carboxypeptidase activity', 'cell-cell contact zone', 'cell-cell junction', 'chloride ion homeostasis', 'circulatory system process', 'collagen-containing extracellular matrix', 'contractile actin filament bundle', 'contractile fiber', 'epithelial cell differentiation', 'erythrose 4-phosphate/phosphoenolpyruvate family amino acid catabolic process', 'erythrose 4-phosphate/phosphoenolpyruvate family amino acid metabolic process', 'exopeptidase activity', 'external encapsulating structure', 'extracellular matrix', 'glycerol metabolic process', 'glycosaminoglycan catabolic process', 'heparin binding', 'intercalated disc', 'kidney development', 'kidney epithelium development', 'leukotriene metabolic process', 'long-chain fatty acid transporter activity', 'main axon', 'mesonephric epithelium development', 'mesonephric tubule development', 'monovalent inorganic anion homeostasis', 'muscle alpha-actinin binding', 'myofibril', 'nephron epithelium development', 'nucleoside monophosphate kinase activity', 'organic hydroxy compound biosynthetic process', 'peptide hormone processing', 'phospholipase activity', 'phospholipid catabolic process', 'phosphotransferase activity, phosphate group as acceptor', 'positive regulation of lipid transport', 'regulation of angiotensin levels in blood', 'regulation of systemic arterial blood pressure by circulatory renin-angiotensin', 'sarcomere', 'serine-type exopeptidase activity', 'signaling receptor ligand precursor processing', 'steroid biosynthetic process', 'stress fiber', 'ureteric bud development', 'urogenital system development', 'vascular process in circulatory system']
In [17]:
ora_dir = os.path.join(output_dir, 'ORA_Early')
# Get the overlapping GO terms that were significant
p_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_UP_protein.csv'))
p_go = p_go[p_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
ora_dir = os.path.join(output_dir, 'ORA_Late')
# Get the overlapping GO terms that were significant
p_go_late = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_UP_protein.csv'))
p_go_late = p_go_late[p_go_late['p.adjust'] < 0.05]['Description'].values # Get sig terms
venn2([set(p_go), set(p_go_late)], set_labels=['Early', 'Late'],
set_colors=('grey', 'blue'),)
plt.rcParams['svg.fonttype'] = 'none'
plt.title('Up')
plt.savefig(os.path.join(fig_dir, f'Figure4B_protein_up_EarlyLate.svg'))
plt.show()
In [18]:
# Print out which ones weren't overlapping
# Print out which ones weren't overlapping
p_go_late.sort()
[g for g in p_go_late if g not in p_go]
Out[18]:
['Fc receptor signaling pathway', 'actin filament polymerization', 'apoptotic cell clearance', 'apoptotic mitochondrial changes', 'cell redox homeostasis', 'cellular response to amyloid-beta', 'cellular response to leukemia inhibitory factor', 'cellular response to oxidative stress', 'cytokine production involved in inflammatory response', 'defense response to fungus', 'establishment of protein localization to membrane', 'fructose 6-phosphate metabolic process', 'integral component of organelle membrane', 'intramolecular oxidoreductase activity', 'intramolecular oxidoreductase activity, transposing S-S bonds', 'intrinsic apoptotic signaling pathway', 'intrinsic apoptotic signaling pathway by p53 class mediator', 'large ribosomal subunit', 'lipid droplet', 'maturation of LSU-rRNA', 'maturation of LSU-rRNA from tricistronic rRNA transcript (SSU-rRNA, 5.8S rRNA, LSU-rRNA)', 'membrane depolarization', 'monocyte differentiation', 'negative regulation of DNA-binding transcription factor activity', 'negative regulation of I-kappaB kinase/NF-kappaB signaling', 'negative regulation of NF-kappaB transcription factor activity', 'negative regulation of cysteine-type endopeptidase activity involved in apoptotic process', 'negative regulation of intracellular signal transduction', 'nuclear DNA replication', 'positive regulation of cold-induced thermogenesis', 'positive regulation of leukocyte chemotaxis', 'positive regulation of release of cytochrome c from mitochondria', 'preribosome', 'preribosome, large subunit precursor', 'preribosome, small subunit precursor', 'protein disulfide isomerase activity', 'protein localization to nucleolus', 'rRNA metabolic process', 'rRNA processing', 'regulation of B cell differentiation', 'regulation of G protein-coupled receptor signaling pathway', 'regulation of cysteine-type endopeptidase activity involved in apoptotic signaling pathway', 'regulation of cytokine production involved in inflammatory response', 'regulation of endothelial cell chemotaxis', 'regulation of intrinsic apoptotic signaling pathway', 'regulation of mast cell activation involved in immune response', 'regulation of mast cell degranulation', 'regulation of protein kinase activity', 'regulation of regulated secretory pathway', 'regulation of release of cytochrome c from mitochondria', 'regulation of tyrosine phosphorylation of STAT protein', 'release of cytochrome c from mitochondria', 'response to endoplasmic reticulum stress', 'response to hydrogen peroxide', 'response to interferon-beta', 'response to leukemia inhibitory factor', 'response to progesterone', 'response to reactive oxygen species', 'ribosomal small subunit biogenesis', 'ribosome biogenesis', 'ruffle assembly', 'skin development', 'translational initiation', 'viral transcription']
In [19]:
# Print out which ones weren't overlapping
# Print out which ones weren't overlapping
p_go.sort()
[g for g in p_go if g not in p_go_late]
Out[19]:
['SH3 domain binding', 'T cell differentiation in thymus', 'activation of cysteine-type endopeptidase activity involved in apoptotic process', 'amide binding', 'animal organ regeneration', 'basement membrane', 'bicarbonate transport', 'biological process involved in interaction with host', 'blood microparticle', 'branching involved in blood vessel morphogenesis', 'calcium ion transmembrane import into cytosol', 'calcium-dependent protein binding', 'carbohydrate biosynthetic process', 'cartilage development', 'cation homeostasis', 'cell adhesion molecule binding', 'cell junction disassembly', 'cell migration involved in sprouting angiogenesis', 'cell recognition', 'cell-cell adhesion via plasma-membrane adhesion molecules', 'cell-substrate junction', 'cellular cation homeostasis', 'cellular ion homeostasis', 'cellular polysaccharide catabolic process', 'cellular response to lipid', 'cellular response to mechanical stimulus', 'cellular response to vascular endothelial growth factor stimulus', 'chondrocyte differentiation', 'circulatory system process', 'connective tissue development', 'cytokine receptor binding', 'detection of biotic stimulus', 'detoxification', 'endopeptidase regulator activity', 'endothelial cell chemotaxis', 'enzyme inhibitor activity', 'epithelial cell proliferation', 'erythrocyte development', 'foam cell differentiation', 'focal adhesion', 'gland morphogenesis', 'glucan catabolic process', 'glycogen catabolic process', 'heterophilic cell-cell adhesion via plasma membrane cell adhesion molecules', 'humoral immune response', 'hyaluronan metabolic process', 'hyperosmotic response', 'immunoglobulin mediated immune response', 'inorganic ion homeostasis', 'integrin activation', 'leukocyte tethering or rolling', 'lipase inhibitor activity', 'lipid binding', 'lipopolysaccharide binding', 'lipoprotein particle binding', 'low-density lipoprotein particle binding', 'macrophage derived foam cell differentiation', 'maternal process involved in female pregnancy', 'mucopolysaccharide metabolic process', 'multicellular organismal homeostasis', 'muscle cell migration', 'muscle cell proliferation', 'negative regulation of angiogenesis', 'negative regulation of blood vessel morphogenesis', 'negative regulation of cell cycle G1/S phase transition', 'negative regulation of developmental process', 'negative regulation of endopeptidase activity', 'negative regulation of epithelial cell differentiation', 'negative regulation of epithelial cell proliferation', 'negative regulation of exocytosis', 'negative regulation of interleukin-8 production', 'negative regulation of lymphocyte differentiation', 'negative regulation of oxidoreductase activity', 'negative regulation of peptidase activity', 'negative regulation of phagocytosis', 'negative regulation of reproductive process', 'negative regulation of sequestering of calcium ion', 'negative regulation of vasculature development', 'nerve development', 'nucleoside-triphosphatase regulator activity', 'ossification', 'oxidoreductase activity, acting on a sulfur group of donors', 'peptidase regulator activity', 'platelet alpha granule', 'platelet alpha granule lumen', 'platelet alpha granule membrane', 'platelet degranulation', 'podosome assembly', 'polysaccharide catabolic process', 'positive regulation of binding', 'positive regulation of cell-substrate adhesion', 'positive regulation of cysteine-type endopeptidase activity involved in apoptotic process', 'positive regulation of endothelial cell proliferation', 'positive regulation of leukocyte adhesion to vascular endothelial cell', 'protease binding', 'protein complex oligomerization', 'protein processing', 'protein-lipid complex binding', 'regeneration', 'regulation of B cell receptor signaling pathway', 'regulation of NIK/NF-kappaB signaling', 'regulation of binding', 'regulation of calcium ion transport into cytosol', 'regulation of cellular extravasation', 'regulation of endothelial cell migration', 'regulation of epithelial cell differentiation', 'regulation of epithelial cell proliferation', 'regulation of leukocyte adhesion to vascular endothelial cell', 'regulation of lipopolysaccharide-mediated signaling pathway', 'regulation of macrophage differentiation', 'regulation of phospholipase activity', 'regulation of proteolysis', 'regulation of reactive oxygen species metabolic process', 'regulation of receptor binding', 'regulation of sequestering of calcium ion', 'regulation of smooth muscle cell migration', 'regulation of sprouting angiogenesis', 'regulation of superoxide metabolic process', 'release of sequestered calcium ion into cytosol', 'response to alcohol', 'response to dsRNA', 'response to ethanol', 'response to exogenous dsRNA', 'response to mechanical stimulus', 'response to nicotine', 'response to nutrient', 'sarcoplasm', 'scavenger receptor activity', 'second-messenger-mediated signaling', 'semaphorin-plexin signaling pathway', 'sequestering of calcium ion', 'tissue homeostasis', 'viral life cycle']
In [20]:
ora_dir = os.path.join(output_dir, 'ORA_Early')
m_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_Hyper_Methylation.csv'))
m_go = m_go[m_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
ora_dir = os.path.join(output_dir, 'ORA_Late')
# Get the overlapping GO terms that were significant
m_go_late = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_Hyper_Methylation.csv'))
m_go_late = m_go_late[m_go_late['p.adjust'] < 0.05]['Description'].values # Get sig terms
venn2([set(m_go), set(m_go_late)], set_labels=['Early', 'Late'],
set_colors=('grey', 'green'),)
plt.rcParams['svg.fonttype'] = 'none'
plt.title('Hyper')
plt.savefig(os.path.join(fig_dir, f'Figure4B_meth_up_EarlyLate.svg'))
plt.show()
In [21]:
# Print out which ones weren't overlapping
# Print out which ones weren't overlapping
m_go.sort()
[g for g in m_go if g not in m_go_late]
Out[21]:
['cardiac right ventricle morphogenesis', 'fatty acid beta-oxidation', 'fatty acid oxidation', 'integral component of mitochondrial membrane', 'intrinsic component of mitochondrial membrane', 'learning', 'lipid modification', 'lipid oxidation', 'magnesium ion transmembrane transport', 'mismatch repair', 'negative regulation of signaling receptor activity', 'pericardium development', 'phasic smooth muscle contraction', 'positive regulation of behavior', 'positive regulation of epithelial cell apoptotic process', 'positive regulation of muscle contraction', 'regulation of ATPase activity', 'regulation of fatty acid oxidation', 'response to extracellular stimulus', 'response to nutrient levels']
In [22]:
# Print out which ones weren't overlapping
# Print out which ones weren't overlapping
m_go_late.sort()
[g for g in m_go_late if g not in m_go]
Out[22]:
['G0 to G1 transition', 'GABAergic neuron differentiation', 'Notch signaling pathway', 'SMAD binding', 'SMAD protein complex assembly', 'adenylate cyclase-activating adrenergic receptor signaling pathway', 'adrenal gland development', 'adrenergic receptor signaling pathway', 'anatomical structure maturation', 'anterograde trans-synaptic signaling', 'aortic valve development', 'aortic valve morphogenesis', 'axon development', 'axon guidance', 'axonogenesis', 'beta-catenin-TCF complex assembly', 'biological process involved in intraspecies interaction between organisms', 'biomineral tissue development', 'body morphogenesis', 'bone mineralization', 'canonical Wnt signaling pathway', 'cardiac atrium morphogenesis', 'cardiac cell fate commitment', 'cardiac chamber development', 'cardiac epithelial to mesenchymal transition', 'cardiac left ventricle morphogenesis', 'cardiac ventricle development', 'cardiac ventricle morphogenesis', 'cardioblast differentiation', 'catecholamine secretion', 'catecholamine transport', 'cell cycle arrest', 'cell differentiation in hindbrain', 'cell differentiation involved in kidney development', 'cell differentiation involved in metanephros development', 'cell fate commitment involved in formation of primary germ layer', 'cell fate determination', 'cell junction assembly', 'cell morphogenesis involved in differentiation', 'cell morphogenesis involved in neuron differentiation', 'cell part morphogenesis', 'cell projection morphogenesis', 'cellular response to hormone stimulus', 'cellular response to organic cyclic compound', 'cellular response to steroid hormone stimulus', 'cerebellar Purkinje cell layer development', 'cerebellar cortex development', 'cerebral cortex GABAergic interneuron differentiation', 'chemical synaptic transmission', 'chondrocyte differentiation', 'circadian rhythm', 'collecting duct development', 'columnar/cuboidal epithelial cell differentiation', 'cranial nerve development', 'cranial nerve morphogenesis', 'craniofacial suture morphogenesis', 'cyclic-nucleotide-mediated signaling', 'definitive hemopoiesis', 'determination of heart left/right asymmetry', 'determination of left/right symmetry', 'developmental induction', 'developmental maturation', 'digestive tract morphogenesis', 'eating behavior', 'embryonic digestive tract development', 'embryonic heart tube development', 'embryonic heart tube morphogenesis', 'endocardial cushion development', 'endocardial cushion morphogenesis', 'endochondral bone morphogenesis', 'endocrine pancreas development', 'endoderm development', 'epithelial cell differentiation involved in kidney development', 'epithelial cell fate commitment', 'epithelial cell proliferation', 'epithelial to mesenchymal transition involved in endocardial cushion formation', 'face development', 'fear response', 'fibroblast growth factor receptor binding', 'forebrain generation of neurons', 'forebrain neuron differentiation', 'formation of primary germ layer', 'glandular epithelial cell differentiation', 'gliogenesis', 'hair cycle', 'hair cycle process', 'hair follicle development', 'hair follicle morphogenesis', 'heart looping', 'heart valve development', 'heart valve morphogenesis', 'histone H3 acetylation', 'histone H3-K14 acetylation', 'histone methylation', 'in utero embryonic development', 'inhibitory synapse assembly', 'inner mitochondrial membrane organization', 'inositol metabolic process', 'insulin secretion', 'intracellular receptor signaling pathway', 'kidney mesenchyme development', 'lens development in camera-type eye', 'lens morphogenesis in camera-type eye', 'limbic system development', 'lung cell differentiation', 'lung development', 'lung epithelial cell differentiation', 'lymph vessel development', 'male sex differentiation', 'mesenchymal cell proliferation', 'mesenchyme morphogenesis', 'mesoderm formation', 'mesoderm morphogenesis', 'mesodermal cell differentiation', 'mesodermal cell fate commitment', 'metanephric glomerulus development', 'metanephric nephron development', 'metanephric nephron morphogenesis', 'metanephric renal vesicle morphogenesis', 'metanephros morphogenesis', 'middle ear morphogenesis', 'molting cycle', 'molting cycle process', 'monoamine transport', 'muscle cell differentiation', 'myoblast differentiation', 'myotube differentiation', 'negative regulation of BMP signaling pathway', 'negative regulation of G0 to G1 transition', 'negative regulation of Wnt signaling pathway', 'negative regulation of amine transport', 'negative regulation of cell cycle', 'negative regulation of cell development', 'negative regulation of cellular response to growth factor stimulus', 'negative regulation of epithelial to mesenchymal transition', 'negative regulation of histone methylation', 'negative regulation of muscle cell differentiation', 'negative regulation of muscle organ development', 'negative regulation of muscle tissue development', 'negative regulation of myeloid cell differentiation', 'negative regulation of nervous system development', 'negative regulation of neural precursor cell proliferation', 'negative regulation of neurogenesis', 'negative regulation of neuron apoptotic process', 'negative regulation of neuron death', 'negative regulation of organic acid transport', 'negative regulation of osteoblast differentiation', 'negative regulation of transforming growth factor beta receptor signaling pathway', 'negative regulation of transmembrane receptor protein serine/threonine kinase signaling pathway', 'nephron tubule formation', 'nerve development', 'neural nucleus development', 'neural tube formation', 'neural tube patterning', 'neuron apoptotic process', 'neuron fate specification', 'neuron migration', 'neuron projection guidance', 'neuron projection morphogenesis', 'neuropeptide receptor binding', 'odontogenesis', 'odontogenesis of dentin-containing tooth', 'organ induction', 'organic hydroxy compound metabolic process', 'ossification', 'osteoblast differentiation', 'pallium development', 'peptide hormone secretion', 'peripheral nervous system neuron development', 'peripheral nervous system neuron differentiation', 'phenol-containing compound metabolic process', 'plasma membrane bounded cell projection morphogenesis', 'positive regulation of BMP signaling pathway', 'positive regulation of animal organ morphogenesis', 'positive regulation of binding', 'positive regulation of biomineral tissue development', 'positive regulation of bone mineralization', 'positive regulation of branching involved in ureteric bud morphogenesis', 'positive regulation of cardiocyte differentiation', 'positive regulation of cell cycle', 'positive regulation of cell development', 'positive regulation of cellular response to transforming growth factor beta stimulus', 'positive regulation of epithelial cell differentiation', 'positive regulation of histone acetylation', 'positive regulation of histone modification', 'positive regulation of morphogenesis of an epithelium', 'positive regulation of stem cell proliferation', 'positive regulation of transforming growth factor beta receptor signaling pathway', 'primary neural tube formation', 'protein acetylation', 'protein acylation', 'regulation of BMP signaling pathway', 'regulation of DNA-templated transcription, elongation', 'regulation of G0 to G1 transition', 'regulation of Wnt signaling pathway', 'regulation of animal organ formation', 'regulation of animal organ morphogenesis', 'regulation of appetite', 'regulation of branching involved in ureteric bud morphogenesis', 'regulation of canonical Wnt signaling pathway', 'regulation of cell cycle arrest', 'regulation of cellular response to transforming growth factor beta stimulus', 'regulation of chondrocyte differentiation', 'regulation of chromatin organization', 'regulation of embryonic development', 'regulation of endocrine process', 'regulation of epithelial cell differentiation', 'regulation of histone H3-K9 acetylation', 'regulation of histone acetylation', 'regulation of histone methylation', 'regulation of histone modification', 'regulation of hormone levels', 'regulation of intracellular steroid hormone receptor signaling pathway', 'regulation of kidney development', 'regulation of morphogenesis of a branching structure', 'regulation of morphogenesis of an epithelium', 'regulation of muscle cell differentiation', 'regulation of muscle organ development', 'regulation of muscle tissue development', 'regulation of myoblast proliferation', 'regulation of neurogenesis', 'regulation of neuron apoptotic process', 'regulation of neuron death', 'regulation of odontogenesis', 'regulation of organ growth', 'regulation of ossification', 'regulation of osteoblast differentiation', 'regulation of smoothened signaling pathway', 'regulation of stem cell proliferation', 'regulation of striated muscle cell differentiation', 'regulation of striated muscle tissue development', 'regulation of transforming growth factor beta receptor signaling pathway', 'regulation of transporter activity', 'renal vesicle development', 'renal vesicle morphogenesis', 'reproductive structure development', 'reproductive system development', 'respiratory tube development', 'response to alkaloid', 'response to isoquinoline alkaloid', 'response to morphine', 'response to retinoic acid', 'response to steroid hormone', 'retina development in camera-type eye', 'retinoic acid metabolic process', 'retinoic acid receptor signaling pathway', 'rhythmic process', 'semi-lunar valve development', 'skeletal muscle cell differentiation', 'skeletal muscle fiber development', 'skin epidermis development', 'smooth muscle cell differentiation', 'smooth muscle contraction', 'smoothened signaling pathway', 'social behavior', 'specification of animal organ identity', 'spinal cord association neuron differentiation', 'spinal cord patterning', 'stem cell proliferation', 'steroid hormone mediated signaling pathway', 'striated muscle cell differentiation', 'striated muscle cell proliferation', 'sympathetic nervous system development', 'synapse assembly', 'telencephalon development', 'thalamus development', 'trans-synaptic signaling', 'transforming growth factor beta binding', 'ureter development', 'ventral spinal cord interneuron differentiation', 'ventricular septum development']
In [10]:
ora_dir = os.path.join(output_dir, 'ORA_Early')
m_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_Hypo_Methylation.csv'))
m_go = m_go[m_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
ora_dir = os.path.join(output_dir, 'ORA_Late')
# Get the overlapping GO terms that were significant
m_go_late = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_Hypo_Methylation.csv'))
m_go_late = m_go_late[m_go_late['p.adjust'] < 0.05]['Description'].values # Get sig terms
venn2([set(m_go), set(m_go_late)], set_labels=['Early', 'Late'],
set_colors=('grey', 'green'),)
plt.rcParams['svg.fonttype'] = 'none'
plt.title('Hypo')
plt.savefig(os.path.join(fig_dir, f'Figure4B_meth_down_EarlyLate.svg'))
plt.show()
In [12]:
ora_dir = os.path.join(output_dir, 'ORA_Early')
r_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_DOWN_RNA.csv'))
r_go = r_go[r_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
ora_dir = os.path.join(output_dir, 'ORA_Late')
# Get the overlapping GO terms that were significant
r_go_late = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_DOWN_RNA.csv'))
r_go_late = r_go_late[r_go_late['p.adjust'] < 0.05]['Description'].values # Get sig terms
venn2([set(m_go), set(m_go_late)], set_labels=['Early', 'Late'],
set_colors=('grey', 'orange'),)
plt.rcParams['svg.fonttype'] = 'none'
plt.title('Down')
plt.savefig(os.path.join(fig_dir, f'Figure4_RNA_down_EarlyLate.svg'))
plt.show()
In [13]:
ora_dir = os.path.join(output_dir, 'ORA_Early')
r_go = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_UP_RNA.csv'))
r_go = r_go[r_go['p.adjust'] < 0.05]['Description'].values # Get sig terms
ora_dir = os.path.join(output_dir, 'ORA_Late')
# Get the overlapping GO terms that were significant
r_go_late = pd.read_csv(os.path.join(ora_dir, f'ClusterGoSummary_UP_RNA.csv'))
r_go_late = r_go_late[r_go_late['p.adjust'] < 0.05]['Description'].values # Get sig terms
venn2([set(m_go), set(m_go_late)], set_labels=['Early', 'Late'],
set_colors=('grey', 'orange'),)
plt.rcParams['svg.fonttype'] = 'none'
plt.title('Up')
plt.savefig(os.path.join(fig_dir, f'Figure4_RNA_up_EarlyLate.svg'))
plt.show()
Lastly check what the overlap is for the late vs early pathways¶
In [25]:
protein_pathways = pd.read_csv(f'{fig_dir}/Early_vs_Late_Protein_GSEA_MetabolicPathways.csv')
protein_pathways[protein_pathways['padj'] < 0.25]
Out[25]:
pathway | pval | padj | ES | NES | nMoreExtreme | size | leadingEdge | |
---|---|---|---|---|---|---|---|---|
1 | Amino and Nucleotide Sugar Metabolism | 0.047948 | 0.189269 | 0.538691 | 1.503105 | 214 | 17 | NaN |
3 | beta-Alanine metabolism | 0.070297 | 0.237815 | -0.757309 | -1.447738 | 363 | 5 | NaN |
5 | Carbonic Acid Metabolism | 0.002451 | 0.045947 | -0.803497 | -1.797185 | 12 | 9 | NaN |
7 | Cholesterol Metabolism | 0.037754 | 0.165966 | -0.822433 | -1.483419 | 192 | 4 | NaN |
10 | Citric Acid Cycle | 0.059119 | 0.221694 | 0.374659 | 1.371847 | 223 | 52 | NaN |
13 | Cyclic Nucleotides Metabolism | 0.006085 | 0.052806 | -0.704562 | -1.769670 | 32 | 14 | NaN |
16 | Eicosanoid Metabolism | 0.004947 | 0.052806 | -0.644841 | -1.776511 | 27 | 20 | NaN |
17 | Ethanol Metabolism | 0.004115 | 0.051440 | -0.771069 | -1.774602 | 21 | 10 | NaN |
26 | Glutamate metabolism | 0.027906 | 0.149496 | 0.526393 | 1.574757 | 120 | 22 | NaN |
27 | Glutathione Metabolism | 0.038346 | 0.165966 | -0.551212 | -1.531492 | 216 | 21 | NaN |
28 | Glycerophospholipid Metabolism | 0.039832 | 0.165966 | -0.464642 | -1.475074 | 236 | 38 | NaN |
31 | Glycolysis and Gluconeogenesis | 0.072930 | 0.237815 | -0.397303 | -1.362088 | 457 | 56 | NaN |
34 | Histidine Metabolism | 0.027298 | 0.149496 | -0.662179 | -1.599090 | 146 | 12 | NaN |
42 | NAD Metabolism | 0.072685 | 0.237815 | 0.650378 | 1.456923 | 342 | 8 | NaN |
45 | Oxidative Phosphorylation | 0.015498 | 0.096861 | -0.446848 | -1.541363 | 96 | 59 | NaN |
50 | Porphyrin and Heme Metabolism | 0.000184 | 0.013797 | -0.789078 | -2.013049 | 0 | 15 | NaN |
52 | Protein Modification | 0.000644 | 0.018929 | 0.786570 | 1.886154 | 2 | 10 | NaN |
55 | Purine Metabolism | 0.007597 | 0.056979 | -0.627207 | -1.727929 | 42 | 20 | NaN |
59 | Retinol Metabolism | 0.034852 | 0.165966 | -0.627066 | -1.575020 | 188 | 14 | NaN |
61 | ROS Detoxification | 0.014644 | 0.096861 | -0.564875 | -1.634957 | 83 | 25 | NaN |
65 | Sugar Degradation | 0.003090 | 0.046350 | -0.884460 | -1.690812 | 15 | 5 | NaN |
66 | Transport, Extracellular | 0.006337 | 0.052806 | -0.572526 | -1.715340 | 36 | 29 | NaN |
72 | Urea Cycle | 0.000757 | 0.018929 | -0.843891 | -1.828688 | 3 | 8 | NaN |
In [31]:
rna_pathways = pd.read_csv(f'{fig_dir}/Early_vs_Late_RNA_GSEA_MetabolicPathways.csv')
rna_pathways[rna_pathways['padj'] < 0.25]
Out[31]:
pathway | pval | padj | ES | NES | nMoreExtreme | size | leadingEdge | |
---|---|---|---|---|---|---|---|---|
39 | Glyoxylate and Dicarboxylate Metabolism | 0.007983 | 0.218220 | -0.868297 | -1.656952 | 37 | 5 | NaN |
51 | N-Glycan Biosynthesis | 0.009699 | 0.218220 | 0.559554 | 1.672951 | 55 | 38 | NaN |
63 | Protein Modification | 0.002233 | 0.100484 | 0.825467 | 1.867962 | 11 | 11 | NaN |
75 | Steroid Metabolism | 0.000515 | 0.046392 | 0.600752 | 1.900671 | 2 | 51 | NaN |
In [32]:
set(list(rna_pathways[rna_pathways['padj'] < 0.25].pathway.values)) & set(list(protein_pathways[protein_pathways['padj'] < 0.25].pathway.values))
Out[32]:
{'Protein Modification'}
In [33]:
cpg_pathways = pd.read_csv(f'{fig_dir}/Early_vs_Late_DMC_GSEA_MetabolicPathways.csv')
cpg_pathways[cpg_pathways['padj'] < 0.25]
Out[33]:
pathway | pval | padj | ES | NES | nMoreExtreme | size | leadingEdge |
---|
In [ ]: