Notebook RCM Part 1 PanCan S.Figure 1
Figures for S.Fig 1¶
- For S. Figure 1 we want the PCA plots
- We also want a figure showing the RCM plots for the different clustering levels
In [2]:
# Need to add in entrez gene ID and also make labels for genes for ORA
# Imports
#from scircm import * # Note if you have a mac M1 use from sircle import * and you won't be able to do 7,8
import seaborn as sns
from sciutil import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import os
# Setup file locations and label of the cancer
u = SciUtil()
cancer = 'PanCan'
input_dir = 'Input_RCM'
output_dir = 'Output_Data'
supp_dir = 'Required_Refs'
fig_dir = 'Output_Figures'
regLabel = 'RG2_Changes_filtered'
# Use same files as in RCM
DNA_methylation_file = os.path.join('Input_Methylation', f'{cancer}_Filtered_DCpG.csv')
RNA_file = os.path.join('Input_RNAseq', f'{cancer}_filtered_DE_RNA.csv')
protein_file = os.path.join('Input_Protein', f'{cancer}_filtered_DA_Protein.csv')
# Names of columns in each of the above files
logFC_RNA_column = "logFC_rna"
padj_RNA_column = "padj_rna"
betadiff_DNA_methylation_column = "beta_diff"
padj_DNA_methylation_column = 'adj.P.Val'
logFC_protein_column = "logFC_protein"
padj_protein_column = "padj_protein"
# NOTE: all of the above files MUST have this column i.e. they must all have the samely named gene ID column
gene_id_column = "gene_name"
# Use same cutoffs as in RCM
rna_padj_cutoff = 0.05
prot_padj_cutoff = 0.05
meth_padj_cutoff = 0.05
rna_logfc_cutoff = 1.0
prot_logfc_cutoff = 0.5
meth_diff_cutoff = 0.1
m_df = pd.read_csv(DNA_methylation_file)
r_df = pd.read_csv(RNA_file)
p_df = pd.read_csv(protein_file)
/var/folders/sj/4wqsfdtd6093v9746b0t3mq40000gn/T/ipykernel_15789/3384284013.py:47: DtypeWarning: Columns (597) have mixed types. Specify dtype option on import or set low_memory=False. m_df = pd.read_csv(DNA_methylation_file)
In [3]:
# Load in the files that we used for DE analysis and remove any redundant columns
# the VAE expects just the ID as the index, and then the values #ClearCellRenalCellCarcinoma_filtered_samples_RNA
rna_sample_file = pd.read_csv(os.path.join('Input_RNAseq', f'{cancer}_samples_RNA.csv'))
prot_sample_file = pd.read_csv(os.path.join('Input_Protein', f'{cancer}_samples_Protein.csv'))
# Now we want to merge the clinical info with the cases from the sample df
meth_sample_file = pd.read_csv(os.path.join('Input_Methylation', f'{cancer}_samples_CpG.csv'))
In [7]:
for disease in set(rna_sample_file['Disease'].values):
s_df = rna_sample_file[rna_sample_file['Disease'] == disease]
print('RNA', disease, len(set(s_df['SafeCases'].values)))
for disease in set(prot_sample_file['Disease'].values):
s_df = prot_sample_file[prot_sample_file['Disease'] == disease]
print('Protein', disease, len(set(s_df['SafeCases'].values)))
for disease in set(meth_sample_file['Disease'].values):
s_df = meth_sample_file[meth_sample_file['Disease'] == disease]
print('DNA Methylation', disease, len(set(s_df['SafeCases'].values)))
RNA LungSquamousCellCarcinoma 100 RNA PancreaticDuctalAdenocarcinoma 90 RNA HeadandNeckSquamousCellCarcinoma 97 RNA LungAdenocarcinoma 103 Protein LungSquamousCellCarcinoma 107 Protein PancreaticDuctalAdenocarcinoma 132 Protein HeadandNeckSquamousCellCarcinoma 107 Protein LungAdenocarcinoma 103 DNA Methylation LungSquamousCellCarcinoma 99 DNA Methylation PancreaticDuctalAdenocarcinoma 89 DNA Methylation HeadandNeckSquamousCellCarcinoma 86 DNA Methylation LungAdenocarcinoma 101
In [8]:
for disease in set(rna_sample_file['Disease'].values):
cases = list(rna_sample_file[rna_sample_file['Disease'] == disease]['SafeCases'].values)
cases += list(prot_sample_file[prot_sample_file['Disease'] == disease]['SafeCases'].values)
cases += list(meth_sample_file[meth_sample_file['Disease'] == disease]['SafeCases'].values)
print('With at least one', disease, len(set(cases)))
With at least one LungSquamousCellCarcinoma 107 With at least one PancreaticDuctalAdenocarcinoma 135 With at least one HeadandNeckSquamousCellCarcinoma 108 With at least one LungAdenocarcinoma 107
PCA plots¶
In [20]:
from sklearn.decomposition import PCA
from sciviso import *
save_fig = True
plot_fig = True
disease_c_map = {'LungSquamousCellCarcinoma': 'pink', 'LungAdenocarcinoma': 'grey',
'PancreaticDuctalAdenocarcinoma': 'red',
'HeadandNeckSquamousCellCarcinoma': 'blue'}
sample_df = meth_sample_file
df = m_df
disease = cancer
# -------------------------------------------
# Visualise to check using PCA
# -------------------------------------------
cols = list(sample_df['Sample'].values)
vals = df[cols].values.T
pca = PCA(n_components=2)
pca_values = pca.fit_transform(vals)
var_ratio = pca.fit(vals).explained_variance_ratio_
plt.rcParams['figure.figsize'] = [4, 4]
vis_df = pd.DataFrame()
vis_df['PC_1'] = pca_values[:, 0]
vis_df['PC_2'] = pca_values[:, 1]
vis_df['Stage'] = sample_df['TumorStage'].values
vis_df['Disease'] = sample_df['Disease'].values
vis_df['CondID'] = sample_df['CondID'].values
vis_df['Colour'] = ['black' if c == 1 else 'grey' for c in vis_df['CondID'].values]
stage_c_map = {'Stage I': '#00665D', 'Stage II': '#43C9B8',
'Stage III': '#A05300',
'Stage IV': '#BF802D'}
vis_df['StageColour'] = [stage_c_map.get(c) if stage_c_map.get(c) else '#808080' for c in vis_df['Stage'].values]
vis_df['disColour'] = [disease_c_map.get(c) if disease_c_map.get(c) else '#808080' for c in vis_df['Disease'].values]
figsize = (1, 1)
ss = 10
sc = Scatterplot(vis_df, x='PC_1', y='PC_2', title=f'Sample Type',
xlabel=f'PC 1 {round(var_ratio[0], 2)}', ylabel=f'PC 2 {round(var_ratio[1], 2)}',
add_legend=True, colour=vis_df['Colour'].values,
config={'s': ss, 'opacity': 0.8, 'figsize': figsize})
sc.plot()
if save_fig:
plt.savefig(os.path.join(fig_dir, f'{disease}_CpG_scatterPCASampleType.svg'))
if plot_fig:
plt.show()
sc = Scatterplot(vis_df, x='PC_1', y='PC_2', title=f'Stage',
xlabel=f'PC 1 {round(var_ratio[0], 2)}', ylabel=f'PC 2 {round(var_ratio[1], 2)}',
add_legend=True, colour=vis_df['StageColour'].values,
config={'s': ss, 'opacity': 0.8, 'figsize': figsize})
sc.plot()
if save_fig:
plt.savefig(os.path.join(fig_dir, f'{disease}_CpG_scatterPCAStage.svg'))
if plot_fig:
plt.show()
sc = Scatterplot(vis_df, x='PC_1', y='PC_2', title=f'Disease',
xlabel=f'PC 1 {round(var_ratio[0], 2)}', ylabel=f'PC 2 {round(var_ratio[1], 2)}',
add_legend=True, colour=vis_df['disColour'].values,
config={'s': ss, 'opacity': 0.8, 'figsize': figsize})
sc.plot()
if save_fig:
plt.savefig(os.path.join(fig_dir, f'{disease}_CpG_scatterPCADisease.svg'))
if plot_fig:
plt.show()
/Users/ariane/opt/miniconda3/envs/roundround/lib/python3.10/site-packages/sciviso/scatterplot.py:114: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored scatter = ax.scatter(vis_df[x].values, vis_df[y].values, c=self.colour, alpha=self.opacity,
In [21]:
from sklearn.decomposition import PCA
from sciviso import *
save_fig = True
plot_fig = True
sample_df = rna_sample_file
df = r_df
disease = cancer
# -------------------------------------------
# Visualise to check using PCA
# -------------------------------------------
cols = [c for c in list(sample_df['Sample'].values) if c in df.columns]
sample_df = sample_df[sample_df['Sample'].isin(cols)]
vals = df[cols].values.T
pca = PCA(n_components=2)
pca_values = pca.fit_transform(vals)
var_ratio = pca.fit(vals).explained_variance_ratio_
plt.rcParams['figure.figsize'] = [4, 4]
vis_df = pd.DataFrame()
vis_df['PC_1'] = pca_values[:, 0]
vis_df['PC_2'] = pca_values[:, 1]
vis_df['Stage'] = sample_df['TumorStage'].values
vis_df['Disease'] = sample_df['Disease'].values
vis_df['CondID'] = sample_df['CondID'].values
vis_df['Colour'] = ['black' if c == 1 else 'grey' for c in vis_df['CondID'].values]
vis_df['disColour'] = [disease_c_map.get(c) if disease_c_map.get(c) else '#808080' for c in vis_df['Disease'].values]
stage_c_map = {'Stage I': '#00665D', 'Stage II': '#43C9B8',
'Stage III': '#A05300',
'Stage IV': '#BF802D'}
vis_df['StageColour'] = [stage_c_map.get(c) if stage_c_map.get(c) else '#808080' for c in vis_df['Stage'].values]
sc = Scatterplot(vis_df, x='PC_1', y='PC_2', title=f'Sample Type',
xlabel=f'PC 1 {round(var_ratio[0], 2)}', ylabel=f'PC 2 {round(var_ratio[1], 2)}',
add_legend=True, colour=vis_df['Colour'].values,
config={'s': ss, 'opacity': 0.8, 'figsize': figsize})
sc.plot()
if save_fig:
plt.savefig(os.path.join(fig_dir, f'{disease}_RNA_scatterPCASampleType.svg'))
if plot_fig:
plt.show()
sc = Scatterplot(vis_df, x='PC_1', y='PC_2', title=f'Stage',
xlabel=f'PC 1 {round(var_ratio[0], 2)}', ylabel=f'PC 2 {round(var_ratio[1], 2)}',
add_legend=True, colour=vis_df['StageColour'].values,
config={'s': ss, 'opacity': 0.8, 'figsize': figsize})
sc.plot()
if save_fig:
plt.savefig(os.path.join(fig_dir, f'{disease}_RNA_scatterPCAStage.svg'))
if plot_fig:
plt.show()
if plot_fig:
plt.show()
sc = Scatterplot(vis_df, x='PC_1', y='PC_2', title=f'Disease',
xlabel=f'PC 1 {round(var_ratio[0], 2)}', ylabel=f'PC 2 {round(var_ratio[1], 2)}',
add_legend=True, colour=vis_df['disColour'].values,
config={'s': ss, 'opacity': 0.8, 'figsize': figsize})
sc.plot()
/Users/ariane/opt/miniconda3/envs/roundround/lib/python3.10/site-packages/sciviso/scatterplot.py:114: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored scatter = ax.scatter(vis_df[x].values, vis_df[y].values, c=self.colour, alpha=self.opacity,
Out[21]:
<Axes: title={'center': 'Disease'}, xlabel='PC 1 0.25', ylabel='PC 2 0.09'>
In [22]:
from sklearn.decomposition import PCA
from sciviso import *
save_fig = True
plot_fig = True
sample_df = prot_sample_file
df = p_df
disease = cancer
# -------------------------------------------
# Visualise to check using PCA
# -------------------------------------------
cols = list(set(sample_df['Sample'].values) & set(df.columns))
sample_df = sample_df[sample_df.Sample.isin(cols)]
cols = list(sample_df['Sample'].values)
vals = df[cols].values.T
pca = PCA(n_components=2)
pca_values = pca.fit_transform(vals)
var_ratio = pca.fit(vals).explained_variance_ratio_
plt.rcParams['figure.figsize'] = [4, 4]
vis_df = pd.DataFrame()
vis_df['PC_1'] = pca_values[:, 0]
vis_df['PC_2'] = pca_values[:, 1]
vis_df['Stage'] = sample_df['TumorStage'].values
vis_df['Disease'] = sample_df['Disease'].values
vis_df['CondID'] = sample_df['CondID'].values
vis_df['Colour'] = ['black' if c == 1 else 'grey' for c in vis_df['CondID'].values]
stage_c_map = {'Stage I': '#00665D', 'Stage II': '#43C9B8',
'Stage III': '#A05300',
'Stage IV': '#BF802D'}
vis_df['disColour'] = [disease_c_map.get(c) if disease_c_map.get(c) else '#808080' for c in vis_df['Disease'].values]
vis_df['StageColour'] = [stage_c_map.get(c) if stage_c_map.get(c) else '#808080' for c in vis_df['Stage'].values]
sc = Scatterplot(vis_df, x='PC_1', y='PC_2', title=f'Sample Type',
xlabel=f'PC 1 {round(var_ratio[0], 2)}', ylabel=f'PC 2 {round(var_ratio[1], 2)}',
add_legend=True, colour=vis_df['Colour'].values,
config={'s': ss, 'opacity': 0.8, 'figsize': figsize})
sc.plot()
if save_fig:
plt.savefig(os.path.join(fig_dir, f'{disease}_protein_scatterPCASampleType.svg'))
if plot_fig:
plt.show()
sc = Scatterplot(vis_df, x='PC_1', y='PC_2', title=f'Stage',
xlabel=f'PC 1 {round(var_ratio[0], 2)}', ylabel=f'PC 2 {round(var_ratio[1], 2)}',
add_legend=True, colour=vis_df['StageColour'].values,
config={'s': ss, 'opacity': 0.8, 'figsize': figsize})
sc.plot()
if save_fig:
plt.savefig(os.path.join(fig_dir, f'{disease}_protein_scatterPCAStage.svg'))
if plot_fig:
plt.show()
if plot_fig:
plt.show()
sc = Scatterplot(vis_df, x='PC_1', y='PC_2', title=f'Disease',
xlabel=f'PC 1 {round(var_ratio[0], 2)}', ylabel=f'PC 2 {round(var_ratio[1], 2)}',
add_legend=True, colour=vis_df['disColour'].values,
config={'s': ss, 'opacity': 0.8, 'figsize': figsize})
sc.plot()
/Users/ariane/opt/miniconda3/envs/roundround/lib/python3.10/site-packages/sciviso/scatterplot.py:114: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored scatter = ax.scatter(vis_df[x].values, vis_df[y].values, c=self.colour, alpha=self.opacity,
Out[22]:
<Axes: title={'center': 'Disease'}, xlabel='PC 1 0.21', ylabel='PC 2 0.1'>
In [24]:
[c for c in prot_sample_file['Sample'].values if c not in set(df.columns)]
## WHAT????
Out[24]:
[]
Add in Plots of RCM for simplicity¶
In [27]:
df = pd.read_csv(os.path.join(output_dir, 'sircle_PorMandR_PanCan.csv'))
# ----------------------------------------------------------------------------------------------
# Make figures
# ----------------------------------------------------------------------------------------------
# Make some plots
label = 'disease'
reg_label = 'RG2_Changes'
# figure size in inches
rcParams['figure.figsize'] = 3, 2
sns.set(rc={'figure.figsize': (3, 2)}, style='ticks')
colour_map = {'MDS': '#d8419b', 'MDS_TMDE': '#e585c0', 'MDS_ncRNA': '#d880b4',
'MDE': '#6aaf44', 'MDE_TMDS': '#0e8e6d', 'MDE_ncRNA': '#9edb77',
'TMDE': '#fe2323', 'TMDS': '#2952ff', 'TPDE': '#e68e25', 'TPDE_TMDS': '#844c0f',
'TPDS': '#462d76', 'TPDS_TMDE': '#9b29b7'}
sns.set_palette("Greys_r")
rcm_labels = ["TMDE", "TMDS", "TPDE_TMDS", "TPDE", "TPDS_TMDE", "TPDS", "MDS_TMDE", "MDE", "MDE_TMDS", "MDS"]
colours = [colour_map[c] for c in rcm_labels]
sns.catplot(data=df, x=reg_label + '_filtered', kind="count", order=rcm_labels, palette=sns.color_palette(colours),
height=4)
plt.xticks(rotation=45, ha='right')
plt.title(f'{label} {reg_label}')
plt.savefig(os.path.join(fig_dir, f'Figure4A_barplot_{label}_TvN.svg'))
plt.show()
/var/folders/gq/6ljhmvm1713fykdjqbl188pm0000gn/T/ipykernel_22344/1583966900.py:1: DtypeWarning: Columns (1,2,1161,1163,1164,1167,1176,1177,1178,1179,1180,1181,1182,1195,1197,1198,1202,1203,1208,1974,1975,1976) have mixed types. Specify dtype option on import or set low_memory=False. df = pd.read_csv(os.path.join(output_dir, 'sircle_PorMandR_PanCan.csv')) /Users/ariane/opt/miniconda3/envs/roundround/lib/python3.10/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
In [28]:
reg_label = 'RG3_Translation'
sns.catplot(data=df, x=reg_label + '_filtered', kind="count",
order=['MDE', 'MDS', 'TPDE', 'TPDS', 'TMDE', 'TMDS'], height=4)
plt.xticks(rotation=45, ha='right')
plt.title(f'{label} {reg_label}')
plt.savefig(os.path.join(fig_dir, f'Figure4A_barplot_{label}_{reg_label}_TvN.svg'))
plt.show()
/Users/ariane/opt/miniconda3/envs/roundround/lib/python3.10/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
In [29]:
# Now we want to build a heatmap for RG1_All_filtered
# We'll have RNA + DNA methylation and then protein as the y axis
from collections import defaultdict
labels = defaultdict(lambda: defaultdict(int))
values = defaultdict(lambda: defaultdict(int))
rg2 = df['RG2_Changes_filtered'].values
for i, c in enumerate(df['RG1_All_filtered'].values):
rna_meth = " + ".join(c.split('+')[:-1])
prot = c.split('+')[-1]
values[rna_meth.strip()][prot.strip()] += 1
labels[rna_meth.strip()][prot.strip()] = rg2[i]
In [30]:
# Now we make a heatmap based on this
cols = ['Protein UP', 'Protein significant-positive', 'Protein not-significant', 'Protein significant-negative', 'Protein DOWN', 'Protein Undetected']
rows = []
for k, v in values.items():
if k:
row = [k]
for pk in cols:
row.append(v.get(pk))
rows.append(row)
cols = ['Label'] + cols
heatdf = pd.DataFrame(rows, columns=cols)
In [31]:
# Now we make a heatmap based on this
cols = ['Protein Undetected', 'Protein UP', 'Protein significant-positive', 'Protein not-significant', 'Protein significant-negative', 'Protein DOWN']
row_order = ['Protein Undetected', 'Protein UP', 'Protein significant-positive', 'Protein not-significant', 'Protein significant-negative', 'Protein DOWN']
row_lbl = ['Hypomethylation + RNA UP',
'Hypomethylation + RNA No change',
'Hypomethylation + RNA DOWN',
'Methylation No change + RNA DOWN',
'Methylation No change + RNA No change',
'Methylation No change + RNA UP',
'Hypermethylation + RNA UP',
'Hypermethylation + RNA No change',
'Hypermethylation + RNA DOWN']
rows = []
for k in row_lbl:
v = values.get(k)
for pk in cols:
rows.append([k, pk, v.get(pk), colour_map.get(labels.get(k).get(pk)), labels.get(k).get(pk)])
cols = ['meth_rna', 'protein', 'value', 'colour', 'reglabel']
scatdf = pd.DataFrame(rows, columns=cols)
In [32]:
ax = plt.scatter(scatdf['meth_rna'], scatdf['protein'], c=[c if c else '#D3D3D3' for c in scatdf['colour']],
s=200, alpha=0.8)
plt.xticks(rotation=45, ha='right')
for i, txt in enumerate(scatdf['meth_rna']):
plt.annotate(str(scatdf['value'][i]), (scatdf['meth_rna'][i], scatdf['protein'][i]),
ha='center', va='center', fontsize=8)
plt.margins(0.1, 0.1)
plt.savefig(os.path.join(fig_dir, f'Sfig1_scatplot_AllRG1_TvN.svg'))
In [ ]: