Notebook Part 4 ITH Analysis

protein_analysis

A supplemental figure for the analysis of the protein data¶

This is a supplemental analysis for the reviewer comment that covers how the differentially expressed proteins from a held out cohort overlap with the data from our original SiRCle analysis.

In [1]:

import pandas as pd
import seaborn as sns
import os
cancer = 'ClearCellRenalCellCarcinoma-ITH'

protein_file = os.path.join('', f'{cancer}_filtered_DA_Protein.csv')
DNA_methylation_file = os.path.join('', f'{cancer}_filtered_DCpG.csv')
RNA_file = os.path.join('', f'{cancer}_filtered_DE_RNA.csv')
df = pd.read_csv(protein_file)

In [57]:

df

Out[57]:

	gene_name	C3L.00103.N	C3L.00103.T	C3L.00369.N	C3L.00369.T	C3L.00418.N	C3L.00418.T	C3L.00447.N	C3L.00447.T	C3L.00581.N	...	C3N.01522.N	C3N.01522.T	C3N.01524.N	C3N.01524.T	logFC_protein	stat_protein	pvalue_protein	padj_protein	B_protein	mean_protein
0	A1BG	23.804253	26.759882	25.250929	25.706787	25.574954	25.655343	24.810448	26.370921	25.835122	...	24.860013	24.611204	25.761720	26.682259	0.375288	2.303716	0.023836	0.042772	-5.059550	25.396635
1	A1CF	20.787317	22.992804	20.812159	17.168629	21.221135	21.058130	21.316131	20.529617	21.118637	...	21.397382	20.904029	19.244446	19.077349	0.200236	0.760737	0.449059	0.533554	-7.363686	20.849946
2	A2M	24.614646	27.558356	25.904099	26.987516	26.591306	27.710213	24.398907	26.680032	25.710723	...	25.538208	26.169343	26.450465	27.142634	0.776217	4.223913	0.000063	0.000197	0.477720	26.441642
3	AAAS	20.236853	20.397305	20.231370	19.934762	20.394137	20.681434	20.178137	20.756631	20.494033	...	20.086720	20.531336	20.610156	22.994137	0.227338	1.503925	0.136541	0.195651	-6.528349	20.322339
4	AACS	20.306412	19.653499	20.262668	19.051496	20.395627	20.425885	20.738622	21.572830	20.761294	...	20.623840	21.063069	20.605948	20.117698	0.017210	0.082915	0.934126	0.949128	-7.651501	20.336660
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
6638	ZRANB2	20.718964	20.649922	20.756337	20.931351	20.172372	20.204854	20.001630	20.337843	20.437339	...	19.906271	20.883130	20.853612	19.448125	0.149938	1.628372	0.107380	0.159724	-6.337325	20.378892
6639	ZSCAN18	18.751459	22.945405	18.272627	17.000598	19.062571	18.615463	18.961661	18.800101	17.955643	...	17.870333	17.688821	18.802441	19.333710	0.343370	0.953356	0.343292	0.426499	-7.198444	18.877981
6640	ZW10	20.385328	19.858816	20.306806	19.632650	20.220704	19.591835	20.485089	19.727791	20.108091	...	20.394800	20.636149	20.238352	19.593638	-0.130679	-1.642351	0.104445	0.155916	-6.314979	20.158184
6641	ZYX	24.676250	24.442168	24.290796	23.778188	23.275541	23.835075	23.240994	23.937512	24.003469	...	23.337872	23.956111	24.883876	23.600379	0.322354	2.670256	0.009178	0.018410	-4.205174	23.867947
6642	ZZEF1	19.611202	19.312686	19.703293	19.612429	19.367981	19.774148	19.250150	19.670867	19.255124	...	19.337341	19.262659	19.327407	18.525027	-0.105834	-1.357701	0.178378	0.246099	-6.734418	19.230044

6643 rows × 87 columns

Generic visualisations¶

In [84]:

import matplotlib.pyplot as plt
import numpy as np
df = df.dropna()
plt.scatter(df['logFC_protein'].values, -1* np.log10(df['padj_protein'].values))
plt.title('New kidney cancer DEPs for CL coloured by SiRCle cluster')
plt.xlabel('Log2FoldChange')
plt.ylabel('-1*log10(FDR)')

# Perhaps colour by the SiRCle cluster

Out[84]:

Text(0, 0.5, '-1*log10(FDR)')

No description has been provided for this image

In [96]:

s_df = pd.read_csv('input_data/Table2-3_SiRCle_ORA_DE_DA_DMC.csv')
s_df.set_index('gene_name', inplace=True)

/var/folders/sj/4wqsfdtd6093v9746b0t3mq40000gn/T/ipykernel_90533/4293941828.py:1: DtypeWarning: Columns (2,3,59) have mixed types. Specify dtype option on import or set low_memory=False.
  s_df = pd.read_csv('input_data/Table2-3_SiRCle_ORA_DE_DA_DMC.csv')

In [86]:

df.set_index('gene_name', inplace=True)
df = df.join(s_df, how='inner', lsuffix='_ITH')
df

Out[86]:

	C3L.00103_Normal_Protein	C3L.00103_Tumor_Protein	C3L.00369_Normal_Protein	C3L.00369_Tumor_Protein	C3L.00418_Normal_Protein	C3L.00418_Tumor_Protein	C3L.00447_Normal_Protein	C3L.00447_Tumor_Protein	C3L.00581_Normal_Protein	C3L.00581_Tumor_Protein	...	RG1_All	RG2_Changes	RG3_Translation	RG4_Detection	Background_filter	RG1_All_filtered	RG2_Changes_filtered	RG3_Translation_filtered	RG4_Detection_filtered	entrezgene_id
gene_name
A1BG	23.804253	26.759882	25.250929	25.706787	25.574954	25.655343	24.810448	26.370921	25.835122	25.456924	...	Hypermethylation + RNA No change + Protein sig...	NaN	NaN	NaN	threshold + NS + threshold	Hypermethylation + RNA No change + Protein sig...	NaN	NaN	NaN	1.0
A1CF	20.787317	22.995330	20.812159	17.168629	21.221135	21.058130	21.316131	20.529617	21.118637	20.000714	...	Hypomethylation + RNA No change + Protein DOWN	TMDS	TMDS	TMDS	threshold + threshold + threshold	Hypomethylation + RNA No change + Protein DOWN	TMDS	TMDS	TMDS	29974.0
A2M	24.614646	27.558356	25.904099	26.987516	26.591306	27.710213	24.398907	26.680032	25.710723	25.618344	...	Hypomethylation + RNA No change + Protein UP	TMDE	TMDE	TMDE	threshold + threshold + threshold	Hypomethylation + RNA No change + Protein UP	TMDE	TMDE	TMDE	2.0
AAAS	20.236853	20.397305	20.231370	19.934762	20.394137	20.681434	20.178137	20.756631	20.494033	20.071701	...	Methylation No change + RNA No change + Protei...	NaN	NaN	NaN	threshold + threshold + threshold	Methylation No change + RNA No change + Protei...	NaN	NaN	NaN	8086.0
AACS	20.306412	19.653499	20.262668	19.051496	20.395627	20.425885	20.738622	21.572830	20.761294	20.791659	...	Methylation No change + RNA No change + Protei...	NaN	NaN	NaN	NS + threshold + threshold	Methylation No change + RNA No change + Protei...	NaN	NaN	NaN	65985.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
ZRANB2	20.718964	20.649922	20.756337	20.931351	20.172372	20.204854	20.001630	20.337843	20.437339	20.258375	...	Hypomethylation + RNA No change + Protein not-...	NaN	NaN	NaN	threshold + threshold + threshold	Hypomethylation + RNA No change + Protein not-...	NaN	NaN	NaN	9406.0
ZSCAN18	18.751459	22.948015	18.272627	17.000598	19.062571	18.615463	18.961661	18.800101	17.955643	16.301208	...	Hypermethylation + RNA No change + Protein sig...	NaN	NaN	NaN	threshold + threshold + threshold	Hypermethylation + RNA No change + Protein sig...	NaN	NaN	NaN	65982.0
ZW10	20.385328	19.858816	20.306806	19.632650	20.220704	19.591835	20.485089	19.727791	20.108091	20.353182	...	Methylation No change + RNA No change + Protei...	NaN	NaN	NaN	threshold + threshold + threshold	Methylation No change + RNA No change + Protei...	NaN	NaN	NaN	9183.0
ZYX	24.676250	24.442168	24.290796	23.778188	23.275541	23.835075	23.240994	23.937512	24.003469	24.562098	...	Hypomethylation + RNA No change + Protein sign...	NaN	NaN	NaN	threshold + threshold + threshold	Hypomethylation + RNA No change + Protein sign...	NaN	NaN	NaN	7791.0
ZZEF1	19.611202	19.312686	19.703293	19.612429	19.367981	19.774148	19.250150	19.670867	19.255124	19.540517	...	Hypermethylation + RNA No change + Protein not...	NaN	NaN	NaN	threshold + threshold + threshold	Hypermethylation + RNA No change + Protein not...	NaN	NaN	NaN	23140.0

6610 rows × 174 columns

In [87]:

cols = ['logFC_rna', 'logFC_protein', 'beta_diff']
df = df[df['RG2_Changes_filtered'] != 'Not-Background']
from scipy import stats
df = df.dropna(subset='logFC_protein')
for c in cols:
    sub_df = df.dropna(subset=c)
    sns.scatterplot(data=sub_df, x=c, y='logFC_protein_ITH', hue='RG2_Changes_filtered')
    plt.title(f'New cohort protein vs {c}')
    res = stats.pearsonr(list(sub_df[c].values), list(sub_df['logFC_protein_ITH'].values))
    print('-------------------------------------')
    print(c)
    print(res)
    print('-------------------------------------')

    plt.xlabel(c)
    plt.ylabel('logFC_protein')
    plt.show()

-------------------------------------
logFC_rna
PearsonRResult(statistic=0.3059215135342366, pvalue=9.759214433961229e-136)
-------------------------------------

-------------------------------------
logFC_protein
PearsonRResult(statistic=0.5385927636414971, pvalue=0.0)
-------------------------------------

-------------------------------------
beta_diff
PearsonRResult(statistic=-0.09372901219197625, pvalue=1.8799128025780966e-09)
-------------------------------------

In [92]:

df

Out[92]:

	Unnamed: 0	gene_name	gene_id	logFC_rna	stat_rna	pvalue_rna	padj_rna	lfcSE_rna	baseMean_rna	var_rna	...	C3N.00577_Normal_RNA	C3N.00646_Normal_RNA	C3N.00733_Normal_RNA	C3N.00852_Normal_RNA	C3N.01200_Normal_RNA	C3N.01214_Normal_RNA	C3N.01220_Normal_RNA	C3N.01261_Normal_RNA	C3N.01522_Normal_RNA	C3N.01524_Normal_RNA
0	ENSG00000227232.5	WASH7P	ENSG00000227232.5	0.100815	0.981146	3.265207e-01	3.713650e-01	0.102752	55.953140	0.063379	...	1.435478	1.348459	1.116795	1.320242	1.343969	1.035960	1.169060	1.394861	1.293977	1.355976
1	ENSG00000278267.1	MIR6859-3	ENSG00000278267.1	0.459552	2.885099	3.912911e-03	5.983595e-03	0.159285	14.359665	0.034510	...	0.146062	0.394089	0.222742	0.229883	0.234560	0.163569	0.486820	0.352453	0.271943	0.553764
2	ENSG00000241860.5	RP11-34P13.13	ENSG00000241860.5	-0.063560	-0.373845	7.085199e-01	7.427218e-01	0.170018	14.717394	0.035239	...	0.616098	0.225445	0.222742	0.458666	0.489068	0.757292	0.209243	0.547170	0.332612	0.553764
3	ENSG00000279457.2	FO538757.2	ENSG00000279457.2	0.288522	3.443427	5.743912e-04	9.671850e-04	0.083789	201.589489	0.107351	...	2.273700	2.224256	2.951146	2.426555	2.009537	1.934058	2.212404	2.266853	2.435082	2.834879
4	ENSG00000228463.7	AP006222.2	ENSG00000228463.7	-0.055012	-0.222341	8.240486e-01	8.464832e-01	0.247422	31.800915	0.174052	...	0.512156	0.884158	0.688985	0.829712	0.727483	0.378663	0.738915	0.281235	0.931697	0.475008
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
24143	ENSG00000198695.2	MT-ND6	ENSG00000198695.2	-1.908100	-8.777229	1.675520e-18	8.350974e-18	0.217392	948.196104	1.728170	...	4.366362	5.131135	5.204232	5.792905	4.171866	5.017668	6.009295	6.043066	5.642533	3.203895
24144	ENSG00000210194.1	MT-TE	ENSG00000210194.1	-0.171499	-0.401326	6.881799e-01	7.245135e-01	0.427330	27.631935	0.349613	...	0.118009	0.884158	0.663867	0.898170	0.070997	0.536257	0.679900	0.871993	0.970395	0.000000
24145	ENSG00000198727.2	MT-CYB	ENSG00000198727.2	-3.343283	-20.637475	1.265017e-94	1.840219e-92	0.162001	46937.429237	3.213463	...	10.718429	10.846462	10.660657	11.722216	9.696734	10.687551	11.213792	11.426008	11.655362	8.983677
24146	ENSG00000210195.2	MT-TT	ENSG00000210195.2	-1.827532	-7.928303	2.221611e-15	9.100504e-15	0.230507	62.624165	0.525226	...	1.160869	2.091509	0.638304	2.868886	0.727483	1.674007	0.885418	1.492683	1.479804	0.767118
24147	ENSG00000210196.2	MT-TP	ENSG00000210196.2	0.014782	0.056953	9.545823e-01	9.610696e-01	0.259543	29.435895	0.185329	...	0.655665	0.659833	0.713672	0.852894	0.353246	0.678318	0.936735	1.136954	0.651169	0.303290

24148 rows × 85 columns

In [98]:

df = pd.read_csv(RNA_file)
import matplotlib.pyplot as plt
import numpy as np

plt.scatter(df['logFC_rna'].values, -1* np.log10(df['padj_rna'].values))
plt.title('New kidney cancer RNA for CL coloured by SiRCle cluster')
plt.xlabel('RNA')
plt.ylabel('-1*log10(FDR)')

# Perhaps colour by the SiRCle cluster

/var/folders/sj/4wqsfdtd6093v9746b0t3mq40000gn/T/ipykernel_90533/971542705.py:5: RuntimeWarning: divide by zero encountered in log10
  plt.scatter(df['logFC_rna'].values, -1* np.log10(df['padj_rna'].values))

Out[98]:

Text(0, 0.5, '-1*log10(FDR)')

In [99]:

df.set_index('gene_name', inplace=True)
df = df.join(s_df, how='inner', lsuffix='_ITH')

cols = ['logFC_rna', 'logFC_protein', 'beta_diff']
df = df[df['RG2_Changes_filtered'] != 'Not-Background']
from scipy import stats
df = df.dropna(subset='logFC_protein')
for c in cols:
    sub_df = df.dropna(subset=c)
    sns.scatterplot(data=sub_df, x=c, y='logFC_rna_ITH', hue='RG2_Changes_filtered')
    plt.title(f'New cohort logFC_rna vs {c}')
    res = stats.pearsonr(list(sub_df[c].values), list(sub_df['logFC_rna_ITH'].values))
    print('-------------------------------------')
    print(c)
    print(res)
    print('-------------------------------------')

    plt.xlabel(c)
    plt.ylabel('logFC_rna')
    plt.show()

-------------------------------------
logFC_rna
PearsonRResult(statistic=0.9756351955789109, pvalue=0.0)
-------------------------------------

-------------------------------------
logFC_protein
PearsonRResult(statistic=0.6682427744430844, pvalue=0.0)
-------------------------------------

-------------------------------------
beta_diff
PearsonRResult(statistic=-0.2636471724241564, pvalue=2.891010625265638e-87)
-------------------------------------

In [ ]:

In [68]:

DNA_methylation_file = os.path.join('', f'{cancer}_filtered_DCpG.csv')
df = pd.read_csv(DNA_methylation_file)
import matplotlib.pyplot as plt
import numpy as np

plt.scatter(df['beta_diff'].values, -1* np.log10(df['adj.P.Val'].values))
plt.title('New kidney cancer DMETH for CL coloured by SiRCle cluster')
plt.xlabel('Beta Diff')
plt.ylabel('-1*log10(FDR)')

# Perhaps colour by the SiRCle cluster

Out[68]:

Text(0, 0.5, '-1*log10(FDR)')

In [69]:

df.set_index('gene_name', inplace=True)
df = df.join(s_df, how='inner', lsuffix='_ITH')

cols = ['logFC_rna', 'logFC_protein', 'beta_diff']
df = df[df['RG2_Changes_filtered'] != 'Not-Background']
from scipy import stats
df = df.dropna(subset='logFC_protein')
for c in cols:
    sub_df = df.dropna(subset=c)
    sns.scatterplot(data=sub_df, x=c, y='beta_diff_ITH', hue='RG2_Changes_filtered')
    plt.title(f'New cohort methylation vs {c}')
    res = stats.pearsonr(list(sub_df[c].values), list(sub_df['beta_diff_ITH'].values))
    print('-------------------------------------')
    print(c)
    print(res)
    print('-------------------------------------')

    plt.xlabel(c)
    plt.ylabel('Beta diff')
    plt.show()

-------------------------------------
logFC_rna
PearsonRResult(statistic=-0.20775327088679674, pvalue=1.810060174214105e-61)
-------------------------------------

-------------------------------------
logFC_protein
PearsonRResult(statistic=-0.13901533974886499, pvalue=5.399477145097151e-29)
-------------------------------------

-------------------------------------
beta_diff
PearsonRResult(statistic=0.75982793719863, pvalue=0.0)
-------------------------------------

In [30]:

# Get the CNV and correlate it with the expression change

In [70]:

cnv = pd.read_csv('input_data/CPTAC_ccRCC_combined_CNV_gene_level_log2ratio_v1.1.tsv', sep='\t')
# Get the mean cnv for a gene then correlate it with the gene expression log2FC
cnv

Out[70]:

	gene	chr	start	end	C3L-00004	C3L-00010	C3L-00011	C3L-00026	C3L-00079	C3L-00088	...	C3N-02723	C3N-02726	C3N-02761	C3N-02763	C3N-02811	C3N-02945	C3N-03018	C3N-03019	C3N-03020	C3N-03021
0	OR4F5	chr1	65418	71585	0.009405	0.010501	0.118637	-0.578567	NaN	0.141188	...	-0.014622	-0.010682	0.006287	NaN	0.329618	NaN	0.742674	0.019052	0.013805	0.194185
1	OR4F29	chr1	450702	451697	0.009405	0.010501	0.118637	-0.578567	0.048056	0.141188	...	-0.014622	-0.010682	0.006287	-0.054349	0.329618	-0.024026	0.742674	0.019052	0.013805	0.194185
2	OR4F16	chr1	685678	686673	0.009405	0.010501	0.118637	2.464256	0.048056	2.596901	...	-0.014622	-0.010682	0.006287	-0.054349	1.677102	-0.024026	0.742674	0.019052	0.013805	-0.015749
3	SAMD11	chr1	923927	944581	0.009405	0.010501	0.118637	-0.067869	0.048056	0.002553	...	-0.014622	-0.010682	0.006287	-0.054349	0.037897	-0.024026	-0.081266	0.019052	0.013805	-0.015749
4	NOC2L	chr1	944203	959309	0.009405	0.010501	0.118637	-0.067869	0.048056	0.002553	...	-0.014622	-0.010682	0.006287	-0.054349	0.037897	-0.024026	-0.081266	0.019052	0.013805	-0.015749
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
19902	BPY2B	chrY	24607559	24639207	-0.706813	-0.877417	0.320002	0.019201	-0.968900	-0.117876	...	-0.718553	-0.570521	-0.015854	-1.046517	-0.079820	0.130616	-1.342746	0.063460	-0.053560	-0.062743
19903	DAZ3	chrY	24763068	24813492	-0.706813	-0.877417	0.320002	0.019201	-0.968900	-0.117876	...	-0.718553	-0.570521	-0.015854	-1.046517	-0.079820	0.130616	-1.342746	0.063460	-0.053560	-0.062743
19904	DAZ4	chrY	24833842	24907040	-0.706813	-0.877417	0.320002	0.019201	-0.968900	-0.117876	...	-0.718553	-0.570521	-0.015854	-1.046517	-0.079820	0.130616	-1.342746	0.063460	-0.053560	-0.062743
19905	BPY2C	chrY	25030900	25062548	-0.706813	-0.877417	0.320002	0.019201	-0.968900	-0.117876	...	-0.718553	-0.570521	-0.015854	-1.046517	-0.079820	0.130616	-1.342746	0.063460	-0.053560	-0.062743
19906	CDY1	chrY	25622161	25624902	-0.706813	-0.877417	0.320002	0.019201	-0.968900	-0.117876	...	-0.718553	-0.570521	-0.015854	-1.046517	-0.079820	0.130616	-1.342746	0.063460	-0.053560	-0.062743

19907 rows × 226 columns

In [71]:

mean_cnv = np.mean(cnv[[c for c in cnv if c not in ['gene', 'chr', 'start', 'end']]], axis=1)
cnv['MeanCNV'] = mean_cnv

In [73]:

cnv.set_index('gene', inplace=True)
df = cnv.join(s_df, how='inner', lsuffix='_ITH')

cols = ['logFC_rna', 'logFC_protein', 'beta_diff']
df = df[df['RG2_Changes_filtered'] != 'Not-Background']
from scipy import stats
df = df.dropna(subset='logFC_protein')
for c in cols:
    sub_df = df.dropna(subset=c)
    sns.scatterplot(data=sub_df, x=c, y='MeanCNV', hue='RG2_Changes_filtered')
    plt.title(f'New cohort MeanCNV vs {c}')
    res = stats.pearsonr(list(sub_df[c].values), list(sub_df['MeanCNV'].values))
    print('-------------------------------------')
    print(c)
    print(res)
    print('-------------------------------------')

    plt.xlabel(c)
    plt.ylabel('MeanCNV')
    plt.show()

-------------------------------------
logFC_rna
PearsonRResult(statistic=0.10968270161975488, pvalue=3.919205659105048e-24)
-------------------------------------

-------------------------------------
logFC_protein
PearsonRResult(statistic=0.08067832550527829, pvalue=4.092812047083789e-14)
-------------------------------------

-------------------------------------
beta_diff
PearsonRResult(statistic=-0.037718084729706496, pvalue=0.004803714980133639)
-------------------------------------

In [74]:

df = pd.read_csv('Tab1iProFun_SupplementalTable3.csv')
cnv_lr = df['CNV (lr) gain or loss'].values
cnv_baf = df['CNV (baf) < 0.4'].values
df

Out[74]:

	Gene	peptide	chromosome	start	CNV (lr) RNA	CNV (baf) RNA	Methylation RNA	CNV (lr) Protein	CNV (baf) Protein	Methylation Protein	...	R^2 Methylation-mRNA	R^2 Methylation-Phospho	R^2 Mutation-Global	R^2 Mutation-mRNA	R^2 Mutation-Phospho	q_value_tumor_normal	q_value_grade	hypermethylation	CNV (lr) gain or loss	CNV (baf) < 0.4
0	SEPT1	EEEIHIYQFPECDSDEDEDFKR	chr16	30378133	0	0	0	0	0	0	...	0.0370	0.03500	NaN	NaN	NaN	3.800000e-19	0.72	0.00	0.150	0.110
1	SEPT2	IYHLPDAESDEDEDFK	chr2	241315100	1	0	0	0	0	0	...	0.0190	0.00110	NaN	NaN	NaN	3.900000e-04	0.36	0.00	0.140	0.170
2	SEPT4	LTRESGTDFPIPAVPPGTDPETEK	chr17	58520250	1	0	0	0	0	0	...	0.0023	0.00016	NaN	NaN	NaN	4.600000e-21	0.54	0.00	0.019	0.029
3	SEPT5	MESPIPILPLPTPDAETEK	chr22	19714464	1	0	0	0	0	0	...	0.0017	0.00380	NaN	NaN	NaN	9.700000e-23	0.72	0.99	0.058	0.058
4	SEPT6	TAAELLQSQGSQAGGSQTLKR	chrX	119615724	0	0	0	0	0	0	...	0.0410	0.00510	NaN	NaN	NaN	3.100000e-04	0.63	0.44	0.120	0.380
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
4016	ZRANB2	SRSPESQVIGENTK	chr1	71063291	1	0	0	1	0	0	...	0.0050	0.01300	NaN	NaN	NaN	1.700000e-05	0.91	0.00	0.029	0.038
4017	ZRSR2	NPNNEFWEANRDIYLSPDR	chrX	15790472	1	0	0	0	0	0	...	0.0190	0.00220	NaN	NaN	NaN	9.300000e-04	0.70	0.00	0.110	0.240
4018	ZSCAN18	AFASPRSSPAPPDLPTPGSAAGVQQEEPETIPERTPADLEFSR	chr19	58083838	0	0	1	0	0	1	...	0.3100	0.26000	NaN	NaN	NaN	9.700000e-18	0.15	0.86	0.019	0.029
4019	ZYX	FSPGAPGGSGSQPNQK	chr7	143381080	1	0	0	1	0	0	...	0.0140	0.03200	NaN	NaN	NaN	6.800000e-02	0.53	0.00	0.260	0.150
4020	ZZEF1	EINALAEHGDLELDERGDREEEVERPVSSPGDPEQK	chr17	4004445	1	0	0	1	0	0	...	0.0580	0.00190	NaN	NaN	NaN	2.400000e-19	0.84	0.00	0.038	0.048

4021 rows × 33 columns

In [75]:

df.set_index('Gene', inplace=True)
df = df.join(s_df, how='inner', lsuffix='_ITH')

cols = ['logFC_rna', 'logFC_protein', 'beta_diff']
df = df[df['RG2_Changes_filtered'] != 'Not-Background']
from scipy import stats
df = df.dropna(subset='logFC_protein')
for c in cols:
    sub_df = df.dropna(subset=c)
    sns.scatterplot(data=sub_df, x=c, y='CNV (lr) gain or loss', hue='RG2_Changes_filtered')
    plt.title(f'Old cohort CNV (lr) gain or loss vs {c}')
    res = stats.pearsonr(list(sub_df[c].values), list(sub_df['CNV (lr) gain or loss'].values))
    print('-------------------------------------')
    print(c)
    print(res)
    print('-------------------------------------')

    plt.xlabel(c)
    plt.ylabel('CNV (lr) gain or loss old study')
    plt.show()

-------------------------------------
logFC_rna
PearsonRResult(statistic=-0.043478331010822636, pvalue=0.007396341558143551)
-------------------------------------

-------------------------------------
logFC_protein
PearsonRResult(statistic=-0.05597460591016603, pvalue=0.0005062306728915353)
-------------------------------------

-------------------------------------
beta_diff
PearsonRResult(statistic=-0.01923201898483545, pvalue=0.34782408850384267)
-------------------------------------

In [ ]:

import pandas as pd
import seaborn as sns

old_df = pd.read_csv('SuppTable4_Rics_gene_wise.csv', index_col=0)
old_df

RNA seq data from old cohort vs cnv¶

In [104]:

import pandas as pd
import seaborn as sns

old_df = pd.read_csv('SuppTable4_Rics_gene_wise.csv', index_col=0)
df = pd.read_csv('Tab1iProFun_SupplementalTable3.csv')
cnv_lr = df['CNV (lr) gain or loss'].values
cnv_baf = df['CNV (baf) < 0.4'].values
df.set_index('Gene', inplace=True)
sub_df = df.join(old_df, how='inner', lsuffix='_ITH')

cols = ['RNA log2 Fold change', 'protein log2 Fold change']
from scipy import stats

sns.scatterplot(data=sub_df, x='RNA log2 Fold change', y='CNV (lr) gain or loss')
plt.title(f'Old cohort CNV (lr) gain or loss vs {c}')
res = stats.pearsonr(list(sub_df[c].values), list(sub_df['CNV (lr) gain or loss'].values))
print('-------------------------------------')
print(c)
print(res)
print('-------------------------------------')

plt.xlabel(c)
plt.ylabel('CNV (lr) gain or loss old study')
plt.show()

-------------------------------------
RNA log2 Fold change
PearsonRResult(statistic=-0.05577305989390756, pvalue=0.0012342054045919259)
-------------------------------------

In [103]:

sub_df

Out[103]:

	peptide	chromosome	start	CNV (lr) RNA	CNV (baf) RNA	Methylation RNA	CNV (lr) Protein	CNV (baf) Protein	Methylation Protein	CNV (lr) Phospho	...	DNA replication	HIF signaling	Ribosome	Translation factors	MYC targets	MTORC1 signaling	Spliceosome	Histidine metabolism	Galactose metabolism	Arachidonic acid metabolism
Gene
SEPT1	EEEIHIYQFPECDSDEDEDFKR	chr16	30378133	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
SEPT2	IYHLPDAESDEDEDFK	chr2	241315100	1	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
SEPT4	LTRESGTDFPIPAVPPGTDPETEK	chr17	58520250	1	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
SEPT5	MESPIPILPLPTPDAETEK	chr22	19714464	1	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
SEPT6	TAAELLQSQGSQAGGSQTLKR	chrX	119615724	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
ZNRF2	ITYNEDVLSK	chr7	30284307	1	0	0	1	0	0	1	...	0	0	0	0	0	0	0	0	0	0
ZRANB2	SRSPESQVIGENTK	chr1	71063291	1	0	0	1	0	0	1	...	0	0	0	0	0	0	0	0	0	0
ZSCAN18	AFASPRSSPAPPDLPTPGSAAGVQQEEPETIPERTPADLEFSR	chr19	58083838	0	0	1	0	0	1	0	...	0	0	0	0	0	0	0	0	0	0
ZYX	FSPGAPGGSGSQPNQK	chr7	143381080	1	0	0	1	0	0	1	...	0	0	0	0	0	0	0	0	0	0
ZZEF1	EINALAEHGDLELDERGDREEEVERPVSSPGDPEQK	chr17	4004445	1	0	0	1	0	0	1	...	0	0	0	0	0	0	0	0	0	0

3353 rows × 61 columns

In [77]:

df = old_df.join(s_df, how='inner', lsuffix='_ITH')

cols = ['logFC_rna', 'logFC_protein', 'beta_diff']
df = df[df['RG2_Changes_filtered'] != 'Not-Background']
from scipy import stats
df = df.dropna(subset='logFC_protein')
for c in cols:
    sub_df = df.dropna(subset=c)
    sns.scatterplot(data=sub_df, x=c, y='protein log2 Fold change', hue='RG2_Changes_filtered')
    plt.title(f'Old cohort protein log2 Fold change vs {c}')
    res = stats.pearsonr(list(sub_df[c].values), list(sub_df['protein log2 Fold change'].values))
    print('-------------------------------------')
    print(c)
    print(res)
    print('-------------------------------------')

    plt.xlabel(c)
    plt.ylabel('protein log2 Fold change old study')
    plt.show()

-------------------------------------
logFC_rna
PearsonRResult(statistic=0.7194129194979026, pvalue=0.0)
-------------------------------------

-------------------------------------
logFC_protein
PearsonRResult(statistic=0.9805140480153869, pvalue=0.0)
-------------------------------------

-------------------------------------
beta_diff
PearsonRResult(statistic=-0.21364363781448548, pvalue=2.9483565590707836e-47)
-------------------------------------

In [78]:

df = pd.read_csv('Tab1iProFun_SupplementalTable3.csv')
cnv_lr = df['CNV (lr) gain or loss'].values
cnv_baf = df['CNV (baf) < 0.4'].values
df

Out[78]:

	Gene	peptide	chromosome	start	CNV (lr) RNA	CNV (baf) RNA	Methylation RNA	CNV (lr) Protein	CNV (baf) Protein	Methylation Protein	...	R^2 Methylation-mRNA	R^2 Methylation-Phospho	R^2 Mutation-Global	R^2 Mutation-mRNA	R^2 Mutation-Phospho	q_value_tumor_normal	q_value_grade	hypermethylation	CNV (lr) gain or loss	CNV (baf) < 0.4
0	SEPT1	EEEIHIYQFPECDSDEDEDFKR	chr16	30378133	0	0	0	0	0	0	...	0.0370	0.03500	NaN	NaN	NaN	3.800000e-19	0.72	0.00	0.150	0.110
1	SEPT2	IYHLPDAESDEDEDFK	chr2	241315100	1	0	0	0	0	0	...	0.0190	0.00110	NaN	NaN	NaN	3.900000e-04	0.36	0.00	0.140	0.170
2	SEPT4	LTRESGTDFPIPAVPPGTDPETEK	chr17	58520250	1	0	0	0	0	0	...	0.0023	0.00016	NaN	NaN	NaN	4.600000e-21	0.54	0.00	0.019	0.029
3	SEPT5	MESPIPILPLPTPDAETEK	chr22	19714464	1	0	0	0	0	0	...	0.0017	0.00380	NaN	NaN	NaN	9.700000e-23	0.72	0.99	0.058	0.058
4	SEPT6	TAAELLQSQGSQAGGSQTLKR	chrX	119615724	0	0	0	0	0	0	...	0.0410	0.00510	NaN	NaN	NaN	3.100000e-04	0.63	0.44	0.120	0.380
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
4016	ZRANB2	SRSPESQVIGENTK	chr1	71063291	1	0	0	1	0	0	...	0.0050	0.01300	NaN	NaN	NaN	1.700000e-05	0.91	0.00	0.029	0.038
4017	ZRSR2	NPNNEFWEANRDIYLSPDR	chrX	15790472	1	0	0	0	0	0	...	0.0190	0.00220	NaN	NaN	NaN	9.300000e-04	0.70	0.00	0.110	0.240
4018	ZSCAN18	AFASPRSSPAPPDLPTPGSAAGVQQEEPETIPERTPADLEFSR	chr19	58083838	0	0	1	0	0	1	...	0.3100	0.26000	NaN	NaN	NaN	9.700000e-18	0.15	0.86	0.019	0.029
4019	ZYX	FSPGAPGGSGSQPNQK	chr7	143381080	1	0	0	1	0	0	...	0.0140	0.03200	NaN	NaN	NaN	6.800000e-02	0.53	0.00	0.260	0.150
4020	ZZEF1	EINALAEHGDLELDERGDREEEVERPVSSPGDPEQK	chr17	4004445	1	0	0	1	0	0	...	0.0580	0.00190	NaN	NaN	NaN	2.400000e-19	0.84	0.00	0.038	0.048

4021 rows × 33 columns

Check the overlap between the two cohorts¶

In [ ]:

from matplotlib_venn import venn2

new_sircle = pd.read_csv('sircle_PorMandR_ClearCellRenalCellCarcinoma-ITH.csv')
for r in set(new_sircle['RG2_Changes_filtered'].values):
    old_r = list(s_df[s_df['RG2_Changes_filtered'] == r].index)
    new_r = list(new_sircle[new_sircle['RG2_Changes_filtered'] == r]['gene_name'].values)
    venn2([set(old_r), set(new_r)], ('Old genes', 'New cohort genes'))
    plt.title(r)
    plt.show()

In [81]:

from matplotlib_venn import venn2

new_sircle = pd.read_csv('sircle_PorMandR_ClearCellRenalCellCarcinoma-New.csv')
for r in set(new_sircle['RG2_Changes_filtered'].values):
    old_r = list(s_df[s_df['RG2_Changes_filtered'] == r].index)
    new_r = list(new_sircle[new_sircle['RG2_Changes_filtered'] == r]['gene_name'].values)
    venn2([set(old_r), set(new_r)], ('Old genes', 'New cohort genes'))
    plt.title(r)
    plt.show()

/var/folders/sj/4wqsfdtd6093v9746b0t3mq40000gn/T/ipykernel_90533/1120268717.py:3: DtypeWarning: Columns (1,2,150,180,181,185,186) have mixed types. Specify dtype option on import or set low_memory=False.
  new_sircle = pd.read_csv('sircle_PorMandR_ClearCellRenalCellCarcinoma-ITH.csv')

/Users/arianemora/miniconda3/envs/sircle/lib/python3.10/site-packages/matplotlib_venn/layout/venn2/exact.py:83: UserWarning: Both circles have zero area
  warnings.warn("Both circles have zero area")