Notebook Part 3 Comparison Figure 1

Figure1

RCM of ccRCC vs PanCan¶

Basically have a look at what genes are shared and different between ccRCC and PanCan for each cluster.

In [16]:

# Imports
import pandas as pd
import os
import matplotlib.pyplot as plt

cancer = 'ClearCellRenalCellCarcinoma'
input_dir = 'Input_Data'
output_dir = 'Output_Data'
supp_dir = 'Required_Refs'
fig_dir = 'Output_Figures'
reg_label = 'RG2_Changes_filtered'

ccrcc = pd.read_csv(os.path.join(input_dir, 'sircle_PorMandR_ClearCellRenalCellCarcinoma.csv'))
pancan = pd.read_csv(os.path.join(input_dir, 'sircle_PorMandR_PanCan.csv'))

rcm_labels = ["TMDE", "TMDS", "TPDE_TMDS", "TPDE", "TPDS_TMDE", "TPDS", "MDS_TMDE", "MDE",  "MDE_TMDS", "MDS"]

values = []
cluster = []
lbls = []
genes = []


    
for r in rcm_labels:
    ccrcc_r = ccrcc[ccrcc[reg_label].values == r]
    pancan_r = pancan[pancan[reg_label].values == r]
    shared_genes = list(set(ccrcc_r['gene_name'].values) & set(pancan_r['gene_name'].values))
    ccrcc_genes = [c for c in ccrcc_r['gene_name'].values if c not in pancan_r['gene_name'].values]
    pancan_genes = [c for c in pancan_r['gene_name'].values if c not in ccrcc_r['gene_name'].values]
    print(r, len(shared_genes), len(ccrcc_genes), len(pancan_genes))

    # Add values for the stacked bar chart
    values.append(len(ccrcc_genes))
    values.append(len(pancan_genes))
    values.append(len(shared_genes))

    genes.append(' '.join(ccrcc_genes))
    genes.append(' '.join(pancan_genes))
    genes.append(' '.join(shared_genes))

    # Add in the labels
    lbls.append('ccRCC unique')
    lbls.append('PanCan unique')
    lbls.append('Shared')
    
    # Add in the cluster
    cluster.append(r)
    cluster.append(r)
    cluster.append(r)

/var/folders/gq/6ljhmvm1713fykdjqbl188pm0000gn/T/ipykernel_8104/2636475310.py:13: DtypeWarning: Columns (1,2,353,381,383,384,388,389,583,584,585) have mixed types. Specify dtype option on import or set low_memory=False.
  ccrcc = pd.read_csv(os.path.join(input_dir, 'sircle_PorMandR_ClearCellRenalCellCarcinoma.csv'))
/var/folders/gq/6ljhmvm1713fykdjqbl188pm0000gn/T/ipykernel_8104/2636475310.py:14: DtypeWarning: Columns (1,2,1161,1163,1164,1167,1176,1177,1178,1179,1180,1181,1182,1195,1197,1198,1202,1203,1208,1974,1975,1976) have mixed types. Specify dtype option on import or set low_memory=False.
  pancan = pd.read_csv(os.path.join(input_dir, 'sircle_PorMandR_PanCan.csv'))

TMDE 26 392 150
TMDS 68 984 328
TPDE_TMDS 53 561 251
TPDE 8 196 29
TPDS_TMDE 96 781 648
TPDS 30 424 166
MDS_TMDE 23 337 88
MDE 17 412 80
MDE_TMDS 139 977 563
MDS 8 155 20

In [17]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.DataFrame({
    'Cluster': cluster,
    'Type': lbls,
    'Value': values, 
    'Genes': genes})
plt.rcParams["figure.figsize"] = (2.5,3)
plt.rcParams['svg.fonttype'] = 'none'

ax = sns.histplot(df, x='Cluster', hue='Type', hue_order=['Shared', 'PanCan unique', 'ccRCC unique'], 
                  weights='Value', multiple='stack', palette=['teal', 'lightgrey', 'darkorange'])
plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, horizontalalignment='right')
plt.title('Overlap between PanCan and ccRCC genes')
ax.spines['bottom'].set_linewidth(1.0)
ax.spines['top'].set_linewidth(0)
ax.spines['left'].set_linewidth(1.0)
ax.spines['right'].set_linewidth(0)
plt.savefig(os.path.join(fig_dir, f'StackedBar_overlap.svg'))
plt.show()

No description has been provided for this image

In [18]:

df.to_csv('PanCan_CCRCC_SiRCLe.csv')

Plot ORA to show that this is also different¶

Clearly will be given they get different genes.

Here we plot as an overlapping dot plot the different ORA terms that pop up.

In [19]:

import pandas as pd
from sciutil import SciUtil
from matplotlib_venn import venn3
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import matplotlib.cm as cm
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (1,3)
sns.set_theme(style="whitegrid")
plt.rcParams.update({'font.size': 8})

cmaps = {'Down regulated': {'CpG': 'darkorange', 'Protein': 'royalblue', 'RNA': 'teal'},
        'Up regulated': {'CpG': 'limegreen', 'Protein': 'darkred', 'RNA': 'lightcoral'},
        }
cmaps = {'MDS': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
         'MDS_TMDE': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
         'MDE': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
         'MDE_TMDS': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'}, 
         'TMDE': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'}, 
         'TMDS': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
         'TPDE': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'}, 
         'TPDE_TMDS': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
          'TPDS': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
         'TPDS_TMDE': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'}}

u = SciUtil()
plt.rc('legend', fontsize=8)    # legend fontsize
test_title = 'PanCan'
rcm_labels = ["TMDE", "TMDS", "TPDE_TMDS", "TPDE", "TPDS_TMDE", "TPDS", "MDS_TMDE", "MDE",  "MDE_TMDS", "MDS"]

comparisons = {}
# Create the comparisons for these 
for r in rcm_labels:
    comparisons[r] = [f'{input_dir}/ORA_PanCan/ClusterGoSummary_{r}_RCM.csv',
                      f'{input_dir}/ORA_ccRCC/ClusterGoSummary_{r}_RCM.csv',
                     ]

for c in comparisons:
    combined_df = pd.DataFrame()
    rna_df = pd.read_csv(f'{comparisons[c][0]}')
    rna_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in rna_df['GeneRatio'].values]
    rna_df['Data'] = 'RNA'
    rna_df = rna_df[rna_df['p.adjust'] < 0.05]

    rna_df.sort_values('p.adjust', inplace=True)

    protein_df = pd.read_csv(f'{comparisons[c][1]}')
    protein_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in protein_df['GeneRatio'].values]
    protein_df['Data'] = 'Protein'
    protein_df = protein_df[protein_df['p.adjust'] < 0.05]
    protein_df.sort_values('p.adjust', inplace=True)

    combined_df = pd.concat([combined_df, rna_df])
    combined_df = pd.concat([combined_df, protein_df])
    # Concat all together
    num = 10
    rna_df = pd.concat([rna_df.head(num), protein_df.head(num)])
    rna_df.sort_values('p.adjust', inplace=True)

    max_c = max(rna_df['Count'].values)
    min_c = min(rna_df['Count'].values)
    size = [int(1 + ((int(c) - min_c)/(max_c - min_c))*100) for c in rna_df['Count'].values]

    norm = mpl.colors.Normalize(vmin=np.min(rna_df['p.adjust'].values), vmax=np.max(rna_df['p.adjust'].values))
    cmap = cm.RdBu
    m = cm.ScalarMappable(norm=norm, cmap=cmap)
    c_map = cmaps[c]
    colours = [c_map.get(v) for v in rna_df['Data'].values]
    plt.scatter(rna_df['GeneRatio'].values, rna_df['Description'].values, s=size, c=colours)
    plt.title(c)
    combined_df.to_csv(os.path.join(output_dir, f'{c}_StageIV-StageI-Tumour.csv'), index=False)

    gmin = plt.scatter([], [], s=10 + int(100*(min_c/(max_c - min_c))), marker='o', color='#222')
    gmax = plt.scatter([], [], s=10 + int(100*(1)), marker='o', color='#222')
    legend = plt.legend((gmin, gmax),
                        (str(min_c), str(max_c)),
                        scatterpoints=1,
                        loc='lower left',
                        ncol=1,
                        fontsize=8, bbox_to_anchor=(0, -0.1))
    
    legend.set_title("No. Genes")
    plt.gca().add_artist(legend)
    plt.savefig(os.path.join(fig_dir, f'DotPlot_GO_{c}.svg'))
    plt.show()

In [20]:

comparisons

Out[20]:

{'TMDE': ['Input_Data/ORA_PanCan/ClusterGoSummary_TMDE_RCM.csv',
  'Input_Data/ORA_ccRCC/ClusterGoSummary_TMDE_RCM.csv'],
 'TMDS': ['Input_Data/ORA_PanCan/ClusterGoSummary_TMDS_RCM.csv',
  'Input_Data/ORA_ccRCC/ClusterGoSummary_TMDS_RCM.csv'],
 'TPDE_TMDS': ['Input_Data/ORA_PanCan/ClusterGoSummary_TPDE_TMDS_RCM.csv',
  'Input_Data/ORA_ccRCC/ClusterGoSummary_TPDE_TMDS_RCM.csv'],
 'TPDE': ['Input_Data/ORA_PanCan/ClusterGoSummary_TPDE_RCM.csv',
  'Input_Data/ORA_ccRCC/ClusterGoSummary_TPDE_RCM.csv'],
 'TPDS_TMDE': ['Input_Data/ORA_PanCan/ClusterGoSummary_TPDS_TMDE_RCM.csv',
  'Input_Data/ORA_ccRCC/ClusterGoSummary_TPDS_TMDE_RCM.csv'],
 'TPDS': ['Input_Data/ORA_PanCan/ClusterGoSummary_TPDS_RCM.csv',
  'Input_Data/ORA_ccRCC/ClusterGoSummary_TPDS_RCM.csv'],
 'MDS_TMDE': ['Input_Data/ORA_PanCan/ClusterGoSummary_MDS_TMDE_RCM.csv',
  'Input_Data/ORA_ccRCC/ClusterGoSummary_MDS_TMDE_RCM.csv'],
 'MDE': ['Input_Data/ORA_PanCan/ClusterGoSummary_MDE_RCM.csv',
  'Input_Data/ORA_ccRCC/ClusterGoSummary_MDE_RCM.csv'],
 'MDE_TMDS': ['Input_Data/ORA_PanCan/ClusterGoSummary_MDE_TMDS_RCM.csv',
  'Input_Data/ORA_ccRCC/ClusterGoSummary_MDE_TMDS_RCM.csv'],
 'MDS': ['Input_Data/ORA_PanCan/ClusterGoSummary_MDS_RCM.csv',
  'Input_Data/ORA_ccRCC/ClusterGoSummary_MDS_RCM.csv']}

In [24]:

with pd.ExcelWriter('SiRCLe_PanCan_ORA_joined_sheets.xlsx') as writer:
        # Also read in the patient information
    for c in comparisons:
        combined_df = pd.DataFrame()
        rna_df = pd.read_csv(f'{comparisons[c][0]}', index_col=0)
        rna_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in rna_df['GeneRatio'].values]
        rna_df['Data'] = 'PanCan'
        rna_df = rna_df[rna_df['p.adjust'] < 0.05]

        rna_df.sort_values('p.adjust', inplace=True)

        protein_df = pd.read_csv(f'{comparisons[c][1]}', index_col=0)
        protein_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in protein_df['GeneRatio'].values]
        protein_df['Data'] = 'ccRCC'
        protein_df = protein_df[protein_df['p.adjust'] < 0.05]
        protein_df.sort_values('p.adjust', inplace=True)
        
        combined_df = protein_df.join(rna_df, lsuffix='_ccRCC', rsuffix='_PanCan', how='outer')
        # geneID_PanCan and geneID_ccRCC
        cc = combined_df['geneID_ccRCC'].values
        cc_g_only = []
        p_g_only = []
        shared_g = []
        pc = []
        pp = []
        n_p = []
        n_c = []
        n_s = []
        for i, x in enumerate(combined_df['geneID_PanCan'].values):
            try:
                cc_g = cc[i].split('/')
            except:
                cc_g = []
            try:
                p_g = x.split('/')
            except:
                p_g = []
                
            shared = [g for g in cc_g if g in p_g]
            shared_g.append(' '.join(shared))
            cco = [g for g in cc_g if g not in p_g]
            cc_g_only.append(' '.join(cco))
            pgo = [g for g in p_g if g not in cc_g]
            p_g_only.append(' '.join(pgo))
            if len(cc_g) != 0 and len(shared) != 0:
                pc.append(len(shared)/len(cc_g))
            else:
                pc.append(0)
            if len(p_g) != 0 and len(shared) != 0:
                pp.append(len(shared)/len(p_g))
            else:
                pp.append(0)
            n_p.append(len(pgo))
            n_c.append(len(cco))
            n_s.append(len(shared))
        combined_df['ccRCC_genes_only'] = cc_g_only
        combined_df['PanCan_genes_only'] = p_g_only
        combined_df['shared'] = shared_g
        combined_df['percent shared of ccRCC'] = pc
        combined_df['percent shared of PanCan'] = pp
        combined_df['Number genes PanCan Only'] = n_p
        combined_df['Number genes ccRCC Only'] = n_c
        combined_df['Number genes shared'] = n_s
        combined_df.sort_values('Number genes shared', inplace=True)
        combined_df.to_excel(writer, sheet_name=c)

In [6]:

combined_df = protein_df.join(rna_df, lsuffix='_PanCan', rsuffix='_ccRCC', how='outer')
# geneID_PanCan and geneID_ccRCC
cc = combined_df['geneID_ccRCC'].values
cc_g_only = []
p_g_only = []
shared = []
pc = []
pp = []
n_p = []
n_c = []
n_s = []
for i, x in enumerate(combined_df['geneID_PanCan'].values):
    try:
        cc_g = cc[i].split('/')
    except:
        cc_g = []
    try:
        p_g = x.split('/')
    except:
        p_g = []
    if len(cc_g) > 0:
        print(p_g, cc_g)
    shared.append(' '.join([g for g in cc_g if g in p_g]))
    cc_g_only.append(' '.join([g for g in cc_g if g not in p_g]))
    p_g_only.append(' '.join([g for g in p_g if g not in cc_g]))
    pc.append(len(shared)/len(cc_g_only))
    pp.append(len(shared)/len(p_g_only))
    n_p.append(len(p_g_only))
    n_c.append(len(cc_g_only))
    n_s.append(len(shared))

combined_df['ccRCC_genes_only'] = cc_g_only
combined_df['PanCan_genes_only'] = p_g_only
combined_df['shared'] = shared
combined_df['percent shared of ccRCC'] = pc
combined_df['percent shared of PanCan'] = pp
combined_df['Number genes PanCan Only'] = n_p
combined_df['Number genes ccRCC Only'] = n_c
combined_df['Number genes shared'] = n_s

[] ['CA3', 'CA4']
[] ['HSPB6', 'CRYAB']
[] ['RGN', 'CYP27A1']
[] ['RGN', 'CYB5A']
[] ['RGN', 'CD34']
[] ['FHL1', 'CRYAB', 'RBP4', 'FRZB']
[] ['SOD3', 'CRYAB', 'RGN', 'CD34', 'GPX3']
[] ['RGN', 'CD34']
[] ['CRYAB', 'RGN', 'CD34']

In [7]:

combined_df

Out[7]:

	ONTOLOGY_PanCan	ID_PanCan	Description_PanCan	GeneRatio_PanCan	BgRatio_PanCan	pvalue_PanCan	p.adjust_PanCan	qvalue_PanCan	geneID_PanCan	Count_PanCan	...	Count_ccRCC	Data_ccRCC	ccRCC_genes_only	PanCan_genes_only	shared	percent shared of ccRCC	percent shared of PanCan	Number genes PanCan Only	Number genes ccRCC Only	Number genes shared
GO:0000038	BP	GO:0000038	very long-chain fatty acid metabolic process	0.037975	34/17152	6.036612e-07	0.000429	0.000388	ACAA1/ACOT2/ACOX1/ACSL6/ACOX2/ACOT1	6.0	...	NaN	NaN		ACAA1 ACOT2 ACOX1 ACSL6 ACOX2 ACOT1		1.0	1.0	1	1	1
GO:0001655	BP	GO:0001655	urogenital system development	0.063291	318/17152	7.419318e-04	0.042771	0.038690	COL4A4/CALB1/ALDH1A2/RGN/RBP4/AQP2/COL4A3/UMOD...	10.0	...	NaN	NaN		COL4A4 CALB1 ALDH1A2 RGN RBP4 AQP2 COL4A3 UMOD...		1.0	1.0	2	2	2
GO:0002791	BP	GO:0002791	regulation of peptide secretion	0.063291	303/17152	5.108787e-04	0.035468	0.032083	CASR/LLGL2/RAPGEF3/RAB11FIP3/CHGA/WLS/RBP4/HAD...	10.0	...	NaN	NaN		CASR LLGL2 RAPGEF3 RAB11FIP3 CHGA WLS RBP4 HAD...		1.0	1.0	3	3	3
GO:0004089	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	2.0	ccRCC	CA3 CA4			1.0	1.0	4	4	4
GO:0005212	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	2.0	ccRCC	HSPB6 CRYAB			1.0	1.0	5	5	5
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
GO:0098862	CC	GO:0098862	cluster of actin-based cell projections	0.038217	154/17707	2.524887e-03	0.036891	0.031450	CALB1/ATP6V0A4/FOLR1/PTH1R/DCXR/SLC22A12	6.0	...	NaN	NaN		CALB1 ATP6V0A4 FOLR1 PTH1R DCXR SLC22A12		1.0	1.0	73	73	73
GO:1901616	BP	GO:1901616	organic hydroxy compound catabolic process	0.031646	73/17152	5.646239e-04	0.035468	0.032083	CYP24A1/MOXD1/CYP27B1/LDHD/INPP5J	5.0	...	NaN	NaN		CYP24A1 MOXD1 CYP27B1 LDHD INPP5J		1.0	1.0	74	74	74
GO:1904406	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	2.0	ccRCC	RGN CD34			1.0	1.0	75	75	75
GO:1990204	CC	GO:1990204	oxidoreductase complex	0.031847	109/17707	2.865031e-03	0.039658	0.033809	DLST/DBT/GPD1L/UQCRFS1/OGDHL	5.0	...	NaN	NaN		DLST DBT GPD1L UQCRFS1 OGDHL		1.0	1.0	76	76	76
GO:2000378	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	3.0	ccRCC	CRYAB RGN CD34			1.0	1.0	77	77	77

77 rows × 30 columns

In [13]:

combined_df = pd.DataFrame()
rna_df = pd.read_csv(f'{comparisons[c][0]}', index_col=0)
rna_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in rna_df['GeneRatio'].values]
rna_df['Data'] = 'ccRCC'
rna_df = rna_df[rna_df['p.adjust'] < 0.05]

rna_df.sort_values('p.adjust', inplace=True)

protein_df = pd.read_csv(f'{comparisons[c][1]}', index_col=0)
protein_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in protein_df['GeneRatio'].values]
protein_df['Data'] = 'PanCan'
protein_df = protein_df[protein_df['p.adjust'] < 0.05]
protein_df.sort_values('p.adjust', inplace=True)


combined_df = protein_df.join(rna_df, lsuffix='_PanCan', rsuffix='_ccRCC', how='outer')
# geneID_PanCan and geneID_ccRCC
cc = combined_df['geneID_ccRCC'].values
cc_g_only = []
p_g_only = []
shared = []
for i, x in enumerate(combined_df['geneID_PanCan'].values):
    try:
        cc_g = cc[i].split('/')
    except:
        cc_g = []
    try:
        p_g = x.split('/')
    except:
        p_g = []
    shared.append(' '.join([g for g in cc_g if g in p_g]))
    cc_g_only.append(' '.join([g for g in cc_g if g not in p_g]))
    p_g_only.append(' '.join([g for g in p_g if g not in cc_g]))
combined_df['ccRCC_genes_only'] = cc_g_only
combined_df['PanCan_genes_only'] = p_g_only
combined_df['shared'] = shared
combined_df

Out[13]:

	ONTOLOGY_PanCan	ID_PanCan	Description_PanCan	GeneRatio_PanCan	BgRatio_PanCan	pvalue_PanCan	p.adjust_PanCan	qvalue_PanCan	geneID_PanCan	Count_PanCan	...	BgRatio_ccRCC	pvalue_ccRCC	p.adjust_ccRCC	qvalue_ccRCC	geneID_ccRCC	Count_ccRCC	Data_ccRCC	ccRCC_genes_only	PanCan_genes_only	shared
GO:0000038	BP	GO:0000038	very long-chain fatty acid metabolic process	0.037975	34/17152	6.036612e-07	0.000429	0.000388	ACAA1/ACOT2/ACOX1/ACSL6/ACOX2/ACOT1	6.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN		ACAA1 ACOT2 ACOX1 ACSL6 ACOX2 ACOT1
GO:0001655	BP	GO:0001655	urogenital system development	0.063291	318/17152	7.419318e-04	0.042771	0.038690	COL4A4/CALB1/ALDH1A2/RGN/RBP4/AQP2/COL4A3/UMOD...	10.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN		COL4A4 CALB1 ALDH1A2 RGN RBP4 AQP2 COL4A3 UMOD...
GO:0002791	BP	GO:0002791	regulation of peptide secretion	0.063291	303/17152	5.108787e-04	0.035468	0.032083	CASR/LLGL2/RAPGEF3/RAB11FIP3/CHGA/WLS/RBP4/HAD...	10.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN		CASR LLGL2 RAPGEF3 RAB11FIP3 CHGA WLS RBP4 HAD...
GO:0004089	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	12/17127	0.000168	0.015324	0.009927	CA3/CA4	2.0	ccRCC	CA3 CA4
GO:0005212	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	19/17127	0.000433	0.019712	0.012769	HSPB6/CRYAB	2.0	ccRCC	HSPB6 CRYAB
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
GO:0098862	CC	GO:0098862	cluster of actin-based cell projections	0.038217	154/17707	2.524887e-03	0.036891	0.031450	CALB1/ATP6V0A4/FOLR1/PTH1R/DCXR/SLC22A12	6.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN		CALB1 ATP6V0A4 FOLR1 PTH1R DCXR SLC22A12
GO:1901616	BP	GO:1901616	organic hydroxy compound catabolic process	0.031646	73/17152	5.646239e-04	0.035468	0.032083	CYP24A1/MOXD1/CYP27B1/LDHD/INPP5J	5.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN		CYP24A1 MOXD1 CYP27B1 LDHD INPP5J
GO:1904406	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	18/17251	0.000329	0.045530	0.035799	RGN/CD34	2.0	ccRCC	RGN CD34
GO:1990204	CC	GO:1990204	oxidoreductase complex	0.031847	109/17707	2.865031e-03	0.039658	0.033809	DLST/DBT/GPD1L/UQCRFS1/OGDHL	5.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN		DLST DBT GPD1L UQCRFS1 OGDHL
GO:2000378	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	62/17251	0.000108	0.026836	0.021101	CRYAB/RGN/CD34	3.0	ccRCC	CRYAB RGN CD34

77 rows × 25 columns

In [12]:

combined_df[combined_df['Description_PanCan'] == 'lung development']

Out[12]:

	ONTOLOGY_PanCan	ID_PanCan	Description_PanCan	GeneRatio_PanCan	BgRatio_PanCan	pvalue_PanCan	p.adjust_PanCan	qvalue_PanCan	geneID_PanCan	Count_PanCan	...	BgRatio_ccRCC	pvalue_ccRCC	p.adjust_ccRCC	qvalue_ccRCC	geneID_ccRCC	Count_ccRCC	Data_ccRCC	ccRCC_genes_only	PanCan_genes_only	shared

0 rows × 25 columns

Look at just what is shared...¶

In [14]:

import pandas as pd
from sciutil import SciUtil
from matplotlib_venn import venn3
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import matplotlib.cm as cm
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (1,3)
sns.set_theme(style="whitegrid")
plt.rcParams.update({'font.size': 8})

cmaps = {'Down regulated': {'CpG': 'darkorange', 'Protein': 'royalblue', 'RNA': 'teal'},
        'Up regulated': {'CpG': 'limegreen', 'Protein': 'darkred', 'RNA': 'lightcoral'},
        }
cmaps = {'MDS': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
         'MDS_TMDE': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
         'MDE': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
         'MDE_TMDS': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'}, 
         'TMDE': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'}, 
         'TMDS': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
         'TPDE': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'}, 
         'TPDE_TMDS': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
          'TPDS': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
         'TPDS_TMDE': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'}}

u = SciUtil()
plt.rc('legend', fontsize=8)    # legend fontsize
test_title = 'PanCan'
rcm_labels = ["TMDE", "TMDS", "TPDE_TMDS", "TPDE", "TPDS_TMDE", "TPDS", "MDS_TMDE", "MDE",  "MDE_TMDS", "MDS"]

comparisons = {}
# Create the comparisons for these 
for r in rcm_labels:
    comparisons[r] = [f'{input_dir}/ORA_PanCan/ClusterGoSummary_{r}_RCM.csv',
                      f'{input_dir}/ORA_ccRCC/ClusterGoSummary_{r}_RCM.csv',
                     ]

for c in comparisons:
    try:
        combined_df = pd.DataFrame()
        rna_df = pd.read_csv(f'{comparisons[c][0]}')
        rna_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in rna_df['GeneRatio'].values]
        rna_df['Data'] = 'RNA'
        rna_df = rna_df[rna_df['p.adjust'] < 0.05]
        rna_df = rna_df[rna_df['Count'] > 10]
        rna_df.sort_values('p.adjust', inplace=True)

        protein_df = pd.read_csv(f'{comparisons[c][1]}')
        protein_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in protein_df['GeneRatio'].values]
        protein_df['Data'] = 'Protein'
        protein_df = protein_df[protein_df['p.adjust'] < 0.05]
        protein_df = protein_df[protein_df['Count'] > 10]
        protein_df.sort_values('p.adjust', inplace=True)

        # Filter to only include shared GO terms
        shared_go = list(set(rna_df['ID'].values) & set(protein_df['ID'].values))
        combined_df = pd.concat([combined_df, rna_df])
        combined_df = pd.concat([combined_df, protein_df])
        rna_df = rna_df[rna_df['ID'].isin(shared_go)]
        protein_df = protein_df[protein_df['ID'].isin(shared_go)]

        # Concat all together
        num = 10
        rna_df = pd.concat([rna_df.head(num), protein_df.head(num)])
        rna_df.sort_values('p.adjust', inplace=True)

        max_c = max(rna_df['Count'].values)
        min_c = min(rna_df['Count'].values)
        size = [int(1 + ((int(c) - min_c)/(max_c - min_c))*100) for c in rna_df['Count'].values]

        norm = mpl.colors.Normalize(vmin=np.min(rna_df['p.adjust'].values), vmax=np.max(rna_df['p.adjust'].values))
        cmap = cm.RdBu
        m = cm.ScalarMappable(norm=norm, cmap=cmap)
        c_map = cmaps[c]
        colours = [c_map.get(v) for v in rna_df['Data'].values]
        plt.scatter(rna_df['GeneRatio'].values, rna_df['Description'].values, s=size, c=colours)
        plt.title(c)
        combined_df.to_csv(os.path.join(output_dir, f'{c}_StageIV-StageI-Tumour.csv'), index=False)

        gmin = plt.scatter([], [], s=10 + int(100*(min_c/(max_c - min_c))), marker='o', color='#222')
        gmax = plt.scatter([], [], s=10 + int(100*(1)), marker='o', color='#222')
        legend = plt.legend((gmin, gmax),
                            (str(min_c), str(max_c)),
                            scatterpoints=1,
                            loc='lower left',
                            ncol=1,
                            fontsize=8, bbox_to_anchor=(0, -0.1))

        legend.set_title("No. Genes")
        plt.gca().add_artist(legend)
        plt.savefig(os.path.join(fig_dir, f'DotPlot_GO_{c}_Shared.svg'))
        plt.show()
    except:
        print("NO shared", c)

NO shared TPDE

NO shared MDS

Also get the unique ones¶

In [24]:

import pandas as pd
from sciutil import SciUtil
from matplotlib_venn import venn3
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import matplotlib.cm as cm
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (1,3)
sns.set_theme(style="whitegrid")
plt.rcParams.update({'font.size': 8})

cmaps = {'Down regulated': {'CpG': 'teal', 'Protein': 'royalblue', 'RNA': 'teal'},
        'Up regulated': {'CpG': 'limegreen', 'Protein': 'darkred', 'RNA': 'lightcoral'},
        }
cmaps = {'MDS': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
         'MDS_TMDE': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
         'MDE': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
         'MDE_TMDS': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'}, 
         'TMDE': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'}, 
         'TMDS': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
         'TPDE': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'}, 
         'TPDE_TMDS': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
          'TPDS': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'},
         'TPDS_TMDE': {'CpG': 'darkorange', 'Protein': 'darkorange', 'RNA': 'grey'}}

u = SciUtil()
plt.rc('legend', fontsize=8)    # legend fontsize
test_title = 'PanCan'
rcm_labels = ["TMDE", "TMDS", "TPDE_TMDS", "TPDE", "TPDS_TMDE", "TPDS", "MDS_TMDE", "MDE",  "MDE_TMDS", "MDS"]

comparisons = {}
# Create the comparisons for these 
for r in rcm_labels:
    comparisons[r] = [f'{input_dir}/ORA_PanCan/ClusterGoSummary_{r}_RCM.csv',
                      f'{input_dir}/ORA_ccRCC/ClusterGoSummary_{r}_RCM.csv',
                     ]

for c in comparisons:
    try:
        combined_df = pd.DataFrame()
        rna_df = pd.read_csv(f'{comparisons[c][0]}')
        rna_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in rna_df['GeneRatio'].values]
        rna_df['Data'] = 'RNA'
        rna_df = rna_df[rna_df['p.adjust'] < 0.05]
        rna_df = rna_df[rna_df['Count'] > 10]
        rna_df.sort_values('p.adjust', inplace=True)

        protein_df = pd.read_csv(f'{comparisons[c][1]}')
        protein_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in protein_df['GeneRatio'].values]
        protein_df['Data'] = 'Protein'
        protein_df = protein_df[protein_df['p.adjust'] < 0.05]
        protein_df = protein_df[protein_df['Count'] > 10]
        protein_df.sort_values('p.adjust', inplace=True)

        # Filter to only include shared GO terms
        shared_go = [x for x in rna_df['ID'].values if x not in list(protein_df['ID'].values)]
        combined_df = pd.concat([combined_df, rna_df])
        combined_df = pd.concat([combined_df, protein_df])
        rna_df = rna_df[rna_df['ID'].isin(shared_go)]
        
        shared_go = [x for x in protein_df['ID'].values if x not in list(rna_df['ID'].values)]
        protein_df = protein_df[protein_df['ID'].isin(shared_go)]

        # Concat all together
        num = 10
        rna_df = pd.concat([rna_df.head(num), protein_df.head(num)])
        rna_df.sort_values('p.adjust', inplace=True)

        max_c = max(rna_df['Count'].values)
        min_c = min(rna_df['Count'].values)
        size = [int(1 + ((int(c) - min_c)/(max_c - min_c))*100) for c in rna_df['Count'].values]

        norm = mpl.colors.Normalize(vmin=np.min(rna_df['p.adjust'].values), vmax=np.max(rna_df['p.adjust'].values))
        cmap = cm.RdBu
        m = cm.ScalarMappable(norm=norm, cmap=cmap)
        c_map = cmaps[c]
        colours = [c_map.get(v) for v in rna_df['Data'].values]
        plt.scatter(rna_df['GeneRatio'].values, rna_df['Description'].values, s=size, c=colours)
        plt.title(c)
        combined_df.to_csv(os.path.join(output_dir, f'{c}_StageIV-StageI-Tumour.csv'), index=False)

        gmin = plt.scatter([], [], s=10 + int(100*(min_c/(max_c - min_c))), marker='o', color='#222')
        gmax = plt.scatter([], [], s=10 + int(100*(1)), marker='o', color='#222')
        legend = plt.legend((gmin, gmax),
                            (str(min_c), str(max_c)),
                            scatterpoints=1,
                            loc='lower left',
                            ncol=1,
                            fontsize=8, bbox_to_anchor=(0, -0.1))

        legend.set_title("No. Genes")
        plt.gca().add_artist(legend)
        plt.savefig(os.path.join(fig_dir, f'DotPlot_GO_{c}_UNIQUE.svg'))
        plt.show()
    except:
        print("NO shared", c)

Next we want to look at what genes differ in the stage I vs stage IV comparison that come up as important¶

In [12]:

from sciutil import SciUtil

u = SciUtil()

ccrcc = pd.read_csv(os.path.join(input_dir, 'ClearCellRenalCellCarcinoma_mean_Integrated_comparison_Late-Early.csv'))
pancan = pd.read_csv(os.path.join(input_dir, 'PanCan_mean_Integrated_comparison_Late-Early.csv'))

rcm_labels = ["TMDE", "TMDS", "TPDE_TMDS", "TPDE", "TPDS_TMDE", "TPDS", "MDS_TMDE", "MDE",  "MDE_TMDS", "MDS"]

for r in rcm_labels:
    ccrcc_r = ccrcc[ccrcc[reg_label].values == r]
    pancan_r = pancan[pancan[reg_label].values == r]
    #ccrcc_r = ccrcc_r[ccrcc_r['Integrated padj (Late-Early)'] < 0.25]
    if len(ccrcc_r.values) > 1:
        ccrcc_r.sort_values('Integrated diff (Late-Early)', inplace=True)
        u.dp(['ccRCC LOW tail', r])
        print('\n'.join(list(ccrcc_r.head()['id'].values)))
        u.dp(['ccRCC HIGH tail', r])
        print('\n'.join(list(ccrcc_r.tail()['id'].values)))

--------------------------------------------------------------------------------
                              ccRCC LOW tail	TMDE	                              
--------------------------------------------------------------------------------
PAMR1
DGKH
ARHGEF10
COL6A3
A2M
--------------------------------------------------------------------------------
                             ccRCC HIGH tail	TMDE	                              
--------------------------------------------------------------------------------
TMED9
XPOT
CCDC86
NCEH1
HM13
--------------------------------------------------------------------------------
                              ccRCC LOW tail	TMDS	                              
--------------------------------------------------------------------------------
FBLIM1
TPRN
HOPX
CENPV
CLDN7
--------------------------------------------------------------------------------
                             ccRCC HIGH tail	TMDS	                              
--------------------------------------------------------------------------------
FASTKD3
ACOT8
ECSIT
SLC25A1
SFXN1
--------------------------------------------------------------------------------
                           ccRCC LOW tail	TPDE_TMDS	                            
--------------------------------------------------------------------------------
SPRY1
PTCHD4
C3orf36
DOCK9-AS1
CYSLTR1
--------------------------------------------------------------------------------
                           ccRCC HIGH tail	TPDE_TMDS	                           
--------------------------------------------------------------------------------
RAB9B
TP73
TRIM36
EPHA10
C1QL1
--------------------------------------------------------------------------------
                              ccRCC LOW tail	TPDE	                              
--------------------------------------------------------------------------------
MARCKSL1
MPZL2
MMRN2
RFLNB
LRRC8C
--------------------------------------------------------------------------------
                             ccRCC HIGH tail	TPDE	                              
--------------------------------------------------------------------------------
FAM114A1
ERAP1
RIPK2
TRIP13
APOL2
--------------------------------------------------------------------------------
                           ccRCC LOW tail	TPDS_TMDE	                            
--------------------------------------------------------------------------------
SELE
ARHGAP40
MROH2B
TDRD9
CPAMD8
--------------------------------------------------------------------------------
                           ccRCC HIGH tail	TPDS_TMDE	                           
--------------------------------------------------------------------------------
SLC22A10
IL13RA2
ERICH6
C11orf87
FER1L6-AS2
--------------------------------------------------------------------------------
                              ccRCC LOW tail	TPDS	                              
--------------------------------------------------------------------------------
SCIN
ADH1B
SCD5
ALDOB
LAMA2
--------------------------------------------------------------------------------
                             ccRCC HIGH tail	TPDS	                              
--------------------------------------------------------------------------------
PC
ALDH2
FAM151A
IGF2BP2
DNAJC12
--------------------------------------------------------------------------------
                            ccRCC LOW tail	MDS_TMDE	                            
--------------------------------------------------------------------------------
FLJ16779
SYT14
CDO1
HOXA7
FGF10
--------------------------------------------------------------------------------
                           ccRCC HIGH tail	MDS_TMDE	                            
--------------------------------------------------------------------------------
SLC16A10
GPR88
B4GALNT2
FBXW10
TFAP2A
--------------------------------------------------------------------------------
                              ccRCC LOW tail	MDE	                               
--------------------------------------------------------------------------------
CALCRL
CD36
MEF2C
LZTS1
VWA1
--------------------------------------------------------------------------------
                              ccRCC HIGH tail	MDE	                              
--------------------------------------------------------------------------------
MYDGF
FNDC3B
P4HB
IL4I1
FABP6
--------------------------------------------------------------------------------
                            ccRCC LOW tail	MDE_TMDS	                            
--------------------------------------------------------------------------------
STEAP4
BCL11A
VIP
ACKR1
CD69
--------------------------------------------------------------------------------
                           ccRCC HIGH tail	MDE_TMDS	                            
--------------------------------------------------------------------------------
UGT1A10
IGF2BP3
MXD3
GXYLT2
IL20RB
--------------------------------------------------------------------------------
                              ccRCC LOW tail	MDS	                               
--------------------------------------------------------------------------------
CRHBP
NDRG2
PBX1
DNAJC11
HS6ST1
--------------------------------------------------------------------------------
                              ccRCC HIGH tail	MDS	                              
--------------------------------------------------------------------------------
FOLR1
HADH
XYLB
CYP8B1
PFN2

/var/folders/gq/6ljhmvm1713fykdjqbl188pm0000gn/T/ipykernel_28439/1340551624.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ccrcc_r.sort_values('Integrated diff (Late-Early)', inplace=True)

In [13]:

for r in rcm_labels:
    pancan_r = pancan[pancan[reg_label].values == r]
    #pancan_r = pancan_r[pancan_r['Integrated padj (Late-Early)'] < 0.2]
    if len(pancan_r.values) > 1:
        pancan_r.sort_values('Integrated diff (Late-Early)', inplace=True)
        u.dp(['PanCan LOW tail', r])
        print('\n'.join(list(pancan_r.head()['id'].values)))
        u.dp(['PanCan HIGH tail', r])
        print('\n'.join(list(pancan_r.tail()['id'].values)))

--------------------------------------------------------------------------------
                             PanCan LOW tail	TMDE	                              
--------------------------------------------------------------------------------
ITGBL1
SELENOK
CPD
BZW2
SPINT1
--------------------------------------------------------------------------------
                             PanCan HIGH tail	TMDE	                             
--------------------------------------------------------------------------------
BMP1
TAP2
SERPINE1
EIF2AK2
OAS2
--------------------------------------------------------------------------------
                             PanCan LOW tail	TMDS	                              
--------------------------------------------------------------------------------
KLK11
PPL
ABLIM2
COQ9
EVPL
--------------------------------------------------------------------------------
                             PanCan HIGH tail	TMDS	                             
--------------------------------------------------------------------------------
CAV2
SERPINA1
CTSH
RRAS
WFS1
--------------------------------------------------------------------------------
                           PanCan LOW tail	TPDE_TMDS	                           
--------------------------------------------------------------------------------
CILP2
MESTIT1
DERL3
LINC01224
ALG1L2
--------------------------------------------------------------------------------
                          PanCan HIGH tail	TPDE_TMDS	                           
--------------------------------------------------------------------------------
HOXD10
NCCRP1
BNC1
KLK13
CLCA2
--------------------------------------------------------------------------------
                             PanCan LOW tail	TPDE	                              
--------------------------------------------------------------------------------
MDK
CRABP2
JPT1
COL11A1
LAD1
--------------------------------------------------------------------------------
                             PanCan HIGH tail	TPDE	                             
--------------------------------------------------------------------------------
LOXL2
PLEK2
KRT17
IGF2BP1
KRT14
--------------------------------------------------------------------------------
                           PanCan LOW tail	TPDS_TMDE	                           
--------------------------------------------------------------------------------
SPINK7
LAMB4
SLN
PLP1
GREM2
--------------------------------------------------------------------------------
                          PanCan HIGH tail	TPDS_TMDE	                           
--------------------------------------------------------------------------------
CLDN18
SFTPA1
SFTA1P
RASGRF1
PCDH15
--------------------------------------------------------------------------------
                             PanCan LOW tail	TPDS	                              
--------------------------------------------------------------------------------
KRT4
HOPX
CMA1
MAPT
SH3BGRL2
--------------------------------------------------------------------------------
                             PanCan HIGH tail	TPDS	                             
--------------------------------------------------------------------------------
STARD13
STX11
NPNT
FCN3
CAV1
--------------------------------------------------------------------------------
                           PanCan LOW tail	MDS_TMDE	                            
--------------------------------------------------------------------------------
MAL
SLC27A6
CWH43
PPP1R3C
ZNF844
--------------------------------------------------------------------------------
                           PanCan HIGH tail	MDS_TMDE	                           
--------------------------------------------------------------------------------
C8orf34-AS1
GATA6
HHIP
AGER
TBX4
--------------------------------------------------------------------------------
                              PanCan LOW tail	MDE	                              
--------------------------------------------------------------------------------
ANKRD22
EZH2
C15orf48
ITGA11
FNDC1
--------------------------------------------------------------------------------
                             PanCan HIGH tail	MDE	                              
--------------------------------------------------------------------------------
OAS3
MMP1
LAMC2
INHBA
S100A7
--------------------------------------------------------------------------------
                           PanCan LOW tail	MDE_TMDS	                            
--------------------------------------------------------------------------------
CP
IGLL5
UGT1A10
CEACAM5
EYA2
--------------------------------------------------------------------------------
                           PanCan HIGH tail	MDE_TMDS	                           
--------------------------------------------------------------------------------
LINC00707
PI3
KLK10
TENM2
LINC00520
--------------------------------------------------------------------------------
                              PanCan LOW tail	MDS	                              
--------------------------------------------------------------------------------
FRZB
PHYHD1
PLAC9
MFAP4
SOD3
--------------------------------------------------------------------------------
                             PanCan HIGH tail	MDS	                              
--------------------------------------------------------------------------------
CYB5A
SLC9A3R2
SHANK3
PPP1R14A
CA4

/var/folders/gq/6ljhmvm1713fykdjqbl188pm0000gn/T/ipykernel_28439/2038062952.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pancan_r.sort_values('Integrated diff (Late-Early)', inplace=True)

For comparison look at each of the raw logFCs for methylation, RNA, and protein and check if the low and high changes correspond¶

i.e. we want to check if they compare

In [14]:

pancan.sort_values('Protein-LogFC mean (Late-Early)', inplace=True)
u.dp(['PanCan LOW tail Protein', r])
print('\n'.join(list(pancan.head()['id'].values)))
u.dp(['PanCan HIGH tail Protein', r])
print('\n'.join(list(pancan.tail()['id'].values)))

--------------------------------------------------------------------------------
                          PanCan LOW tail Protein	MDS	                          
--------------------------------------------------------------------------------
KRT13
ADH7
PLA2G2A
CRABP2
PTN
--------------------------------------------------------------------------------
                         PanCan HIGH tail Protein	MDS	                          
--------------------------------------------------------------------------------
ANXA3
ENPEP
CAMP
CAV2
CA4

In [15]:

pancan.sort_values('RNA-LogFC mean (Late-Early)', inplace=True)
u.dp(['PanCan LOW tail RNA', r])
print('\n'.join(list(pancan.head()['id'].values)))
u.dp(['PanCan HIGH tail RNA', r])
print('\n'.join(list(pancan.tail()['id'].values)))

--------------------------------------------------------------------------------
                            PanCan LOW tail RNA	MDS	                            
--------------------------------------------------------------------------------
KRT13
KRT4
KRT19
CEACAM5
KRT15
--------------------------------------------------------------------------------
                           PanCan HIGH tail RNA	MDS	                            
--------------------------------------------------------------------------------
INHBA
SFTPA1
FCN3
CAV1
AGER

In [16]:

pancan.sort_values('CpG-LogFC mean (Late-Early)', inplace=True)
u.dp(['PanCan LOW tail CpG', r])
print('\n'.join(list(pancan.head()['id'].values)))
u.dp(['PanCan HIGH tail CpG', r])
print('\n'.join(list(pancan.tail()['id'].values)))

--------------------------------------------------------------------------------
                            PanCan LOW tail CpG	MDS	                            
--------------------------------------------------------------------------------
SPATS2
ATP8A1
PCCB
CPD
AQP4
--------------------------------------------------------------------------------
                           PanCan HIGH tail CpG	MDS	                            
--------------------------------------------------------------------------------
ERO1A
OVCH2
CLDN18
ROBO4
HOXC9

In [17]:

ccrcc.sort_values('Protein-LogFC mean (Late-Early)', inplace=True)
u.dp(['ccRCC LOW tail Protein', r])
print('\n'.join(list(ccrcc.head()['id'].values)))
u.dp(['ccRCC HIGH tail Protein', r])
print('\n'.join(list(ccrcc.tail()['id'].values)))

--------------------------------------------------------------------------------
                          ccRCC LOW tail Protein	MDS	                           
--------------------------------------------------------------------------------
UMOD
CPA3
SLC7A9
CLDN7
G6PC
--------------------------------------------------------------------------------
                          ccRCC HIGH tail Protein	MDS	                          
--------------------------------------------------------------------------------
GLIPR1
PDIA2
FABP7
IGFLR1
FABP6

In [18]:

ccrcc.sort_values('RNA-LogFC mean (Late-Early)', inplace=True)
u.dp(['ccRCC LOW tail RNA', r])
print('\n'.join(list(ccrcc.head()['id'].values)))
u.dp(['ccRCC HIGH tail RNA', r])
print('\n'.join(list(ccrcc.tail()['id'].values)))

--------------------------------------------------------------------------------
                            ccRCC LOW tail RNA	MDS	                             
--------------------------------------------------------------------------------
LTF
ALDOB
STEAP4
LUM
REG1A
--------------------------------------------------------------------------------
                            ccRCC HIGH tail RNA	MDS	                            
--------------------------------------------------------------------------------
IGF2BP3
C1QL1
OR2AT4
FAM151A
IL20RB

In [19]:

ccrcc.sort_values('CpG-LogFC mean (Late-Early)', inplace=True)
u.dp(['ccRCC LOW tail CpG', r])
print('\n'.join(list(ccrcc.head()['id'].values)))
u.dp(['ccRCC HIGH tail CpG', r])
print('\n'.join(list(ccrcc.tail()['id'].values)))

--------------------------------------------------------------------------------
                            ccRCC LOW tail CpG	MDS	                             
--------------------------------------------------------------------------------
SMC4
B4GALNT2
GTSF1
GPR82
CASK
--------------------------------------------------------------------------------
                            ccRCC HIGH tail CpG	MDS	                            
--------------------------------------------------------------------------------
ESF1
CACNA2D3
TMEM155
HOXA7
RAB9B

Lastly look at the pathways!¶

In [20]:

pancan_metabolic_pathways = pd.read_csv(os.path.join(input_dir, 'PanCan_Late_vs_Early_All_GSEA_MetabolicPathways.csv'))
ccrcc_metabolic_pathways = pd.read_csv(os.path.join(input_dir, f'ClearCellRenalCellCarcinoma_Late_vs_Early_All_GSEA_MetabolicPathways.csv'))
ccrcc_metabolic_pathways

Out[20]:

	pathway	pval	padj	ES	NES	nMoreExtreme	size	leadingEdge
0	Alanine and Aspartate Metabolism	0.014339	0.118568	0.565721	1.628672	92	20	NaN
1	Amino and Nucleotide Sugar Metabolism	0.503112	0.749078	0.550645	0.998549	2909	4	NaN
2	Arginine and Proline Metabolism	0.096271	0.353695	0.495757	1.375701	616	17	NaN
3	beta-Alanine metabolism	0.044252	0.247074	-0.769299	-1.590818	183	5	NaN
4	Bile Acid Biosynthesis	0.557572	0.776164	0.369497	0.931154	3471	12	NaN
...	...	...	...	...	...	...	...	...
62	Tryptophan metabolism	0.074895	0.330567	0.512519	1.422215	479	17	NaN
63	Tyrosine metabolism	0.671626	0.818163	0.272046	0.866257	4558	30	NaN
64	Urea Cycle	0.622580	0.786314	0.382026	0.883746	3793	9	NaN
65	Valine, Leucine and Isoleucine Metabolism	0.002946	0.040228	0.542631	1.739766	19	31	NaN
66	Xenobiotics Metabolism	0.264779	0.521771	0.412428	1.158738	1701	18	NaN

67 rows × 8 columns

In [21]:

ccrcc_metabolic_pathways = ccrcc_metabolic_pathways[ccrcc_metabolic_pathways['padj'] < 0.25]
ccrcc_metabolic_pathways

Out[21]:

	pathway	pval	padj	ES	NES	nMoreExtreme	size	leadingEdge
0	Alanine and Aspartate Metabolism	0.014339	0.118568	0.565721	1.628672	92	20	NaN
3	beta-Alanine metabolism	0.044252	0.247074	-0.769299	-1.590818	183	5	NaN
10	Citric Acid Cycle	0.000282	0.007494	0.547291	1.925616	1	48	NaN
17	Ethanol Metabolism	0.018351	0.122951	-0.713552	-1.726208	72	8	NaN
26	Glutamate metabolism	0.003753	0.041904	0.567009	1.741973	24	26	NaN
27	Glutathione Metabolism	0.012191	0.116687	0.689702	1.646831	74	10	NaN
29	Glycine, Serine and Threonine Metabolism	0.033878	0.206350	0.465506	1.492490	229	31	NaN
42	Oxidative Phosphorylation	0.000139	0.007494	0.574652	2.109133	0	60	NaN
49	Protein Modification	0.000336	0.007494	0.920089	1.976506	1	7	NaN
57	Steroid Metabolism	0.003002	0.040228	0.576724	1.771820	19	26	NaN
60	Transport, Lysosomal	0.015927	0.118568	0.569939	1.621708	102	19	NaN
65	Valine, Leucine and Isoleucine Metabolism	0.002946	0.040228	0.542631	1.739766	19	31	NaN

In [22]:

pancan_metabolic_pathways = pancan_metabolic_pathways[pancan_metabolic_pathways['padj'] < 0.25]
pancan_metabolic_pathways

Out[22]:

	pathway	pval	padj	ES	NES	nMoreExtreme	size	leadingEdge
0	O-Glycan Biosynthesis	0.001418	0.060993	-0.940797	-1.851958	4	4	NaN

In [ ]: