Notebook VAE Part 2 ccRCC Figure 4
Consolidate pathways so that we can do the metabolomics vis¶
In [1]:
# Need to add in entrez gene ID and also make labels for genes for ORA
# Imports
import pandas as pd
import os
from sciutil import SciUtil
# Setup file locations and label of the cancer
u = SciUtil()
cancer = 'ClearCellRenalCellCarcinoma'
input_dir = 'Input_RCM'
output_dir = 'Output_Data'
supp_dir = 'Required_Refs'
fig_dir = 'Output_Figures'
regLabel = 'RG2_Changes_filtered'
files = [f for f in os.listdir(fig_dir) if 'GSEA' in f]
# Save consolidated pathways for each of the analyses
conditions = ['Stage_IV_vs_Stage_I', 'Late_vs_Early']
for cond in conditions:
c_df = pd.DataFrame()
for f in files:
if cond in f and '_Pathways' in f:
df = pd.read_csv(os.path.join(fig_dir, f))
c_df = pd.concat([c_df, df])
u.dp([f])
print(df[df['padj'] < 0.25])
c_df.to_csv(os.path.join(output_dir, f'{cond}_Pathways.tsv'), sep='\t', index=False)
-------------------------------------------------------------------------------- Stage_IV_vs_Stage_I_All_GSEA_Pathways.csv -------------------------------------------------------------------------------- pathway pval padj \ 6 KEGG_ASCORBATE_AND_ALDARATE_METABOLISM 0.027883 0.232360 10 KEGG_STEROID_HORMONE_BIOSYNTHESIS 0.000698 0.036642 11 KEGG_OXIDATIVE_PHOSPHORYLATION 0.000826 0.036642 57 KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM 0.019941 0.194614 62 KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 0.005635 0.089641 64 KEGG_DRUG_METABOLISM_OTHER_ENZYMES 0.008004 0.116732 83 KEGG_CELL_CYCLE 0.000264 0.036642 84 KEGG_OOCYTE_MEIOSIS 0.001516 0.053052 85 KEGG_P53_SIGNALING_PATHWAY 0.012213 0.153762 104 KEGG_CELL_ADHESION_MOLECULES_CAMS 0.014420 0.157719 109 KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION 0.000838 0.036642 132 KEGG_PROGESTERONE_MEDIATED_OOCYTE_MATURATION 0.004234 0.085325 134 KEGG_ADIPOCYTOKINE_SIGNALING_PATHWAY 0.002113 0.061641 138 KEGG_ALDOSTERONE_REGULATED_SODIUM_REABSORPTION 0.002983 0.074584 140 KEGG_PROXIMAL_TUBULE_BICARBONATE_RECLAMATION 0.013180 0.153762 141 KEGG_ALZHEIMERS_DISEASE 0.004876 0.085325 142 KEGG_PARKINSONS_DISEASE 0.027484 0.232360 144 KEGG_HUNTINGTONS_DISEASE 0.023250 0.214147 165 KEGG_ASTHMA 0.004829 0.085325 168 KEGG_ALLOGRAFT_REJECTION 0.012448 0.153762 174 KEGG_VIRAL_MYOCARDITIS 0.020017 0.194614 ES NES nMoreExtreme size leadingEdge 6 0.587537 1.540481 190 16 NaN 10 0.654090 1.892871 4 24 NaN 11 0.462153 1.704033 6 90 NaN 57 0.548087 1.568666 141 23 NaN 62 0.532872 1.658889 41 34 NaN 64 0.577942 1.654113 56 23 NaN 83 0.602546 1.928077 1 39 NaN 84 0.603294 1.788716 10 27 NaN 85 0.525849 1.596041 89 30 NaN 104 -0.318449 -1.422493 22 85 NaN 109 -0.472242 -1.825362 1 41 NaN 132 0.659466 1.729074 28 16 NaN 134 -0.552940 -1.874310 5 24 NaN 138 0.635651 1.763522 20 20 NaN 140 -0.544339 -1.703191 39 18 NaN 141 0.424413 1.554985 40 86 NaN 142 0.391843 1.421975 228 80 NaN 144 0.394850 1.434857 193 81 NaN 165 -0.542890 -1.790142 13 22 NaN 168 -0.447943 -1.622105 32 31 NaN 174 -0.382627 -1.502292 45 44 NaN -------------------------------------------------------------------------------- Late_vs_Early_TPDE_TMDS_GSEA_Pathways.csv -------------------------------------------------------------------------------- pathway pval padj \ 6 KEGG_NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION 0.003292 0.041415 8 KEGG_OOCYTE_MEIOSIS 0.003451 0.041415 16 KEGG_JAK_STAT_SIGNALING_PATHWAY 0.024649 0.197190 ES NES nMoreExtreme size leadingEdge 6 -0.658415 -1.894727 11 17 NaN 8 0.878130 1.636022 19 5 NaN 16 -0.846145 -1.549914 106 4 NaN -------------------------------------------------------------------------------- Late_vs_Early_TMDE_GSEA_Pathways.csv -------------------------------------------------------------------------------- pathway pval padj ES \ 15 KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY 0.016882 0.219471 -0.830475 16 KEGG_B_CELL_RECEPTOR_SIGNALING_PATHWAY 0.016882 0.219471 -0.830475 NES nMoreExtreme size leadingEdge 15 -1.682063 59 4 NaN 16 -1.682063 59 4 NaN -------------------------------------------------------------------------------- Late_vs_Early_TPDS_TMDE_GSEA_Pathways.csv -------------------------------------------------------------------------------- pathway pval padj \ 1 KEGG_ASCORBATE_AND_ALDARATE_METABOLISM 0.041644 0.183234 2 KEGG_STEROID_HORMONE_BIOSYNTHESIS 0.016535 0.121253 3 KEGG_STARCH_AND_SUCROSE_METABOLISM 0.041644 0.183234 4 KEGG_RETINOL_METABOLISM 0.005062 0.054677 5 KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM 0.025764 0.161945 6 KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 0.004978 0.054677 7 KEGG_DRUG_METABOLISM_CYTOCHROME_P450 0.006213 0.054677 14 KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION 0.002990 0.054677 15 KEGG_CHEMOKINE_SIGNALING_PATHWAY 0.057613 0.230453 24 KEGG_AXON_GUIDANCE 0.032810 0.180455 31 KEGG_HEMATOPOIETIC_CELL_LINEAGE 0.002692 0.054677 ES NES nMoreExtreme size leadingEdge 1 0.802752 1.459200 230 5 NaN 2 0.728244 1.604836 97 10 NaN 3 0.802752 1.459200 230 5 NaN 4 0.772047 1.701366 29 10 NaN 5 0.795713 1.519386 144 6 NaN 6 0.730575 1.730148 29 13 NaN 7 0.752845 1.701978 36 11 NaN 14 -0.734764 -1.819878 11 12 NaN 15 -0.740379 -1.497316 251 6 NaN 24 -0.747647 -1.584614 139 7 NaN 31 -0.908226 -1.732726 11 5 NaN -------------------------------------------------------------------------------- Late_vs_Early_MDE_TMDS_GSEA_Pathways.csv -------------------------------------------------------------------------------- pathway pval padj \ 0 KEGG_STEROID_HORMONE_BIOSYNTHESIS 0.006767 0.106913 3 KEGG_DRUG_METABOLISM_OTHER_ENZYMES 0.000549 0.043343 13 KEGG_CELL_CYCLE 0.021322 0.240629 14 KEGG_OOCYTE_MEIOSIS 0.006439 0.106913 22 KEGG_NOTCH_SIGNALING_PATHWAY 0.025116 0.248017 28 KEGG_CELL_ADHESION_MOLECULES_CAMS 0.005335 0.106913 39 KEGG_NATURAL_KILLER_CELL_MEDIATED_CYTOTOXICITY 0.015328 0.201825 40 KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY 0.005542 0.106913 ES NES nMoreExtreme size leadingEdge 0 0.875150 1.639698 36 4 NaN 3 0.943345 1.767470 2 4 NaN 13 0.661857 1.598604 120 9 NaN 14 0.759500 1.696611 35 7 NaN 22 -0.776874 -1.640029 113 5 NaN 28 -0.517999 -1.747505 21 23 NaN 39 -0.512668 -1.653863 62 20 NaN 40 -0.583989 -1.804562 22 17 NaN -------------------------------------------------------------------------------- Late_vs_Early_All_GSEA_Pathways.csv -------------------------------------------------------------------------------- pathway pval padj \ 1 KEGG_CITRATE_CYCLE_TCA_CYCLE 0.005114 0.055938 6 KEGG_ASCORBATE_AND_ALDARATE_METABOLISM 0.013492 0.109754 7 KEGG_FATTY_ACID_METABOLISM 0.060782 0.247370 10 KEGG_STEROID_HORMONE_BIOSYNTHESIS 0.007009 0.068143 11 KEGG_OXIDATIVE_PHOSPHORYLATION 0.000132 0.006244 14 KEGG_ALANINE_ASPARTATE_AND_GLUTAMATE_METABOLISM 0.000602 0.009572 17 KEGG_VALINE_LEUCINE_AND_ISOLEUCINE_DEGRADATION 0.000435 0.008468 19 KEGG_LYSINE_DEGRADATION 0.008990 0.082801 20 KEGG_ARGININE_AND_PROLINE_METABOLISM 0.004742 0.055938 24 KEGG_TRYPTOPHAN_METABOLISM 0.000598 0.009572 35 KEGG_GLYCOSAMINOGLYCAN_BIOSYNTHESIS_HEPARAN_SU... 0.019814 0.144478 48 KEGG_GLYOXYLATE_AND_DICARBOXYLATE_METABOLISM 0.051718 0.220750 49 KEGG_PROPANOATE_METABOLISM 0.018201 0.138486 50 KEGG_BUTANOATE_METABOLISM 0.037787 0.204882 54 KEGG_PANTOTHENATE_AND_COA_BIOSYNTHESIS 0.023732 0.148322 57 KEGG_PORPHYRIN_AND_CHLOROPHYLL_METABOLISM 0.006882 0.068143 61 KEGG_AMINOACYL_TRNA_BIOSYNTHESIS 0.042222 0.204882 64 KEGG_DRUG_METABOLISM_OTHER_ENZYMES 0.003059 0.041173 67 KEGG_RIBOSOME 0.043937 0.204882 69 KEGG_DNA_REPLICATION 0.004941 0.055938 72 KEGG_BASE_EXCISION_REPAIR 0.030388 0.177264 76 KEGG_MAPK_SIGNALING_PATHWAY 0.041762 0.204882 79 KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION 0.039503 0.204882 82 KEGG_NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION 0.000403 0.008468 83 KEGG_CELL_CYCLE 0.055257 0.230236 84 KEGG_OOCYTE_MEIOSIS 0.023277 0.148322 85 KEGG_P53_SIGNALING_PATHWAY 0.023242 0.148322 88 KEGG_LYSOSOME 0.033134 0.187048 90 KEGG_PEROXISOME 0.000143 0.006244 93 KEGG_CARDIAC_MUSCLE_CONTRACTION 0.045659 0.204882 99 KEGG_TGF_BETA_SIGNALING_PATHWAY 0.012332 0.107904 102 KEGG_FOCAL_ADHESION 0.000379 0.008468 103 KEGG_ECM_RECEPTOR_INTERACTION 0.000329 0.008468 104 KEGG_CELL_ADHESION_MOLECULES_CAMS 0.001606 0.023427 106 KEGG_TIGHT_JUNCTION 0.013798 0.109754 108 KEGG_COMPLEMENT_AND_COAGULATION_CASCADES 0.042629 0.204882 132 KEGG_PROGESTERONE_MEDIATED_OOCYTE_MATURATION 0.048254 0.211111 141 KEGG_ALZHEIMERS_DISEASE 0.000266 0.008468 142 KEGG_PARKINSONS_DISEASE 0.000135 0.006244 144 KEGG_HUNTINGTONS_DISEASE 0.000134 0.006244 145 KEGG_PRION_DISEASES 0.020822 0.145756 146 KEGG_VIBRIO_CHOLERAE_INFECTION 0.025872 0.156127 167 KEGG_SYSTEMIC_LUPUS_ERYTHEMATOSUS 0.044651 0.204882 ES NES nMoreExtreme size leadingEdge 1 0.547111 1.717589 33 28 NaN 6 0.594532 1.615124 84 16 NaN 7 0.415302 1.398868 417 39 NaN 10 0.558980 1.689741 45 24 NaN 11 0.541345 2.116049 0 90 NaN 14 0.604427 1.897524 3 28 NaN 17 0.552392 1.875476 2 41 NaN 19 0.547871 1.656160 58 24 NaN 20 0.518489 1.705473 31 35 NaN 24 0.608842 1.927103 3 29 NaN 35 -0.750362 -1.654825 80 6 NaN 48 0.617985 1.478837 315 10 NaN 49 0.500974 1.572748 120 28 NaN 50 0.479204 1.490929 249 27 NaN 54 0.658941 1.576844 144 10 NaN 57 0.564159 1.687789 44 23 NaN 61 0.552395 1.500652 265 16 NaN 64 0.593996 1.777053 19 23 NaN 67 0.413574 1.435351 307 45 NaN 69 0.636515 1.699993 30 15 NaN 72 0.687294 1.535138 183 8 NaN 76 -0.310068 -1.345832 108 76 NaN 79 -0.281236 -1.302744 88 112 NaN 82 -0.432127 -1.916093 0 86 NaN 83 0.419709 1.413713 379 39 NaN 84 0.498118 1.549776 153 27 NaN 85 0.485798 1.548392 154 30 NaN 88 0.410228 1.453412 232 51 NaN 90 0.557240 1.946593 0 47 NaN 93 0.427980 1.441572 313 39 NaN 99 -0.518088 -1.687484 43 21 NaN 102 -0.451503 -1.941829 0 73 NaN 103 -0.572447 -2.207507 0 43 NaN 104 -0.378200 -1.671816 3 85 NaN 106 -0.401102 -1.546756 41 43 NaN 108 -0.365405 -1.401211 130 42 NaN 132 0.545288 1.481346 303 16 NaN 141 0.492825 1.914140 1 86 NaN 142 0.504846 1.936789 0 80 NaN 144 0.533168 2.050072 0 81 NaN 145 -0.587613 -1.663543 78 13 NaN 146 0.488622 1.533970 171 28 NaN 167 -0.367315 -1.403624 138 41 NaN -------------------------------------------------------------------------------- Late_vs_Early_MDS_GSEA_Pathways.csv -------------------------------------------------------------------------------- pathway pval padj \ 0 KEGG_FATTY_ACID_METABOLISM 0.028030 0.056394 1 KEGG_VALINE_LEUCINE_AND_ISOLEUCINE_DEGRADATION 0.081128 0.121692 3 KEGG_PPAR_SIGNALING_PATHWAY 0.002595 0.015572 4 KEGG_PEROXISOME 0.028197 0.056394 ES NES nMoreExtreme size leadingEdge 0 0.756713 1.652515 107 5 NaN 1 0.733443 1.479070 327 4 NaN 3 0.860951 1.880148 9 5 NaN 4 0.813519 1.640554 113 4 NaN -------------------------------------------------------------------------------- Late_vs_Early_MDS_TMDE_GSEA_Pathways.csv -------------------------------------------------------------------------------- Empty DataFrame Columns: [pathway, pval, padj, ES, NES, nMoreExtreme, size, leadingEdge] Index: [] -------------------------------------------------------------------------------- Late_vs_Early_TPDE_GSEA_Pathways.csv -------------------------------------------------------------------------------- Empty DataFrame Columns: [pathway, pval, padj, ES, NES, nMoreExtreme, size, leadingEdge] Index: [] -------------------------------------------------------------------------------- Late_vs_Early_MDE_GSEA_Pathways.csv -------------------------------------------------------------------------------- pathway pval padj ES NES nMoreExtreme \ 17 KEGG_AXON_GUIDANCE 0.000679 0.042748 0.813272 1.855256 3 size leadingEdge 17 8 NaN -------------------------------------------------------------------------------- Late_vs_Early_TPDS_GSEA_Pathways.csv -------------------------------------------------------------------------------- pathway pval padj \ 3 KEGG_ASCORBATE_AND_ALDARATE_METABOLISM 0.042114 0.237198 13 KEGG_LYSINE_DEGRADATION 0.036252 0.237198 14 KEGG_ARGININE_AND_PROLINE_METABOLISM 0.013734 0.237198 18 KEGG_TRYPTOPHAN_METABOLISM 0.002377 0.123613 19 KEGG_BETA_ALANINE_METABOLISM 0.050911 0.237198 20 KEGG_SELENOAMINO_ACID_METABOLISM 0.033902 0.237198 22 KEGG_GLYCEROLIPID_METABOLISM 0.037713 0.237198 25 KEGG_PYRUVATE_METABOLISM 0.037587 0.237198 27 KEGG_PROPANOATE_METABOLISM 0.054738 0.237198 32 KEGG_LIMONENE_AND_PINENE_DEGRADATION 0.037312 0.237198 44 KEGG_TIGHT_JUNCTION 0.051334 0.237198 48 KEGG_PROXIMAL_TUBULE_BICARBONATE_RECLAMATION 0.019702 0.237198 ES NES nMoreExtreme size leadingEdge 3 0.700539 1.532372 211 6 NaN 13 0.609111 1.575044 182 10 NaN 14 0.586664 1.690474 69 14 NaN 18 0.709532 1.834715 11 10 NaN 19 0.590893 1.527938 256 10 NaN 20 0.747188 1.531119 168 5 NaN 22 0.740437 1.517286 187 5 NaN 25 0.588003 1.563041 189 11 NaN 27 0.525767 1.515000 278 14 NaN 32 0.740906 1.518246 185 5 NaN 44 -0.567427 -1.521261 253 11 NaN 48 -0.753408 -1.661913 97 6 NaN -------------------------------------------------------------------------------- Late_vs_Early_TMDS_GSEA_Pathways.csv -------------------------------------------------------------------------------- pathway pval padj \ 1 KEGG_CITRATE_CYCLE_TCA_CYCLE 0.007217 0.099599 10 KEGG_VALINE_LEUCINE_AND_ISOLEUCINE_DEGRADATION 0.026498 0.203154 40 KEGG_ABC_TRANSPORTERS 0.013986 0.130040 47 KEGG_PEROXISOME 0.002829 0.097606 52 KEGG_CELL_ADHESION_MOLECULES_CAMS 0.005419 0.099599 53 KEGG_ADHERENS_JUNCTION 0.001553 0.097606 54 KEGG_TIGHT_JUNCTION 0.012353 0.130040 57 KEGG_FC_GAMMA_R_MEDIATED_PHAGOCYTOSIS 0.015077 0.130040 58 KEGG_LEUKOCYTE_TRANSENDOTHELIAL_MIGRATION 0.006211 0.099599 ES NES nMoreExtreme size leadingEdge 1 0.592830 1.656132 62 16 NaN 10 0.541362 1.532047 232 17 NaN 40 0.822230 1.557723 99 4 NaN 47 0.548184 1.694294 25 26 NaN 52 -0.656031 -1.925157 10 8 NaN 53 -0.853799 -2.014262 3 5 NaN 54 -0.558370 -1.798850 21 10 NaN 57 -0.818702 -1.751089 42 4 NaN 58 -0.787309 -1.857400 15 5 NaN
In [2]:
# Save consolidated pathways for each of the analyses
conditions = ['Stage_IV_vs_Stage_I', 'Late_vs_Early']
for cond in conditions:
c_df = pd.DataFrame()
for f in files:
if cond in f and 'MetabolicPathways' in f:
df = pd.read_csv(os.path.join(fig_dir, f))
c_df = pd.concat([c_df, df])
u.dp([f])
#print(df[df['padj'] < 0.25])
c_df.to_csv(os.path.join(output_dir, f'{cond}_MetabolicPathways.tsv'), sep='\t', index=False)
-------------------------------------------------------------------------------- Stage_IV_vs_Stage_I_All_GSEA_MetabolicPathways.csv -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Late_vs_Early_TMDE_GSEA_MetabolicPathways.csv -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Late_vs_Early_TMDS_GSEA_MetabolicPathways.csv -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Late_vs_Early_All_GSEA_MetabolicPathways.csv -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Late_vs_Early_TPDS_GSEA_MetabolicPathways.csv -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Late_vs_Early_MDE_TMDS_GSEA_MetabolicPathways.csv -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Late_vs_Early_TPDE_TMDS_GSEA_MetabolicPathways.csv -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Late_vs_Early_MDS_GSEA_MetabolicPathways.csv -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Late_vs_Early_TPDS_TMDE_GSEA_MetabolicPathways.csv -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Late_vs_Early_MDS_TMDE_GSEA_MetabolicPathways.csv -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Late_vs_Early_MDE_GSEA_MetabolicPathways.csv -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Late_vs_Early_TPDE_GSEA_MetabolicPathways.csv --------------------------------------------------------------------------------
Now look at what the pathways are for each of the S1 and S4/Early and late¶
In [3]:
import seaborn as sns
import matplotlib as mpl
import pandas as pd
import matplotlib.cm as cm
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (3,10)
sns.set_theme(style="whitegrid")
#cmaps = {'Down regulated': {'CpG': 'darkorange', 'Protein': 'royalblue', 'RNA': 'teal'},
# 'Up regulated': {'CpG': 'limegreen', 'Protein': 'darkred', 'RNA': 'lightcoral'},
# }
cmaps = {'Down regulated': {'CpG S1': 'darkorange', 'Protein S1': 'royalblue', 'RNA S1': 'teal',
'CpG S4': 'darkorange', 'Protein S4': 'royalblue', 'RNA S4': 'teal'},
'Up regulated': {'CpG S1': 'limegreen', 'Protein S1': 'darkred', 'RNA S1': 'lightcoral',
'CpG S4': 'limegreen', 'Protein S4': 'darkred', 'RNA S4': 'lightcoral'},
}
# cmaps = {'Down regulated': {'CpG S1': 'chocolate', 'Protein S1': 'royalblue', 'RNA S1': 'teal',
# 'CpG S4': 'orange', 'Protein S4': 'navy', 'RNA S4': 'turquoise'},
# 'Up regulated': {'CpG S1': 'green', 'Protein S1': 'red', 'RNA S1': 'orange',
# 'CpG S4': 'limegreen', 'Protein S4': 'darkred', 'RNA S4': 'lightcoral'},
# }
comparisons = {'Down regulated': ['SiRCle_StageIV__RNAseq_Negative_LogFC-1_pvalue0.05.csv',
'SiRCle_StageIV__Protein_Negative_LogFC-1_pvalue0.05.csv',
'SiRCle_StageI__Methylation_Positive_LogFC0.1_pvalue0.05.csv'
],
'Up regulated': ['SiRCle_StageIV__RNAseq_Positive_LogFC1_pvalue0.05.csv',
'SiRCle_StageIV__Protein_Positive_LogFC1_pvalue0.05.csv',
'SiRCle_StageIV__Methylation_Negative_LogFC-0.1_pvalue0.05.csv'
]
}
num = 10
sizes = {'Down regulated': {'min': 18, 'max': 374},
'Up regulated': {'min': 16, 'max': 448}}
for c in comparisons:
combined_df = pd.DataFrame()
rna_df = pd.read_csv(f'{data_dir}{comparisons[c][0].replace("IV", "I")}')
rna_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in rna_df['GeneRatio'].values]
rna_df['Data'] = 'RNA S1'
rna_df.sort_values('p.adjust', inplace=True)
protein_df = pd.read_csv(f'{data_dir}{comparisons[c][1].replace("IV", "I")}')
protein_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in protein_df['GeneRatio'].values]
protein_df['Data'] = 'Protein S1'
protein_df.sort_values('p.adjust', inplace=True)
cpg_df = pd.read_csv(f'{data_dir}{comparisons[c][2].replace("IV", "I")}')
cpg_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in cpg_df['GeneRatio'].values]
cpg_df['Data'] = 'CpG S1'
cpg_df.sort_values('p.adjust', inplace=True)
combined_df = combined_df.append(rna_df)
combined_df = combined_df.append(cpg_df)
combined_df = combined_df.append(protein_df)
rna_df = pd.concat([rna_df.head(num), cpg_df.head(num), protein_df.head(num)])
rna_df.sort_values('p.adjust', inplace=True)
max_c = sizes[c]['max'] ##max(rna_df['Count'].values)
min_c = sizes[c]['min'] #min(rna_df['Count'].values)
size = [int(1 + ((int(c) - min_c)/(max_c - min_c))*300) for c in rna_df['Count'].values]
norm = mpl.colors.Normalize(vmin=np.min(rna_df['p.adjust'].values), vmax=np.max(rna_df['p.adjust'].values))
cmap = cm.RdBu
m = cm.ScalarMappable(norm=norm, cmap=cmap)
c_map = cmaps[c]
colours = [c_map.get(v) for v in rna_df['Data'].values]
plt.scatter(rna_df['GeneRatio'].values, rna_df['Description'].values, s=size, c=colours, marker=',',
edgecolors='black', alpha=0.8)#c=m.to_rgba(rna_df['p.adjust'].values))
rna_df = pd.read_csv(f'{data_dir}{comparisons[c][0]}')
rna_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in rna_df['GeneRatio'].values]
rna_df['Data'] = 'RNA S4'
rna_df.sort_values('p.adjust', inplace=True)
protein_df = pd.read_csv(f'{data_dir}{comparisons[c][1]}')
protein_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in protein_df['GeneRatio'].values]
protein_df['Data'] = 'Protein S4'
protein_df.sort_values('p.adjust', inplace=True)
cpg_df = pd.read_csv(f'{data_dir}{comparisons[c][2]}')
cpg_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in cpg_df['GeneRatio'].values]
cpg_df['Data'] = 'CpG S4'
cpg_df.sort_values('p.adjust', inplace=True)
combined_df = combined_df.append(rna_df)
combined_df = combined_df.append(cpg_df)
combined_df = combined_df.append(protein_df)
# Concat all together
num = 10
rna_df = pd.concat([rna_df.head(num), cpg_df.head(num), protein_df.head(num)])
rna_df.sort_values('p.adjust', inplace=True)
max_c = sizes[c]['max'] ##max(rna_df['Count'].values)
min_c = sizes[c]['min'] #min(rna_df['Count'].values)
size = [int(1 + ((int(c) - min_c)/(max_c - min_c))*300) for c in rna_df['Count'].values]
norm = mpl.colors.Normalize(vmin=np.min(rna_df['p.adjust'].values), vmax=np.max(rna_df['p.adjust'].values))
cmap = cm.RdBu
m = cm.ScalarMappable(norm=norm, cmap=cmap)
c_map = cmaps[c]
colours = [c_map.get(v) for v in rna_df['Data'].values]
plt.scatter(rna_df['GeneRatio'].values, rna_df['Description'].values, s=size, c=colours, edgecolors='black', marker='*', alpha=0.8)#c=m.to_rgba(rna_df['p.adjust'].values))
plt.title(c)
# # Add legends
# cmap_labels = [f'G.shared:{min(overlapping_numbers)}-{max(overlapping_numbers)}']
# # create proxy artists as handles:
# cmaps = [edge_cmap]
# cmap_handles = [Rectangle((0, 0), 1, 1) for _ in cmaps]
# handler_map = dict(zip(cmap_handles,
# [HandlerColormap(cm, num_stripes=8) for cm in cmaps]))
# legend2 = plt.legend(handles=cmap_handles,
# labels=cmap_labels,
# handler_map=handler_map,
# fontsize=8, loc='upper right')
# plt.gca().add_artist(legend2)
combined_df.to_csv(f'{output_dir}{c}_StageIV-StageI-TvN.csv', index=False)
gmin = plt.scatter([], [], s=10 + int(300*(min_c/(max_c - min_c))), marker='o', color='#222')
#gmid = plt.scatter([], [], s=int(10 + 300*(((max_c - min_c)/2)/max_c)), marker='o', color='#222')
gmax = plt.scatter([], [], s=10 + int(300*(1)), marker='o', color='#222')
legend = plt.legend((gmin, gmax),
(str(min_c), str(max_c)),
scatterpoints=1,
loc='lower left',
ncol=1,
fontsize=8, bbox_to_anchor=(0, -0.1))
legend.set_title("No. Genes")
plt.gca().add_artist(legend)
plt.savefig(f'{output_dir}DotPlot_GO_{c}_S1-S4.svg')
plt.show()
#sns.stripplot(data=rna_df.head(30), y='Description', x='GeneRatio', size=size) #, color='p.adjust', size='Count')
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[3], line 41 39 for c in comparisons: 40 combined_df = pd.DataFrame() ---> 41 rna_df = pd.read_csv(f'{data_dir}{comparisons[c][0].replace("IV", "I")}') 42 rna_df['GeneRatio'] = [int(g.split('/')[0])/int(g.split('/')[1]) for g in rna_df['GeneRatio'].values] 43 rna_df['Data'] = 'RNA S1' NameError: name 'data_dir' is not defined
In [ ]: