Notebook for RNA processing part 2¶
Second notebook on RNA processing¶
This notebook is run after we have generated the clinical information that is used to divide the tumour and normal samples.
In [1]:
from scidat.api import API, APIException
from sciutil import SciUtil
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
### Paths and notebook-wide settings
# Every data location below is derived from this root.
base_dir = '../data/'
# Raw CPTAC downloads (not referenced by the cells shown in this notebook).
data_dir = base_dir + 'raw_downloads/CPTAC/'
# Folder the CSV files in this notebook are read from and written to.
output_dir = base_dir + 'sircle/F1_DE_input_TvN/'
# Figure and supplementary-download folders (not referenced by the cells shown here).
fig_dir = '../figures/'
supp_dir = base_dir + 'raw_downloads/supps/'
# Presumably the name of the gene-symbol column and a switch for writing figures
# to disk -- neither is used in the cells shown; TODO confirm against other notebooks.
gene_name = 'hgnc_symbol'
save_fig = False
In [2]:
# Load the filtered RNA table; its sample columns carry long labels such as
# 'CPTAC-3_SolidTissueNormal_male_notreported_1_...'.
df_filtered_annot = pd.read_csv(f'{output_dir}CCRCC_Clark_Cell2019_rna_filtered_df.csv')

# Select the sample columns ONCE and derive every per-sample list from that same
# selection, so the lists are guaranteed to have equal length and matching order.
# (Previously the lists used 'CP' while the condition loop used 'CPT'.)
rna_cols = [c for c in df_filtered_annot.columns if 'CP' in c]
# Case ID: first 9 characters of the second-to-last '_'-separated field.
rna_cases = [c.split('_')[-2][:9] for c in rna_cols]
# Sample ID: last '_'-separated field.
rna_ids = [c.split('_')[-1] for c in rna_cols]

# Classify each sample from its label: Tumor -> 1, Normal -> 0.
cond_name = []
cond_ids = []
unknown_labels = []
for c in rna_cols:
    if 'Tumor' in c:
        cond_name.append('Tumor')
        cond_ids.append(1)
    elif 'Normal' in c:
        cond_name.append('Normal')
        cond_ids.append(0)
    else:
        # No condition ID exists for this label; show the offending field and
        # collect the label so that we can fail with a clear message below.
        print(c.split('_')[1])
        unknown_labels.append(c)

# An unclassified sample used to leave cond_ids shorter than the other lists,
# which only surfaced later as an opaque pandas length-mismatch ValueError.
if unknown_labels:
    raise ValueError(
        f'Could not classify {len(unknown_labels)} sample column(s) as Tumor or Normal: {unknown_labels}'
    )

# One row per RNA sample column.
sample_df = pd.DataFrame({
    'SampleId': rna_ids,
    'CaseId': rna_cases,
    'CondId': cond_ids,
    'FullLabel': rna_cols,
    'CondName': cond_name,
})
# '-' is replaced by '.' to obtain case IDs that are safe to use as names in R.
r_safe_case_ids = [c.replace('-', '.') for c in rna_cases]
sample_df['SafeCases'] = r_safe_case_ids
sample_df
Out[2]:
SampleId | CaseId | CondId | FullLabel | CondName | SafeCases | |
---|---|---|---|---|---|---|
0 | b00d920a-f927-438d-af55-40b062523be4 | C3N-01648 | 0 | CPTAC-3_SolidTissueNormal_male_notreported_1_h... | Normal | C3N.01648 |
1 | 47c8a523-5d19-4f3f-8451-44543c00bbc1 | C3N-00646 | 0 | CPTAC-3_SolidTissueNormal_female_notreported_1... | Normal | C3N.00646 |
2 | 1ed67f72-25c2-4d1d-be14-55135cd55edc | C3N-01220 | 0 | CPTAC-3_SolidTissueNormal_male_notreported_1_h... | Normal | C3N.01220 |
3 | e6e0146a-1efd-4e5e-bb7d-89442ba8cb99 | C3L-00814 | 0 | CPTAC-3_SolidTissueNormal_male_notreported_1_h... | Normal | C3L.00814 |
4 | ac2f5fa6-c1d1-443d-800c-1452b5b9a85c | C3L-00447 | 0 | CPTAC-3_SolidTissueNormal_male_white_1_htseq.c... | Normal | C3L.00447 |
... | ... | ... | ... | ... | ... | ... |
241 | ca9f82fb-5cfa-4b46-ba53-6a7ae5037e57 | C3N-00494 | 1 | CPTAC-3_PrimaryTumor,PrimaryTumor_None_None_No... | Tumor | C3N.00494 |
242 | 11a60b10-37f0-4513-8f42-2826de79e344 | C3N-01214 | 1 | CPTAC-3_PrimaryTumor_male_notreported_1_htseq.... | Tumor | C3N.01214 |
243 | ac17a546-5ff3-4583-adf7-44efd0bfbadd | C3N-01200 | 1 | CPTAC-3_PrimaryTumor_female_notreported_1_htse... | Tumor | C3N.01200 |
244 | 5aa7d201-7dc9-454f-888e-55168d462ba3 | C3L-00812 | 1 | CPTAC-3_PrimaryTumor_male_white_1_htseq.counts... | Tumor | C3L.00812 |
245 | 45a2441f-95e1-43e8-a625-655be0e5d6e1 | C3N-00495 | 1 | CPTAC-3_PrimaryTumor_male_notreported_1_htseq.... | Tumor | C3N.00495 |
246 rows × 6 columns
In [3]:
# For each of the dataframes we want to make sure the full labels are "safe" from R
# So we'll update the RNAseq, protein and DNA methylation ones
#
# Every sample column is renamed to '<CondName>_<SafeCase>_<replicate>', where the
# replicate number starts at 1 and counts repeated (condition, case) pairs in order.
samples = sample_df['CondName']
cases = sample_df['SafeCases']

# Iterate over the ORIGINAL column labels (rna_cols) rather than
# sample_df['FullLabel']: this cell overwrites FullLabel at the end, so reading it
# back on a re-run would build an identity map and leave rna_df un-renamed.
if len(rna_cols) != len(sample_df):
    raise ValueError(
        f'rna_cols has {len(rna_cols)} entries but sample_df has {len(sample_df)} rows'
    )

rna_col_map = {}       # original column label -> new R-safe column name
replicate_counts = {}  # '<CondName>_<SafeCase>' -> number of samples seen so far
col_names = []
for orig_label, cond, case in zip(rna_cols, samples, cases):
    base_name = f'{cond}_{case}'
    # An explicit counter replaces parsing the previous name's last character,
    # which produced malformed names once a replicate number reached 10.
    replicate_counts[base_name] = replicate_counts.get(base_name, 0) + 1
    col_name = f'{base_name}_{replicate_counts[base_name]}'
    if replicate_counts[base_name] > 1:
        # Report every repeated (condition, case) pair.
        print(col_name)
    rna_col_map[orig_label] = col_name
    col_names.append(col_name)

rna_df = df_filtered_annot.rename(columns=rna_col_map)
sample_df['FullLabel'] = col_names
Tumor_C3N.00310_2 Tumor_C3N.00573_2 Tumor_C3N.00646_2 Tumor_C3N.00646_3 Tumor_C3N.01522_2 Tumor_C3L.00583_2 Tumor_C3L.01603_2 Tumor_C3N.00312_2 Tumor_C3N.00573_3 Tumor_C3N.00437_2 Tumor_C3N.00150_2 Tumor_C3N.00148_2 Tumor_C3N.00494_2 Tumor_C3L.01286_2 Tumor_C3N.00437_3 Tumor_C3N.01522_3 Tumor_C3L.00369_2 Tumor_C3L.01836_2 Tumor_C3N.00148_3 Tumor_C3N.01200_2 Tumor_C3N.00168_2 Tumor_C3N.00646_4 Tumor_C3L.01286_3 Tumor_C3L.00606_2 Tumor_C3N.01200_3 Tumor_C3N.00177_2 Tumor_C3N.00168_3 Tumor_C3L.00447_2 Tumor_C3L.00447_3 Tumor_C3N.00320_2 Tumor_C3N.01220_2 Tumor_C3N.00168_4 Tumor_C3N.00310_3 Tumor_C3N.00314_2 Tumor_C3L.01607_2 Tumor_C3N.00148_4 Tumor_C3L.01603_3 Tumor_C3N.01220_3 Tumor_C3L.00606_3 Tumor_C3L.00581_2 Tumor_C3N.00150_3 Tumor_C3L.00418_2 Tumor_C3N.01261_2 Tumor_C3N.00320_3 Tumor_C3N.00577_2 Tumor_C3L.00907_2 Tumor_C3N.00577_3 Tumor_C3N.00733_2 Tumor_C3N.00312_3 Tumor_C3N.00177_3 Tumor_C3N.00150_4 Tumor_C3L.00583_3 Tumor_C3N.00314_3 Tumor_C3N.00953_2 Tumor_C3L.00581_3 Tumor_C3N.01524_2 Tumor_C3L.00907_3 Tumor_C3N.00390_2 Tumor_C3N.01214_2 Tumor_C3L.01607_3 Tumor_C3N.01524_3 Tumor_C3N.00317_2 Tumor_C3N.00314_4 Tumor_C3N.00953_3 Tumor_C3L.01836_3 Tumor_C3N.00194_2 Tumor_C3N.00573_4 Tumor_C3L.00418_3 Tumor_C3N.00390_3 Tumor_C3N.00852_2 Tumor_C3L.01287_2 Tumor_C3L.00369_3 Tumor_C3N.00312_4 Tumor_C3N.00494_3 Tumor_C3N.01214_3 Tumor_C3N.01200_4
In [4]:
## Save to csv
clin_df = pd.read_csv(f'{output_dir}clinical_sircle.csv')

# Build the RNAseq clinical dataframe: every sample row picks up the clinical
# columns of its case. The left join keeps all samples; clinical columns whose
# name clashes with a sample column get a trailing '_'.
clinical_by_case = clin_df.set_index("case_id")
rna_sample_df = sample_df.set_index("CaseId").join(clinical_by_case, how="left", rsuffix='_')

## -------- Prefix the labels with 'RNA_' so they indicate the data assay
old_labels = rna_sample_df['FullLabel'].values
new_full_label = [f'RNA_{label}' for label in old_labels]
new_full_label_map = dict(zip(old_labels, new_full_label))

# Apply the new labels to both the sample table and the count table columns.
rna_sample_df['FullLabel'] = new_full_label
rna_df = rna_df.rename(columns=new_full_label_map)

# NOTE(review): rna_df is written together with its row index, so reading this
# file back without index_col yields an extra 'Unnamed: 0' column -- confirm
# that downstream consumers expect this.
rna_sample_df.to_csv(f'{output_dir}rna_sample_df_sircle.csv')
rna_df.to_csv(f'{output_dir}rna_df_sircle.csv')
Reload the saved sample dataframe¶
In [5]:
# Read the sample table back from disk, using the first CSV column as the index,
# and display it as the cell output.
rna_sample_df = pd.read_csv(f'{output_dir}rna_sample_df_sircle.csv', index_col=0)
rna_sample_df
Out[5]:
SampleId | CondId | FullLabel | CondName | SafeCases | gender | TumorStage | AgeGrouped | BMIGrouped | RaceGrouped | ... | CIMPStatus | GenomeInstability | VHL+TTN | VHL-TTN | VHL+PBRM1 | VHL-PBRM1 | PBRM1-VHL | VHL | TTN-VHL | TTN+PBRM1-VHL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
C3L-00004 | 92dc5293-3be9-4bca-9778-2b3922a1928f | 0 | RNA_Normal_C3L.00004_1 | Normal | C3L.00004 | Male | Stage III | old | normal | White | ... | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
C3L-00004 | dc9b8675-5dd3-42e8-8e38-bf59731c2fe8 | 1 | RNA_Tumor_C3L.00004_1 | Tumor | C3L.00004 | Male | Stage III | old | normal | White | ... | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
C3L-00010 | d5091a35-7012-48ed-a3ff-a79e7e41ff36 | 0 | RNA_Normal_C3L.00010_1 | Normal | C3L.00010 | Male | Stage I | young | between | White | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
C3L-00010 | 50e6d3b3-190e-44e7-bfd0-2489ba991204 | 1 | RNA_Tumor_C3L.00010_1 | Tumor | C3L.00010 | Male | Stage I | young | between | White | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
C3L-00011 | 6b60f8f0-8e63-4669-b923-3023190d52e7 | 0 | RNA_Normal_C3L.00011_1 | Normal | C3L.00011 | Female | Stage IV | old | between | White | ... | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
C3N-01649 | e7094723-0764-4f20-8ef6-00a948b06a74 | 1 | RNA_Tumor_C3N.01649_1 | Tumor | C3N.01649 | Male | Stage III | middle | obese | White | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
C3N-01651 | b917e31d-ae1a-4c22-bd60-61f1607afc4b | 0 | RNA_Normal_C3N.01651_1 | Normal | C3N.01651 | Male | Stage II | old | between | White | ... | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
C3N-01651 | 14305372-5bec-4f99-9f78-9177d5512a4b | 1 | RNA_Tumor_C3N.01651_1 | Tumor | C3N.01651 | Male | Stage II | old | between | White | ... | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
C3N-01808 | 31ff7d54-4042-44a1-9ff9-f0ce6a737483 | 0 | RNA_Normal_C3N.01808_1 | Normal | C3N.01808 | Male | Stage I | middle | between | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
C3N-01808 | 5fbf9bdc-c3bd-409a-9dc0-d63deb653e4a | 1 | RNA_Tumor_C3N.01808_1 | Tumor | C3N.01808 | Male | Stage I | middle | between | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
246 rows × 27 columns
Filter out non-ccRCC samples¶
In [6]:
# Case IDs to exclude from both tables.
non_ccrcc = ['C3L-00359', 'C3N-00313', 'C3N-00435', 'C3N-00492', 'C3N-00832', 'C3N-01175', 'C3N-01180']

# Start from the files on disk so this cell does not depend on in-memory state.
rna_df = pd.read_csv(f'{output_dir}rna_df_sircle.csv')
rna_sample_df = pd.read_csv(f'{output_dir}rna_sample_df_sircle.csv', index_col=0)

# The sample table is indexed by case ID; keep only rows whose case is not excluded.
is_excluded = rna_sample_df.index.isin(non_ccrcc)
rna_sample_df = rna_sample_df.loc[~is_excluded]
print(rna_sample_df)

# Columns without 'RNA' in their name are treated as non-sample (metadata) columns.
# NOTE(review): this includes the 'Unnamed: 0' column that read_csv creates if the
# file was saved with its index -- confirm whether it should be kept.
meta_cols = [c for c in rna_df.columns if 'RNA' not in c]
kept_sample_cols = rna_sample_df['FullLabel'].tolist()
rna_df = rna_df[meta_cols + kept_sample_cols]

rna_df.to_csv(f'{output_dir}rna_df_sircle_ccRCC.csv', index=False)
rna_sample_df.to_csv(f'{output_dir}rna_sample_df_sircle_ccRCC.csv')
SampleId CondId \ C3L-00004 92dc5293-3be9-4bca-9778-2b3922a1928f 0 C3L-00004 dc9b8675-5dd3-42e8-8e38-bf59731c2fe8 1 C3L-00010 d5091a35-7012-48ed-a3ff-a79e7e41ff36 0 C3L-00010 50e6d3b3-190e-44e7-bfd0-2489ba991204 1 C3L-00011 6b60f8f0-8e63-4669-b923-3023190d52e7 0 ... ... ... C3N-01649 e7094723-0764-4f20-8ef6-00a948b06a74 1 C3N-01651 b917e31d-ae1a-4c22-bd60-61f1607afc4b 0 C3N-01651 14305372-5bec-4f99-9f78-9177d5512a4b 1 C3N-01808 31ff7d54-4042-44a1-9ff9-f0ce6a737483 0 C3N-01808 5fbf9bdc-c3bd-409a-9dc0-d63deb653e4a 1 FullLabel CondName SafeCases gender TumorStage \ C3L-00004 RNA_Normal_C3L.00004_1 Normal C3L.00004 Male Stage III C3L-00004 RNA_Tumor_C3L.00004_1 Tumor C3L.00004 Male Stage III C3L-00010 RNA_Normal_C3L.00010_1 Normal C3L.00010 Male Stage I C3L-00010 RNA_Tumor_C3L.00010_1 Tumor C3L.00010 Male Stage I C3L-00011 RNA_Normal_C3L.00011_1 Normal C3L.00011 Female Stage IV ... ... ... ... ... ... C3N-01649 RNA_Tumor_C3N.01649_1 Tumor C3N.01649 Male Stage III C3N-01651 RNA_Normal_C3N.01651_1 Normal C3N.01651 Male Stage II C3N-01651 RNA_Tumor_C3N.01651_1 Tumor C3N.01651 Male Stage II C3N-01808 RNA_Normal_C3N.01808_1 Normal C3N.01808 Male Stage I C3N-01808 RNA_Tumor_C3N.01808_1 Tumor C3N.01808 Male Stage I AgeGrouped BMIGrouped RaceGrouped ... CIMPStatus \ C3L-00004 old normal White ... 1 C3L-00004 old normal White ... 1 C3L-00010 young between White ... 0 C3L-00010 young between White ... 0 C3L-00011 old between White ... 1 ... ... ... ... ... ... C3N-01649 middle obese White ... 0 C3N-01651 old between White ... 1 C3N-01651 old between White ... 1 C3N-01808 middle between White ... 0 C3N-01808 middle between White ... 0 GenomeInstability VHL+TTN VHL-TTN VHL+PBRM1 VHL-PBRM1 \ C3L-00004 0 0 1 1 0 C3L-00004 0 0 1 1 0 C3L-00010 0 0 1 0 1 C3L-00010 0 0 1 0 1 C3L-00011 1 1 0 0 1 ... ... ... ... ... ... 
C3N-01649 0 1 0 1 0 C3N-01651 1 1 0 1 0 C3N-01651 1 1 0 1 0 C3N-01808 0 0 0 0 0 C3N-01808 0 0 0 0 0 PBRM1-VHL VHL TTN-VHL TTN+PBRM1-VHL C3L-00004 0 0 0 0 C3L-00004 0 0 0 0 C3L-00010 0 1 0 0 C3L-00010 0 1 0 0 C3L-00011 0 0 0 0 ... ... ... ... ... C3N-01649 0 0 0 0 C3N-01651 0 0 0 0 C3N-01651 0 0 0 0 C3N-01808 0 0 0 0 C3N-01808 0 0 0 0 [239 rows x 27 columns]
In [ ]: