Notebook for RNA processing part 2

N07_RNAProcessing

Second notebook on RNA processing¶

This one is run after we have generated the clinical information used to divide the tumour and normal samples.

In [1]:

from scidat.api import API, APIException
from sciutil import SciUtil
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# --- Notebook configuration: directories and options used by the cells below ---
# All data paths hang off a single base directory.
base_dir = '../data/'
data_dir = base_dir + 'raw_downloads/CPTAC/'
output_dir = base_dir + 'sircle/F1_DE_input_TvN/'  # inputs and outputs of this notebook live here
supp_dir = base_dir + 'raw_downloads/supps/'
fig_dir = '../figures/'

# presumably the name of the gene-symbol column in the RNA table -- TODO confirm
gene_name = 'hgnc_symbol'
# Flag for writing figures to disk (off by default)
save_fig = False

In [2]:

# Load the filtered RNA table written to `output_dir`.
df_filtered_annot = pd.read_csv(f'{output_dir}CCRCC_Clark_Cell2019_rna_filtered_df.csv')

# Sample columns are the ones carrying the 'CPT' marker. Every per-sample list
# below is derived from this single list so that they always have the same
# length and order (previously two different filters, 'CP' and 'CPT', were used,
# which could produce lists of different lengths).
rna_cols = [c for c in df_filtered_annot.columns if 'CPT' in c]
# Case ID: first 9 characters of the second-to-last '_'-separated field.
rna_cases = [c.split('_')[-2][:9] for c in rna_cols]
# Sample ID: last '_'-separated field.
rna_ids = [c.split('_')[-1] for c in rna_cols]

cond_name = []
cond_ids = []
for c in rna_cols:
    if 'Tumor' in c:
        cond_name.append('Tumor')
        cond_ids.append(1)
    elif 'Normal' in c:
        cond_name.append('Normal')
        cond_ids.append(0)
    else:
        # Unrecognised condition: report it and keep the second label field as
        # the name. The ID is recorded as missing so that cond_ids stays aligned
        # with the other lists instead of silently ending up one entry short.
        unknown_cond = c.split('_')[1]
        cond_name.append(unknown_cond)
        cond_ids.append(None)
        print(unknown_cond)

# R does not accept '-' in names, so keep a '.'-separated copy of the case IDs.
r_safe_case_ids = [c.replace('-', '.') for c in rna_cases]

# One row per RNA sample column; column order is kept as before.
sample_df = pd.DataFrame({
    'SampleId': rna_ids,
    'CaseId': rna_cases,
    'CondId': cond_ids,
    'FullLabel': rna_cols,
    'CondName': cond_name,
    'SafeCases': r_safe_case_ids,
})
sample_df

Out[2]:

	SampleId	CaseId	CondId	FullLabel	CondName	SafeCases
0	b00d920a-f927-438d-af55-40b062523be4	C3N-01648	0	CPTAC-3_SolidTissueNormal_male_notreported_1_h...	Normal	C3N.01648
1	47c8a523-5d19-4f3f-8451-44543c00bbc1	C3N-00646	0	CPTAC-3_SolidTissueNormal_female_notreported_1...	Normal	C3N.00646
2	1ed67f72-25c2-4d1d-be14-55135cd55edc	C3N-01220	0	CPTAC-3_SolidTissueNormal_male_notreported_1_h...	Normal	C3N.01220
3	e6e0146a-1efd-4e5e-bb7d-89442ba8cb99	C3L-00814	0	CPTAC-3_SolidTissueNormal_male_notreported_1_h...	Normal	C3L.00814
4	ac2f5fa6-c1d1-443d-800c-1452b5b9a85c	C3L-00447	0	CPTAC-3_SolidTissueNormal_male_white_1_htseq.c...	Normal	C3L.00447
...	...	...	...	...	...	...
241	ca9f82fb-5cfa-4b46-ba53-6a7ae5037e57	C3N-00494	1	CPTAC-3_PrimaryTumor,PrimaryTumor_None_None_No...	Tumor	C3N.00494
242	11a60b10-37f0-4513-8f42-2826de79e344	C3N-01214	1	CPTAC-3_PrimaryTumor_male_notreported_1_htseq....	Tumor	C3N.01214
243	ac17a546-5ff3-4583-adf7-44efd0bfbadd	C3N-01200	1	CPTAC-3_PrimaryTumor_female_notreported_1_htse...	Tumor	C3N.01200
244	5aa7d201-7dc9-454f-888e-55168d462ba3	C3L-00812	1	CPTAC-3_PrimaryTumor_male_white_1_htseq.counts...	Tumor	C3L.00812
245	45a2441f-95e1-43e8-a625-655be0e5d6e1	C3N-00495	1	CPTAC-3_PrimaryTumor_male_notreported_1_htseq....	Tumor	C3N.00495

246 rows × 6 columns

In [3]:

# For each of the dataframes we want to make sure the full labels are "safe" from R
# So we'll update the RNAseq, protein and DNA methylation ones
#
# New column names have the form '<CondName>_<SafeCase>_<replicate>', where the
# replicate number starts at 1 and increases each time the same condition/case
# pair is seen again.
samples = sample_df['CondName']
cases = sample_df['SafeCases']

# The map is keyed on the original column names (`rna_cols`, which was used to
# build sample_df) rather than on sample_df['FullLabel'], because FullLabel is
# overwritten at the end of this cell. Keying on it would make a second run of
# the cell map new names to new names and leave rna_df with the raw labels.
assert len(rna_cols) == len(sample_df), "rna_cols and sample_df are out of sync"

rna_col_map = {}      # original column name -> R-safe column name
name_map = {}         # '<base>_1' -> latest name issued for that base; each issued name -> itself
col_names = []        # R-safe names in sample_df row order
replicate_counts = {} # '<CondName>_<SafeCase>' -> number of samples seen so far
for c, cond, case in zip(rna_cols, samples.values, cases.values):
    base = f'{cond}_{case}'
    # Count replicates explicitly; deriving the number from the last character
    # of the previous name breaks once a case has 10 or more replicates.
    replicate = replicate_counts.get(base, 0) + 1
    replicate_counts[base] = replicate
    col_name = f'{base}_{replicate}'
    if replicate > 1:
        # Report every duplicated condition/case pair
        print(col_name)
    name_map[f'{base}_1'] = col_name
    name_map[col_name] = col_name
    rna_col_map[c] = col_name
    col_names.append(col_name)

# Every sample must end up with a distinct column name
assert len(set(col_names)) == len(col_names), "duplicate R-safe column names generated"

rna_df = df_filtered_annot.rename(columns=rna_col_map)
sample_df['FullLabel'] = col_names

Tumor_C3N.00310_2
Tumor_C3N.00573_2
Tumor_C3N.00646_2
Tumor_C3N.00646_3
Tumor_C3N.01522_2
Tumor_C3L.00583_2
Tumor_C3L.01603_2
Tumor_C3N.00312_2
Tumor_C3N.00573_3
Tumor_C3N.00437_2
Tumor_C3N.00150_2
Tumor_C3N.00148_2
Tumor_C3N.00494_2
Tumor_C3L.01286_2
Tumor_C3N.00437_3
Tumor_C3N.01522_3
Tumor_C3L.00369_2
Tumor_C3L.01836_2
Tumor_C3N.00148_3
Tumor_C3N.01200_2
Tumor_C3N.00168_2
Tumor_C3N.00646_4
Tumor_C3L.01286_3
Tumor_C3L.00606_2
Tumor_C3N.01200_3
Tumor_C3N.00177_2
Tumor_C3N.00168_3
Tumor_C3L.00447_2
Tumor_C3L.00447_3
Tumor_C3N.00320_2
Tumor_C3N.01220_2
Tumor_C3N.00168_4
Tumor_C3N.00310_3
Tumor_C3N.00314_2
Tumor_C3L.01607_2
Tumor_C3N.00148_4
Tumor_C3L.01603_3
Tumor_C3N.01220_3
Tumor_C3L.00606_3
Tumor_C3L.00581_2
Tumor_C3N.00150_3
Tumor_C3L.00418_2
Tumor_C3N.01261_2
Tumor_C3N.00320_3
Tumor_C3N.00577_2
Tumor_C3L.00907_2
Tumor_C3N.00577_3
Tumor_C3N.00733_2
Tumor_C3N.00312_3
Tumor_C3N.00177_3
Tumor_C3N.00150_4
Tumor_C3L.00583_3
Tumor_C3N.00314_3
Tumor_C3N.00953_2
Tumor_C3L.00581_3
Tumor_C3N.01524_2
Tumor_C3L.00907_3
Tumor_C3N.00390_2
Tumor_C3N.01214_2
Tumor_C3L.01607_3
Tumor_C3N.01524_3
Tumor_C3N.00317_2
Tumor_C3N.00314_4
Tumor_C3N.00953_3
Tumor_C3L.01836_3
Tumor_C3N.00194_2
Tumor_C3N.00573_4
Tumor_C3L.00418_3
Tumor_C3N.00390_3
Tumor_C3N.00852_2
Tumor_C3L.01287_2
Tumor_C3L.00369_3
Tumor_C3N.00312_4
Tumor_C3N.00494_3
Tumor_C3N.01214_3
Tumor_C3N.01200_4

In [4]:

## Build the RNAseq clinical dataframe, tag the labels with the assay, and save to csv
clin_df = pd.read_csv(f'{output_dir}clinical_sircle.csv')
# Left-join the clinical table onto the samples by case ID; clinical columns
# whose names clash with sample columns get a trailing '_'.
clin_by_case = clin_df.set_index("case_id")
rna_sample_df = sample_df.set_index("CaseId").join(clin_by_case, how="left", rsuffix='_')

## -------- RNA set the labels to be indicative of the data assay
joined_labels = rna_sample_df['FullLabel'].values
new_full_label = [f'RNA_{label}' for label in joined_labels]
new_full_label_map = dict(zip(joined_labels, new_full_label))

# Apply the prefixed labels to both the sample sheet and the count table columns
rna_sample_df['FullLabel'] = new_full_label
rna_df = rna_df.rename(columns=new_full_label_map)

rna_sample_df.to_csv(f'{output_dir}rna_sample_df_sircle.csv')
rna_df.to_csv(f'{output_dir}rna_df_sircle.csv')

Do the same for the sample df¶

In [5]:

# Read the saved sample sheet back from disk (case IDs become the index) and display it
sample_sheet_path = f'{output_dir}rna_sample_df_sircle.csv'
rna_sample_df = pd.read_csv(sample_sheet_path, index_col=0)
rna_sample_df

Out[5]:

	SampleId	CondId	FullLabel	CondName	SafeCases	gender	TumorStage	AgeGrouped	BMIGrouped	RaceGrouped	...	CIMPStatus	GenomeInstability	VHL+TTN	VHL-TTN	VHL+PBRM1	VHL-PBRM1	PBRM1-VHL	VHL	TTN-VHL	TTN+PBRM1-VHL
C3L-00004	92dc5293-3be9-4bca-9778-2b3922a1928f	0	RNA_Normal_C3L.00004_1	Normal	C3L.00004	Male	Stage III	old	normal	White	...	1	0	0	1	1	0	0	0	0	0
C3L-00004	dc9b8675-5dd3-42e8-8e38-bf59731c2fe8	1	RNA_Tumor_C3L.00004_1	Tumor	C3L.00004	Male	Stage III	old	normal	White	...	1	0	0	1	1	0	0	0	0	0
C3L-00010	d5091a35-7012-48ed-a3ff-a79e7e41ff36	0	RNA_Normal_C3L.00010_1	Normal	C3L.00010	Male	Stage I	young	between	White	...	0	0	0	1	0	1	0	1	0	0
C3L-00010	50e6d3b3-190e-44e7-bfd0-2489ba991204	1	RNA_Tumor_C3L.00010_1	Tumor	C3L.00010	Male	Stage I	young	between	White	...	0	0	0	1	0	1	0	1	0	0
C3L-00011	6b60f8f0-8e63-4669-b923-3023190d52e7	0	RNA_Normal_C3L.00011_1	Normal	C3L.00011	Female	Stage IV	old	between	White	...	1	1	1	0	0	1	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
C3N-01649	e7094723-0764-4f20-8ef6-00a948b06a74	1	RNA_Tumor_C3N.01649_1	Tumor	C3N.01649	Male	Stage III	middle	obese	White	...	0	0	1	0	1	0	0	0	0	0
C3N-01651	b917e31d-ae1a-4c22-bd60-61f1607afc4b	0	RNA_Normal_C3N.01651_1	Normal	C3N.01651	Male	Stage II	old	between	White	...	1	1	1	0	1	0	0	0	0	0
C3N-01651	14305372-5bec-4f99-9f78-9177d5512a4b	1	RNA_Tumor_C3N.01651_1	Tumor	C3N.01651	Male	Stage II	old	between	White	...	1	1	1	0	1	0	0	0	0	0
C3N-01808	31ff7d54-4042-44a1-9ff9-f0ce6a737483	0	RNA_Normal_C3N.01808_1	Normal	C3N.01808	Male	Stage I	middle	between	White	...	0	0	0	0	0	0	0	0	0	0
C3N-01808	5fbf9bdc-c3bd-409a-9dc0-d63deb653e4a	1	RNA_Tumor_C3N.01808_1	Tumor	C3N.01808	Male	Stage I	middle	between	White	...	0	0	0	0	0	0	0	0	0	0

246 rows × 27 columns

Filter out non-ccRCC samples¶

In [6]:

# Case IDs to exclude from the ccRCC outputs
non_ccrcc = ['C3L-00359', 'C3N-00313', 'C3N-00435', 'C3N-00492', 'C3N-00832', 'C3N-01175', 'C3N-01180']

# Start from the files saved earlier in the notebook
rna_df = pd.read_csv(f'{output_dir}rna_df_sircle.csv')
rna_sample_df = pd.read_csv(f'{output_dir}rna_sample_df_sircle.csv', index_col=0)

# Keep only the rows whose case ID (the index) is not in the exclusion list
is_ccrcc = ~rna_sample_df.index.isin(non_ccrcc)
rna_sample_df = rna_sample_df.loc[is_ccrcc]
print(rna_sample_df)

# Non-sample columns are those without 'RNA' in their name; keep them, followed
# by the sample columns that survived the filter (in sample-sheet order).
meta_cols = [c for c in rna_df.columns if 'RNA' not in c]
kept_sample_cols = rna_sample_df['FullLabel'].tolist()
rna_df = rna_df.loc[:, meta_cols + kept_sample_cols]

rna_df.to_csv(f'{output_dir}rna_df_sircle_ccRCC.csv', index=False)

rna_sample_df.to_csv(f'{output_dir}rna_sample_df_sircle_ccRCC.csv')

                                       SampleId  CondId  \
C3L-00004  92dc5293-3be9-4bca-9778-2b3922a1928f       0   
C3L-00004  dc9b8675-5dd3-42e8-8e38-bf59731c2fe8       1   
C3L-00010  d5091a35-7012-48ed-a3ff-a79e7e41ff36       0   
C3L-00010  50e6d3b3-190e-44e7-bfd0-2489ba991204       1   
C3L-00011  6b60f8f0-8e63-4669-b923-3023190d52e7       0   
...                                         ...     ...   
C3N-01649  e7094723-0764-4f20-8ef6-00a948b06a74       1   
C3N-01651  b917e31d-ae1a-4c22-bd60-61f1607afc4b       0   
C3N-01651  14305372-5bec-4f99-9f78-9177d5512a4b       1   
C3N-01808  31ff7d54-4042-44a1-9ff9-f0ce6a737483       0   
C3N-01808  5fbf9bdc-c3bd-409a-9dc0-d63deb653e4a       1   

                        FullLabel CondName  SafeCases  gender TumorStage  \
C3L-00004  RNA_Normal_C3L.00004_1   Normal  C3L.00004    Male  Stage III   
C3L-00004   RNA_Tumor_C3L.00004_1    Tumor  C3L.00004    Male  Stage III   
C3L-00010  RNA_Normal_C3L.00010_1   Normal  C3L.00010    Male    Stage I   
C3L-00010   RNA_Tumor_C3L.00010_1    Tumor  C3L.00010    Male    Stage I   
C3L-00011  RNA_Normal_C3L.00011_1   Normal  C3L.00011  Female   Stage IV   
...                           ...      ...        ...     ...        ...   
C3N-01649   RNA_Tumor_C3N.01649_1    Tumor  C3N.01649    Male  Stage III   
C3N-01651  RNA_Normal_C3N.01651_1   Normal  C3N.01651    Male   Stage II   
C3N-01651   RNA_Tumor_C3N.01651_1    Tumor  C3N.01651    Male   Stage II   
C3N-01808  RNA_Normal_C3N.01808_1   Normal  C3N.01808    Male    Stage I   
C3N-01808   RNA_Tumor_C3N.01808_1    Tumor  C3N.01808    Male    Stage I   

          AgeGrouped BMIGrouped RaceGrouped  ...  CIMPStatus  \
C3L-00004        old     normal       White  ...           1   
C3L-00004        old     normal       White  ...           1   
C3L-00010      young    between       White  ...           0   
C3L-00010      young    between       White  ...           0   
C3L-00011        old    between       White  ...           1   
...              ...        ...         ...  ...         ...   
C3N-01649     middle      obese       White  ...           0   
C3N-01651        old    between       White  ...           1   
C3N-01651        old    between       White  ...           1   
C3N-01808     middle    between       White  ...           0   
C3N-01808     middle    between       White  ...           0   

           GenomeInstability  VHL+TTN  VHL-TTN  VHL+PBRM1  VHL-PBRM1  \
C3L-00004                  0        0        1          1          0   
C3L-00004                  0        0        1          1          0   
C3L-00010                  0        0        1          0          1   
C3L-00010                  0        0        1          0          1   
C3L-00011                  1        1        0          0          1   
...                      ...      ...      ...        ...        ...   
C3N-01649                  0        1        0          1          0   
C3N-01651                  1        1        0          1          0   
C3N-01651                  1        1        0          1          0   
C3N-01808                  0        0        0          0          0   
C3N-01808                  0        0        0          0          0   

           PBRM1-VHL  VHL  TTN-VHL  TTN+PBRM1-VHL  
C3L-00004          0    0        0              0  
C3L-00004          0    0        0              0  
C3L-00010          0    1        0              0  
C3L-00010          0    1        0              0  
C3L-00011          0    0        0              0  
...              ...  ...      ...            ...  
C3N-01649          0    0        0              0  
C3N-01651          0    0        0              0  
C3N-01651          0    0        0              0  
C3N-01808          0    0        0              0  
C3N-01808          0    0        0              0  

[239 rows x 27 columns]

In [ ]: