Notebook for Filtering CpGs to Genes¶
import os
import pandas as pd
import seaborn as sns
from sciviso import Scatterplot
from scircm import SciRCM
from scivae import Vis
import matplotlib.pyplot as plt
import os
import pandas as pd
from collections import defaultdict
from sciutil import SciUtil
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
base_dir = '../data/'
data_dir = f'{base_dir}sircle/F2_DE_output_TvN/'
output_dir = f'{base_dir}sircle/F3_regulatory_clustering/'
fig_dir = '../figures/'
supp_dir = f'{base_dir}raw_downloads/supps/'
gene_name = 'hgnc_symbol'
save_fig = False
test_title = 'all_patients_ccRCC'
Filter DNA methylation genes¶
Here we want to assign 1 CpG to each gene (i.e. we want to identify the "driver" CpG).
We determine this by first finding CpGs that are in agreement in terms of direction of regulation.
1) Group CpGs that fall within 1000 bp of the TSS
2) If there are at least 3 CpGs then check that at least 60% of CpGs argree in terms of direction of change (and have a pvalue < 0.1)
3) If they do then retain the CpG with the largest DNA methylation difference (in the direction that 60% agreed on)
4) If there were less than 3 CpGs just keep the CpG with the largest DNA methylation difference.
`
from scircm import filter_methylation_data_by_genes
u = SciUtil()
meth_df = pd.read_csv(f'{data_dir}cpg_DE_{test_title}_sircle.csv')
# t value = t, adj.P.Val = adjusted P, cpg_id = CpG id, logFC=logfold change
# Only include those with a adj p value < 0.1
run_meth_filter = True
# Replace inf
# replace logFC col neg and +ve inf
meth_df['logFC_meth'].replace([np.inf],max(meth_df['logFC_meth'].replace(np.inf, np.nan)),inplace=True)
meth_df['logFC_meth'].replace([-np.inf],min(meth_df['logFC_meth'].replace(-np.inf, np.nan)),inplace=True)
normal_cols = [c for c in meth_df.columns if 'Normal' in c]
tumor_cols = [c for c in meth_df.columns if 'Tumor' in c]
beta_diff = np.mean(meth_df[tumor_cols].values, axis=1) - np.mean(meth_df[normal_cols].values, axis=1)
meth_df['CpG_Beta_diff'] = beta_diff
# Filter the methylation data to only
meth_f_df = filter_methylation_data_by_genes(meth_df, 'ensembl_gene_id', 'padj_meth', 'CpG_Beta_diff')
-------------------------------------------------------------------------------- Originally had: 25434 genes. Filtered DF now has: 21377 genes. --------------------------------------------------------------------------------
meth_f_df.to_csv(f'{data_dir}filtered_cpg_DE_{test_title}_sircle.csv', index=False)
plt.hist(meth_f_df['logFC_meth'], bins=20)
plt.title('Methylation logFC M value')
plt.show()
mean_change = np.mean(meth_f_df[[c for c in meth_f_df.columns if 'Tumor' in c]].values, axis=1) - np.mean(meth_f_df[[c for c in meth_f_df.columns if 'Normal' in c]].values, axis=1)
plt.hist(mean_change, bins=20)
plt.title('Methylation mean beta difference')
plt.show()
plt.scatter(meth_f_df["CpG_Beta_diff"].values, meth_f_df["logFC_meth"].values)
plt.title('Methylation beta difference vs methylation M fold change')
plt.show()
Look at how many genes had multiple CpGs¶
gene_counts = meth_df['ensembl_gene_id'].value_counts()
print("Number of genes with less than 3 CpGs:", len(gene_counts[gene_counts < 3]),
"\nNumber of genes with between 3 and 20 CpGs:", len(gene_counts[gene_counts > 3]),
"\nNumber of genes with greater than 20 CpGs:", len(gene_counts[gene_counts > 20])) #6713 15848 1400
# As an example, show the number of genes for a given one
meth_df[meth_df['external_gene_name'] == 'CA9']
Number of genes with less than 3 CpGs: 6713 Number of genes with between 3 and 20 CpGs: 15848 Number of genes with greater than 20 CpGs: 1400
Unnamed: 0 | Locus | chr | pos | ensembl_gene_id | external_gene_name | hgnc_symbol | entrezgene_id | Relation_to_Island | UCSC_RefGene_Group | ... | CpG_Normal_d781be9c.7b3b.4acf.b202.d4d4b847db05_1 | CpG_Normal_62bcce15.fdaf.49d4.9bed.24a493f5776b_1 | CpG_Normal_e3ed1380.d6f1.420e.b6a6.d24a6e81a364_1 | CpG_Normal_c92995d5.68ce.47d0.8149.0323e624c032_1 | CpG_Normal_f29b6c8c.d713.42ad.9b90.e556df9b05cd_1 | CpG_Normal_cb371398.ee48.4665.8089.26229c5b2cf0_1 | CpG_Normal_ac092a8e.80af.4589.8bb8.d86427b398ca_1 | CpG_Normal_318f6ffb.1fdf.4f74.90db.21e337503aae_1 | CpG_Normal_ef9ae1dd.83aa.404d.83ef.ced707ae738b_1 | CpG_Beta_diff | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
53104 | cg06908460 | cg06908460 | chr9 | 35676107 | ENSG00000107159 | CA9 | CA9 | 768 | Island | Body | ... | 0.071525 | 0.073432 | 0.056048 | 0.061662 | 0.057470 | 0.061828 | 0.089296 | 0.060925 | 0.075332 | 0.013317 |
71975 | cg09566069 | cg09566069 | chr9 | 35676368 | ENSG00000107159 | CA9 | CA9 | 768 | Island | Body | ... | 0.188227 | 0.189941 | 0.207914 | 0.189651 | 0.196477 | 0.184108 | 0.335658 | 0.183858 | 0.252410 | 0.056579 |
100054 | cg13849253 | cg13849253 | chr9 | 35676169 | ENSG00000107159 | CA9 | CA9 | 768 | Island | Body | ... | 0.041222 | 0.067208 | 0.049146 | 0.054657 | 0.035332 | 0.081421 | 0.117577 | 0.047883 | 0.069222 | 0.002398 |
105670 | cg14563831 | cg14563831 | chr9 | 35679278 | ENSG00000107159 | CA9 | CA9 | 768 | S_Shelf | Body | ... | 0.890417 | 0.940949 | 0.922212 | 0.921867 | 0.934773 | 0.919804 | 0.938456 | 0.893177 | 0.923882 | -0.021328 |
135379 | cg19257550 | cg19257550 | chr9 | 35673912 | ENSG00000107159 | CA9 | CA9 | 768 | N_Shore | TSS200 | ... | 0.545057 | 0.590555 | 0.494968 | 0.602210 | 0.731530 | 0.526126 | 0.595478 | 0.549169 | 0.736935 | -0.038873 |
143713 | cg20610181 | cg20610181 | chr9 | 35674003 | ENSG00000107159 | CA9 | CA9 | 768 | N_Shore | 1stExon | ... | 0.812362 | 0.863635 | 0.851431 | 0.850729 | 0.945040 | 0.831214 | 0.802052 | 0.889678 | 0.895687 | -0.363275 |
6 rows × 272 columns
Save as a bed file so we can easily see which one we chose using IGV¶
# Save the locus, chr, pos, and gene name to a CSV (along with cpgBetaDiff) to use for the peak files
meth_to_save = meth_df[['Locus', 'chr', 'pos', 'external_gene_name', 'CpG_Beta_diff', 'padj_meth']]
meth_to_save['Sig'] = ['*' if c < 0.05 else '' for c in meth_df['padj_meth'].values]
loci = {}
for l in meth_f_df['Locus'].values:
loci[l] = True
chosen=[]
for l in meth_to_save['Locus'].values:
if loci.get(l):
chosen.append('+')
else:
chosen.append(" ")
meth_to_save['Chosen'] = chosen
meth_to_save['Name'] = meth_to_save['Chosen'] + meth_to_save['Locus'] + '(' + meth_to_save['Sig'] + meth_to_save['external_gene_name'] + ')'
meth_to_save.to_csv(f'{data_dir}cpg_DE_{test_title}_sircle_locus_info.csv')
meth_f_df[['Locus', 'chr', 'pos', 'external_gene_name', 'CpG_Beta_diff', 'padj_meth']].to_csv(f'{data_dir}cpg_filtered_DE_{test_title}_sircle_locus_info.csv')
# Just using this to convert to a bed easily.
from scie2g.csv import Csv, Epi2GeneException
# We read in the file and then convert it to a BED file so that we can load it in IGV
filepath = f'{data_dir}cpg_DE_{test_title}_sircle_locus_info.csv'
df = pd.read_csv(filepath)
f = Csv(filepath, 'chr', 'pos', 'pos', 'CpG_Beta_diff', ['Locus', 'external_gene_name'], sep=',')
f.convert_to_bed(df, f'{output_dir}cpg_DE_all_patients_ccRCC_sircle_locus.bed',
'CPTAC TCGA CpG', None, 'Name')
/Users/ariane/opt/miniconda3/envs/clean_ml/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy This is separate from the ipykernel package so we can avoid doing imports until /Users/ariane/opt/miniconda3/envs/clean_ml/lib/python3.6/site-packages/ipykernel_launcher.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy del sys.path[0] /Users/ariane/opt/miniconda3/envs/clean_ml/lib/python3.6/site-packages/ipykernel_launcher.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy app.launch_new_instance()
Plot the CpGs: supplementary figure¶
Here we want to plot the CpGs and how they map to genes, also report the total number of "up" and "down" CpGs.
"""
---------------------------------------------------------------
Plot and identify the largest direction of change for the CpGs
---------------------------------------------------------------
"""
plt.rcParams["figure.figsize"] = (3,3)
# Plot a scatter in grey of the original dataset
plt.scatter(meth_df['logFC_meth'].values, -1 * np.log10(meth_df['padj_meth'].values), c='grey', alpha=0.5, s=10)
# plot a scatter in colour for the filtered dataset
plt.scatter(meth_f_df['logFC_meth'].values, -1 * np.log10(meth_f_df['padj_meth'].values), c='#C2631D', s=10,
alpha=0.5)
plt.title('CpGs')
plt.xlabel('DNA methylation differece')
plt.ylabel('log10 padj')
if save_fig:
plt.savefig(f'{fig_dir}SFig1_CpGs_{test_title}.svg')
plt.show()
# Print the respective sizes of the datasets
u.dp(['Original number of CpGs: ', len(meth_df), ' Filtered number of CpGs: ', len(meth_f_df)])
plt.hist(meth_df['logFC_meth'].values, bins=20)
plt.title('logFC M values')
plt.show()
plt.hist(meth_df['logFC_meth'].values[np.where(abs(meth_df['logFC_meth'].values) > 1.0)], bins=20)
plt.title('logFC M values > 1')
plt.show()
-------------------------------------------------------------------------------- Original number of CpGs: 187770 Filtered number of CpGs: 21377 --------------------------------------------------------------------------------
"""
---------------------------------------------------------------
Plot and identify the largest direction of change for the CpGs
---------------------------------------------------------------
"""
plt.rcParams["figure.figsize"] = (3,3)
# Save the meth DF to csv
#meth_f_df.to_csv(meth_filtered_file, index=False)
plt.scatter(meth_f_df['CpG_Beta_diff'].values, -1 * np.log10(meth_f_df['padj_meth'].values), c='orange', alpha=0.5, s=10)
# plot a scatter in colour for the filtered dataset
plt.title('CpGs')
plt.xlabel('DNA methylation betadiff_meth')
plt.ylabel('log10 padj')
if save_fig:
plt.savefig(f'{fig_dir}SFig1_CpGs_beta_{meth_test_title}.svg')
plt.show()
# Print the respective sizes of the datasets
u.dp(['Original number of CpGs: ', len(meth_df), ' Filtered number of CpGs: ', len(meth_f_df)])
-------------------------------------------------------------------------------- Original number of CpGs: 187770 Filtered number of CpGs: 21377 --------------------------------------------------------------------------------
m_cutoff = 1.0
# Print out the number of hypo and hyper methylated genes
u.dp(['Original Hyper-methylated:', len(meth_df[meth_df['logFC_meth'].values > m_cutoff]),
'Original Hypo-methylated:',len(meth_df[meth_df['logFC_meth'].values < -m_cutoff])])
u.warn_p(['Filtered Hyper-methylated:', len(meth_f_df[meth_f_df['logFC_meth'].values > m_cutoff]),
'Filtered Hypo-methylated:', len(meth_f_df[meth_f_df['logFC_meth'].values < -m_cutoff])])
u.dp(['Percentage All Hyper-methylated:', 100*(len(meth_df[meth_df['logFC_meth'].values > m_cutoff])/len(meth_df)),
'Percentage All Hypo-methylated:', 100*(len(meth_df[meth_df['logFC_meth'].values < -m_cutoff])/len(meth_df))])
u.warn_p(['Percentage Filtered Hyper-methylated:', 100*(len(meth_f_df[meth_f_df['logFC_meth'].values > m_cutoff])/len(meth_f_df)),
'Percentage Filtered Hypo-methylated:', 100*(len(meth_f_df[meth_f_df['logFC_meth'].values < -m_cutoff])/len(meth_f_df))])
-------------------------------------------------------------------------------- Original Hyper-methylated: 15306 Original Hypo-methylated: 31140 -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Filtered Hyper-methylated: 2474 Filtered Hypo-methylated: 9450 -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Percentage All Hyper-methylated: 8.151461894871385 Percentage All Hypo-methylated: 16.584118868828888 -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Percentage Filtered Hyper-methylated: 11.573186134630678 Percentage Filtered Hypo-methylated: 44.20639004537587 --------------------------------------------------------------------------------
m_cutoff = 0.5
# Print out the number of hypo and hyper methylated genes
u.dp(['Original Hyper-methylated:', len(meth_df[meth_df['CpG_Beta_diff'].values > m_cutoff]),
'Original Hypo-methylated:',len(meth_df[meth_df['CpG_Beta_diff'].values < -m_cutoff])])
u.warn_p(['Filtered Hyper-methylated:', len(meth_f_df[meth_f_df['CpG_Beta_diff'].values > m_cutoff]),
'Filtered Hypo-methylated:', len(meth_f_df[meth_f_df['CpG_Beta_diff'].values < -m_cutoff])])
u.dp(['Percentage All Hyper-methylated:', 100*(len(meth_df[meth_df['CpG_Beta_diff'].values > m_cutoff])/len(meth_df)),
'Percentage All Hypo-methylated:', 100*(len(meth_df[meth_df['CpG_Beta_diff'].values < -m_cutoff])/len(meth_df))])
u.warn_p(['Percentage Filtered Hyper-methylated:', 100*(len(meth_f_df[meth_f_df['CpG_Beta_diff'].values > m_cutoff])/len(meth_f_df)),
'Percentage Filtered Hypo-methylated:', 100*(len(meth_f_df[meth_f_df['CpG_Beta_diff'].values < -m_cutoff])/len(meth_f_df))])
-------------------------------------------------------------------------------- Original Hyper-methylated: 20 Original Hypo-methylated: 66 -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Filtered Hyper-methylated: 7 Filtered Hypo-methylated: 54 -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Percentage All Hyper-methylated: 0.010651328753261968 Percentage All Hypo-methylated: 0.0351493848857645 -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- Percentage Filtered Hyper-methylated: 0.03274547410768583 Percentage Filtered Hypo-methylated: 0.2526079431164336 --------------------------------------------------------------------------------
Correlation between the input features: Figure 1 A¶
Plot the correlation between the input features of the datasets.
from scipy.stats import pearsonr
prot_df = pd.read_csv(f'{data_dir}prot_DE_{test_title}_sircle.csv')
rna_df = pd.read_csv(f'{data_dir}rna_DE_{test_title}_sircle.csv')
genes = list(set(prot_df['ensembl_gene_id'].values)&set(rna_df['ensembl_gene_id'].values))
v_r = rna_df[rna_df.set_index('ensembl_gene_id').index.isin(genes)].sort_values('ensembl_gene_id')
v_p = prot_df[prot_df.set_index('ensembl_gene_id').index.isin(genes)].sort_values('ensembl_gene_id')
v_p.drop_duplicates(subset='ensembl_gene_id', inplace=True)
v_r.drop_duplicates(subset='ensembl_gene_id', inplace=True)
plt.scatter( v_p['logFC_protein'].values, v_r['logFC_rna'].values, alpha=0.2, c='k')
corr, p_corr = pearsonr( v_p['logFC_protein'].values, v_r['logFC_rna'].values)
plt.title(f'RNA and protein correlation: {round(corr, 2)} {round(p_corr, 2)}')
plt.xlabel('Protein logFC')
plt.ylabel('RNA logFC')
if save_fig:
plt.savefig(f'{fig_dir}Fig1A_RNA_Protein_corr.svg')
# Print out the pearson correlation
u.dp(['Pearson correlation between RNAseq and Protein: ', corr, p_corr])
-------------------------------------------------------------------------------- Pearson correlation between RNAseq and Protein: 0.6363288467228314 0.0 --------------------------------------------------------------------------------
plt.hist(rna_df['logFC_rna'].values, bins=20)
plt.title('RNA logFC')
plt.show()
plt.hist(prot_df['logFC_protein'].values, bins=20)
plt.title('Protein logFC')
plt.show()
Test copula correlation¶
print(np.std(v_p['logFC_protein'].values), np.std(v_r['logFC_rna'].values))
0.4894415774554938 1.2030189680500991
from scipy.stats import norm
rna_cdf = norm.cdf(v_r['logFC_rna'].values, 0, 1.4)
prot_cdf = norm.cdf(v_p['logFC_protein'].values, 0, 0.5)
plt.scatter(rna_cdf, prot_cdf, alpha=0.2, c='k')
corr, p_corr = pearsonr(rna_cdf, prot_cdf)
plt.title(f'RNA and protein correlation: {round(corr, 2)} {round(p_corr, 2)}')
plt.xlabel('RNA logFC')
plt.ylabel('Protein logFC')
u.dp(['Pearson correlation between RNAseq and Protein: ', corr, p_corr])
plt.show()
-------------------------------------------------------------------------------- Pearson correlation between RNAseq and Protein: 0.6709383907979878 0.0 --------------------------------------------------------------------------------
Combine RNA and methylation data¶
genes = list(set(meth_df['external_gene_name'].values)&set(rna_df['external_gene_name'].values))
v_r = rna_df[rna_df.set_index('external_gene_name').index.isin(genes)].sort_values('external_gene_name')
v_p = meth_df[meth_df.set_index('external_gene_name').index.isin(genes)].sort_values('external_gene_name')
rna_meth = v_p.join(v_r, lsuffix='', rsuffix='_')
rna_meth
Unnamed: 0 | Locus | chr | pos | ensembl_gene_id | external_gene_name | hgnc_symbol | entrezgene_id | Relation_to_Island | UCSC_RefGene_Group | ... | RNA_Tumor_C3N.01524_3 | RNA_Normal_C3N.01646_1 | RNA_Tumor_C3N.01646_1 | RNA_Normal_C3N.01648_1 | RNA_Normal_C3N.01649_1 | RNA_Tumor_C3N.01649_1 | RNA_Normal_C3N.01651_1 | RNA_Tumor_C3N.01651_1 | RNA_Normal_C3N.01808_1 | RNA_Tumor_C3N.01808_1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
166341 | cg24393602 | cg24393602 | chr6 | 52995606 | ENSG00000202198 | 7SK | 0 | 0 | S_Shore | TSS1500 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
66855 | cg08830754 | cg08830754 | chr6 | 52995436 | ENSG00000202198 | 7SK | 0 | 0 | S_Shore | TSS200 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
155112 | cg22568540 | cg22568540 | chr19 | 58353480 | ENSG00000121410 | A1BG | A1BG | 1 | N_Shelf | Body;5'UTR;1stExon | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
124590 | cg17498272 | cg17498272 | chr19 | 58353563 | ENSG00000121410 | A1BG | A1BG | 1 | N_Shelf | Body;TSS200 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
59196 | cg07739758 | cg07739758 | chr19 | 58354001 | ENSG00000121410 | A1BG | A1BG | 1 | N_Shelf | Body;TSS1500 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
134548 | cg19105961 | cg19105961 | chr17 | 4142929 | ENSG00000074755 | ZZEF1 | ZZEF1 | 23140 | Island | TSS1500;5'UTR;1stExon;TSS1500;TSS1500 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
19873 | cg02459389 | cg02459389 | chr17 | 4019718 | ENSG00000074755 | ZZEF1 | ZZEF1 | 23140 | OpenSea | Body | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
11060 | cg01341891 | cg01341891 | chr1 | 77682042 | ENSG00000036549 | ZZZ3 | ZZZ3 | 26009 | N_Shore | 5'UTR | ... | 7.532377 | 6.027899 | 7.85286 | 6.769411 | 5.845572 | 8.083454 | 7.000563 | 8.019076 | 7.010768 | 7.782185 |
167499 | cg24577193 | cg24577193 | chr1 | 77682711 | ENSG00000036549 | ZZZ3 | ZZZ3 | 26009 | Island | TSS200 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
153157 | cg22221847 | cg22221847 | chr1 | 77565502 | ENSG00000036549 | ZZZ3 | ZZZ3 | 26009 | OpenSea | 3'UTR | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
161704 rows × 522 columns
plt.scatter(rna_meth['logFC_meth'].values, rna_meth['logFC_rna'].values, alpha=0.2, c='k')
rna_meth = rna_meth.dropna()
corr, p_corr = pearsonr(rna_meth['logFC_meth'].values, rna_meth['logFC_rna'].values)
plt.title(f'RNA and Meth correlation: {round(corr, 2)} {round(p_corr, 2)}')
plt.xlabel('DNA Methylation difference')
plt.ylabel('RNA logFC')
if save_fig:
plt.savefig(f'{fig_dir}Fig1A_RNA_Meth_corr_{meth_test_title}.svg')
# Print out the pearson correlation
u.dp(['Pearson correlation between RNAseq and DNA methylation: ', corr, p_corr])
-------------------------------------------------------------------------------- Pearson correlation between RNAseq and DNA methylation: -0.0039863098067630925 0.6272710460579092 --------------------------------------------------------------------------------
plt.hist(rna_meth['logFC_meth'].values, bins=20)
plt.show()
genes = list(set(meth_f_df['ensembl_gene_id'].values)&set(rna_df['ensembl_gene_id'].values))
rna_df.set_index('ensembl_gene_id', inplace=True)
v_r = rna_df[rna_df.index.isin(genes)].sort_values('ensembl_gene_id')
meth_f_df.set_index('ensembl_gene_id', inplace=True)
v_p = meth_f_df[meth_f_df.index.isin(genes)].sort_values('ensembl_gene_id')
rna_meth = v_p.join(v_r, lsuffix='', rsuffix='_')
rna_meth
plt.scatter(v_p['CpG_Beta_diff'].values, v_r['logFC_rna'].values, alpha=0.2, c='k')
corr, p_corr = pearsonr(v_p['CpG_Beta_diff'].values, v_r['logFC_rna'].values)
plt.title(f'RNA and Meth correlation: {round(corr, 2)} {round(p_corr, 2)}')
plt.xlabel('DNA Methylation difference')
plt.ylabel('RNA logFC')
if save_fig:
plt.savefig(f'{fig_dir}Fig1A_RNA_Meth_Beta_corr_{meth_test_title}.svg')
# Print out the pearson correlation
u.dp(['Pearson correlation between RNAseq and DNA methylation Beta: ', corr, p_corr])
plt.show()
# Also do M value
plt.scatter(v_p['logFC_meth'].values, v_r['logFC_rna'].values, alpha=0.2, c='k')
corr, p_corr = pearsonr(v_p['CpG_Beta_diff'].values, v_r['logFC_rna'].values)
plt.title(f'RNA and Meth M value correlation: {round(corr, 2)} {round(p_corr, 2)}')
plt.xlabel('DNA Methylation difference')
plt.ylabel('RNA logFC')
-------------------------------------------------------------------------------- Pearson correlation between RNAseq and DNA methylation Beta: -0.172699114287582 3.3929936065564195e-114 --------------------------------------------------------------------------------
Text(0, 0.5, 'RNA logFC')