Notebook RCM Part 1 ccRCC Figure 2
Plot the SiRCle clusters nicely¶
Here we plot the SiRCle clusters and the results from the TF analysis.
In [1]:
import os

# Label identifying the cancer type this notebook works on.
cancer = "ClearCellRenalCellCarcinoma"

# Name of the regulatory-grouping setting.
regLabel = "RG2_Changes_filtered"

# Directory layout (all paths are relative to the notebook's working directory).
input_dir = "Input_RCM"
supp_dir = "Required_Refs"
fig_dir = "Output_Figures"
output_dir = "Output_Data"

# The ORA result tables are read from a sub-folder of the output directory.
ora_dir = os.path.join(output_dir, "ORA")
Make the visualisation aesthetic¶
In [2]:
###############################################################################
# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program. If not, see <http://www.gnu.org/licenses/>. #
# #
###############################################################################
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
from wordcloud import WordCloud
###############################################################################
# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program. If not, see <http://www.gnu.org/licenses/>. #
# #
###############################################################################
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import networkx as nx
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.patches import Rectangle
from matplotlib.legend_handler import HandlerBase
from matplotlib.colors import ListedColormap
from sciviso import Vis
# https://stackoverflow.com/questions/55501860/how-to-put-multiple-colormap-patches-in-a-matplotlib-legend
class HandlerColormap(HandlerBase):
    """Legend handler that draws a colormap as a row of adjacent colour stripes."""

    def __init__(self, cmap, num_stripes=8, **kw):
        """
        Parameters
        ----------
        cmap
            Callable mapping a float to a colour; it is sampled once per stripe.
        num_stripes
            Number of rectangles the legend entry is divided into.
        kw
            Forwarded unchanged to ``HandlerBase``.
        """
        super().__init__(**kw)
        self.cmap = cmap
        self.num_stripes = num_stripes

    def create_artists(self, legend, orig_handle,
                       xdescent, ydescent, width, height, fontsize, trans):
        """Return one borderless rectangle per stripe, laid out left to right."""
        n = self.num_stripes
        # Each stripe is width / n wide and takes the colormap value at the
        # centre of its slot, i.e. (2 * k + 1) / (2 * n).
        return [
            Rectangle([xdescent + k * width / n, ydescent],
                      width / n,
                      height,
                      fc=self.cmap((2 * k + 1) / (2 * n)),
                      transform=trans, linewidth=0)
            for k in range(n)
        ]
class Emapplot(Vis):
    """
    Network plot of enrichment terms.

    Every row of the dataframe becomes a node. Two nodes are connected when the
    gene lists in ``overlap_column`` share at least ``min_overlap`` entries.
    Node colour encodes the value in ``color_column`` and node size the value
    in ``size_column``.
    """

    # (upper bound, colour) pairs checked in order; the first bound the value is below wins.
    _PADJ_COLOURS = (
        (0.0001, "#065f46"),
        (0.001, "#059669"),
        (0.01, "#34d399"),
        (0.05, "#a7f3d0"),
    )
    # Used for values that fall below none of the bounds above (>= 0.05 or NaN).
    _PADJ_FALLBACK_COLOUR = "#d1d5db"

    # Same scheme for edges, keyed on the number of shared genes.
    _EDGE_COLOURS = (
        (10, "#bfbfbf"),
        (20, "#a6a6a6"),
        (30, "#808080"),
        (40, "#595959"),
        (50, "#333333"),
    )
    _EDGE_FALLBACK_COLOUR = "#0d0d0d"

    def __init__(self, df: pd.DataFrame, size_column='Count', color_column='p.adjust', id_column='ID',
                 label_column='Description', overlap_column='gene_id', overlap_sep='/', title='',
                 config=None):
        """
        Parameters
        ----------
        df
            One row per term.
        size_column
            Column used to scale the node size.
        color_column
            Column used to pick the node colour.
        id_column
            Column holding the node identifier.
        label_column
            Column holding the text drawn next to each node.
        overlap_column
            Column holding the ``overlap_sep``-separated gene list of each term.
        overlap_sep
            Separator used inside ``overlap_column``.
        title
            Stored on the instance; not drawn by ``build_graph``.
        config
            Optional style dictionary passed to ``load_style``. ``None`` (the
            default) and an empty dict both skip the call.
        """
        super().__init__(df)
        self.title = title
        self.size = size_column
        self.color = color_column
        self.id = id_column
        self.label = label_column
        self.overlap_column = overlap_column
        self.overlap_sep = overlap_sep
        if config:
            self.load_style(config)

    def _node_colour(self, p):
        """Return the colour for one ``color_column`` value."""
        for upper_bound, hex_colour in self._PADJ_COLOURS:
            if p < upper_bound:
                return hex_colour
        # Every node needs a colour, otherwise the colour list no longer lines up with the node list.
        return self._PADJ_FALLBACK_COLOUR

    def _edge_colour(self, shared):
        """Return the colour for an edge joining two terms with ``shared`` genes in common."""
        for upper_bound, hex_colour in self._EDGE_COLOURS:
            if shared < upper_bound:
                return hex_colour
        return self._EDGE_FALLBACK_COLOUR

    def build_graph(self, min_count, max_count, min_overlap, seed=None):
        """
        Build the term network and draw it on the current matplotlib axes.

        Side effects: prints ``min_overlap`` and the size column, and writes a
        ``norm_sized`` column onto ``self.df``.

        Parameters
        ----------
        min_count
            Currently unused; kept so existing callers keep working.
        max_count
            Value of ``size_column`` that maps to a node size of 100.
        min_overlap
            Minimum number of shared genes for two terms to be connected.
        seed
            Optional seed forwarded to ``nx.spring_layout``. The default
            ``None`` keeps the layout random.
        """
        G = nx.Graph()
        term_ids = list(self.df[self.id].values)
        gene_sets = [set(genes.split(self.overlap_sep)) for genes in self.df[self.overlap_column].values]
        print(min_overlap, 'min_overlap')

        # Visit each unordered pair of terms once and record the size of the gene overlap.
        edge_map = defaultdict(dict)
        for i in range(len(term_ids)):
            for j in range(i + 1, len(term_ids)):
                shared = len(gene_sets[i] & gene_sets[j])
                if shared >= min_overlap:
                    edge_map[term_ids[i]][term_ids[j]] = shared
        edges = [(source, target) for source, targets in edge_map.items() for target in targets]

        G.add_edges_from(edges)
        # Terms without any qualifying overlap still need a node so they get drawn.
        G.add_nodes_from(term_ids)

        # Node sizes are expressed relative to max_count.
        self.df["norm_sized"] = [int(100 * (c / max_count)) for c in self.df[self.size].values]
        print(self.df[self.size])

        node_colours = [self._node_colour(p) for p in self.df[self.color].values]
        # Exactly one colour per edge, in the same order as `edges`.
        edge_colours = [self._edge_colour(edge_map[source][target]) for source, target in edges]

        # A layout is needed up front because nodes and edges are drawn in separate calls.
        pos = nx.spring_layout(G, k=1, seed=seed)
        # Colours and sizes are in dataframe row order, while the graph stores nodes in
        # insertion order; passing nodelist keeps each colour/size on its own term.
        nx.draw_networkx_nodes(G, pos, nodelist=term_ids, node_color=node_colours,
                               node_size=self.df["norm_sized"].values)
        if edges:
            nx.draw_networkx_edges(G, pos, edgelist=edges, alpha=0.8, edge_color=edge_colours, width=0.5,
                                   arrows=False)

        labels = dict(zip(term_ids, self.df[self.label].values))
        # axis_font_size is not set in this class; presumably provided by Vis / load_style.
        nx.draw_networkx_labels(G, pos, labels, font_size=self.axis_font_size, font_color='black',
                                font_family='sans-serif', verticalalignment='bottom', clip_on=False)
        plt.axis("off")
def plot_cluster_ORA(filename, gene_ratio='GeneRatio', count_column='Count', padj='p.adjust', overlap_column='geneID',
                     id_column='ID', label_column='Description', gene_ratio_min=0.05, padj_max=0.05, title='',
                     label_font_size=9, figsize=(3, 3), axis_font_size=6, min_count=20, max_count=200, min_overlap=4,
                     save_fig=True):
    """
    Plot a term network and a gene word cloud for one ORA result table.

    The table is read from ``filename``, filtered on gene ratio and adjusted
    p-value, and plotted only if more than one term is left.

    Parameters
    ----------
    filename
        Path to the CSV file. Output names are derived by removing ".csv" and
        appending "_Network.svg" / "_WordCloud.svg".
    gene_ratio
        Column holding the gene ratio as a "numerator/denominator" string.
    count_column
        Column used for the node size.
    padj
        Column used for the node colour and for filtering.
    overlap_column
        Column holding the "/"-separated genes of each term.
    id_column
        Column holding the term identifier.
    label_column
        Column holding the term label.
    gene_ratio_min
        Rows are kept only when their gene ratio is strictly greater than this.
    padj_max
        Rows are kept only when their ``padj`` value is strictly smaller than this.
    title
        Title drawn above the network.
    label_font_size
        Forwarded to the plot style configuration.
    figsize
        Forwarded to the plot style configuration and set as the matplotlib
        default figure size for the figures drawn here.
    axis_font_size
        Forwarded to the plot style configuration.
    min_count
        Forwarded to ``Emapplot.build_graph``.
    max_count
        Forwarded to ``Emapplot.build_graph``.
    min_overlap
        Forwarded to ``Emapplot.build_graph``.
    save_fig
        When true, both figures are written as SVG next to the input file.

    Returns
    -------
    None
    """
    df = pd.read_csv(f'{filename}')

    # Convert the "numerator/denominator" strings into floats.
    ratio_parts = [raw.split('/') for raw in df[gene_ratio].values]
    df[gene_ratio] = [float(parts[0]) / float(parts[1]) for parts in ratio_parts]

    # Copy so that later column writes on the filtered table do not touch a view of the unfiltered one.
    df = df[(df[gene_ratio] > gene_ratio_min) & (df[padj] < padj_max)].copy()
    if len(df) <= 1:
        # Not enough terms to draw a network from.
        return

    # Configure matplotlib before anything is drawn or saved, so the settings
    # apply to the figures of this call rather than only to those of the next one.
    plt.rcParams['svg.fonttype'] = 'none'  # Ensure text is saved as text
    plt.rcParams['figure.figsize'] = figsize

    out_prefix = filename.replace(".csv", "")

    eplot = Emapplot(df, size_column=count_column, color_column=padj, id_column=id_column,
                     label_column=label_column, overlap_column=overlap_column, overlap_sep='/', title=title,
                     config={'figsize': figsize, 'label_font_size': label_font_size,
                             'axis_font_size': axis_font_size})
    # Honour the caller's thresholds instead of hard-coded values.
    eplot.build_graph(min_overlap=min_overlap, min_count=min_count, max_count=max_count)
    plt.title(title, fontsize=18, fontweight='bold')
    # Call the setter; assigning to it would only replace the method with a bool.
    plt.gca().set_clip_on(False)
    if save_fig:
        plt.savefig(f'{out_prefix}_Network.svg', bbox_inches='tight', transparent=True)
    plt.show()

    # Circular mask for the word cloud: 255 outside a radius-130 disc centred in a 300x300 grid.
    x, y = np.ogrid[:300, :300]
    mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
    mask = 255 * mask.astype(int)

    # Count in how many terms each gene occurs (genes can appear in several terms).
    wordfeqs = defaultdict(int)
    for genes in df[overlap_column].values:
        for w in genes.split('/'):
            w = w.replace(' ', '.')
            wordfeqs[w] += 1
    # Scale every count by the number of distinct genes.
    total_words = len(wordfeqs)
    for w in wordfeqs:
        wordfeqs[w] = wordfeqs[w] / total_words

    wordcloud = WordCloud(background_color="white", mask=mask, repeat=False).generate_from_frequencies(wordfeqs)
    wordcloud_svg = wordcloud.to_svg(embed_font=True)
    if save_fig:
        # The context manager closes the file even if the write fails.
        with open(f'{out_prefix}_WordCloud.svg', "w+") as f:
            f.write(wordcloud_svg)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
In [3]:
import os

# os.listdir returns entries in arbitrary order; sort so that the figures are
# produced in the same order on every run and on every machine.
files = sorted(os.listdir(ora_dir))
# Keep the RCM cluster summaries and skip the SVG figures written next to them.
cluster_files = [c for c in files if 'ClusterGoSummary' in c and '.svg' not in c and '_RCM' in c]

for c in cluster_files:
    # These cluster name combinations get the small panel size; everything else the larger one.
    small_panel = ('TPDE' in c and 'TMDS' in c) or (('MDS' in c or 'TPDS' in c) and 'TMDE' in c)
    panel_size = (1.5, 1.5) if small_panel else (2.5, 2.5)
    # min_overlap is stated explicitly; 2 is the value reported in this cell's recorded output.
    plot_cluster_ORA(os.path.join(ora_dir, c), figsize=panel_size, min_overlap=2)
2 min_overlap 0 38 1 24 2 21 3 21 4 39 5 19 6 27 7 19 8 27 9 20 11 23 12 22 13 20 14 21 15 27 16 20 17 28 20 30 22 27 24 20 25 24 27 26 29 28 30 20 31 27 32 27 34 27 3500 24 3501 23 3860 19 3881 22 3887 20 Name: Count, dtype: int64
2 min_overlap 0 27 3 15 4 28 5 16 8 22 14 22 15 23 16 18 18 15 19 20 20 24 21 20 23 16 26 17 50 19 52 19 82 19 133 16 3434 21 3435 21 3444 15 3445 18 Name: Count, dtype: int64
2 min_overlap 0 131 1 117 2 83 3 82 4 83 5 98 6 75 7 82 8 85 9 71 10 55 11 55 12 103 13 83 14 92 15 64 16 63 17 97 19 92 20 76 21 72 24 71 28 86 31 80 33 59 35 74 37 68 48 53 50 50 52 50 54 66 55 64 58 56 59 66 67 64 71 65 72 52 73 51 94 58 171 50 185 49 4205 264 4206 260 4207 181 4208 80 4209 80 4210 88 4211 66 4212 61 4215 60 4216 86 4217 77 4218 61 4224 51 4659 68 4665 63 Name: Count, dtype: int64
2 min_overlap 6 32 7 34 8 34 23 36 34 34 44 35 93 32 4099 33 4523 52 4524 53 4525 53 4526 49 4527 41 4528 40 4534 32 4535 40 Name: Count, dtype: int64
2 min_overlap 0 25 1 27 2 34 3 36 5 25 28 27 36 29 38 25 40 26 47 28 58 26 62 28 69 26 4504 27 4505 25 4506 25 Name: Count, dtype: int64
2 min_overlap 0 20 1 14 3 12 4 8 5 13 6 8 7 8 8 8 9 15 10 14 11 13 12 13 15 13 16 13 18 12 20 10 21 9 28 10 29 11 34 11 36 10 2133 16 2134 16 2135 14 2136 15 2137 16 2142 11 2152 11 2396 8 2397 9 2398 8 2400 13 2405 8 2406 9 Name: Count, dtype: int64
2 min_overlap 0 83 1 66 2 62 3 50 4 62 5 37 6 31 7 27 8 30 9 39 10 40 15 32 17 32 18 38 19 36 21 30 24 34 29 24 30 34 35 32 36 30 39 30 49 25 78 23 136 23 3421 58 3422 42 3423 44 3426 24 3430 24 3431 24 3436 29 3443 24 3795 25 3801 23 3802 28 3803 35 3804 24 3837 25 Name: Count, dtype: int64
2 min_overlap 0 56 1 59 2 45 3 58 4 48 .. 4307 24 4672 31 4673 31 4675 31 4689 25 Name: Count, Length: 182, dtype: int64
2 min_overlap 0 19 1 17 2 16 3 16 4 21 .. 2788 11 2795 11 3050 15 3051 11 3053 15 Name: Count, Length: 86, dtype: int64
2 min_overlap 0 101 1 95 2 86 3 70 4 71 ... 5308 59 5309 58 5317 47 5321 48 5322 48 Name: Count, Length: 64, dtype: int64
In [ ]:
In [ ]: