Notebook RCM Part 1 PanCan Figure 2
Plot the SiRCle clusters nicely¶
Here we just want to plot the SiRCle clusters and plot the results from the TF analysis
In [4]:
import os
cancer = 'PanCan'
input_dir = 'Input_RCM'
output_dir = 'Output_Data'
supp_dir = 'Required_Refs'
fig_dir = 'Output_Figures'
regLabel = 'RG2_Changes_filtered'
ora_dir = os.path.join(output_dir, 'ORA')
Make visualisation asthetic¶
In [5]:
###############################################################################
# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program. If not, see <http://www.gnu.org/licenses/>. #
# #
###############################################################################
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
from wordcloud import WordCloud
###############################################################################
# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program. If not, see <http://www.gnu.org/licenses/>. #
# #
###############################################################################
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import networkx as nx
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.patches import Rectangle
from matplotlib.legend_handler import HandlerBase
from matplotlib.colors import ListedColormap
from sciviso import Vis
# https://stackoverflow.com/questions/55501860/how-to-put-multiple-colormap-patches-in-a-matplotlib-legend
class HandlerColormap(HandlerBase):
def __init__(self, cmap, num_stripes=8, **kw):
HandlerBase.__init__(self, **kw)
self.cmap = cmap
self.num_stripes = num_stripes
def create_artists(self, legend, orig_handle,
xdescent, ydescent, width, height, fontsize, trans):
stripes = []
for i in range(self.num_stripes):
s = Rectangle([xdescent + i * width / self.num_stripes, ydescent],
width / self.num_stripes,
height,
fc=self.cmap((2 * i + 1) / (2 * self.num_stripes)),
transform=trans, linewidth=0)
stripes.append(s)
return stripes
class Emapplot(Vis):
def __init__(self, df: pd.DataFrame, size_column='Count', color_column='p.adjust', id_column='ID',
label_column='Description', overlap_column='gene_id', overlap_sep='/', title='',
config={}):
super().__init__(df)
self.title=title
self.size = size_column
self.color = color_column
self.id = id_column
self.label = label_column
self.overlap_column = overlap_column
self.overlap_sep = overlap_sep
if config:
self.load_style(config)
def build_graph(self, min_count, max_count, min_overlap):
"""
Builds a graph from the dataframe from R
:return:
"""
G = nx.Graph()
node_cmap = 'viridis'
edge_cmap = 'Greys'
edge_map = defaultdict(dict)
gene_ids = self.df[self.overlap_column].values
gene_ids = [set(genes.split(self.overlap_sep)) for genes in gene_ids] # Turn it into a list
all_genes = 0
for g in gene_ids:
all_genes += len(g)
#min_overlap = int(0.01*all_genes) if int(0.01*all_genes) < 20 else 20
print(min_overlap, 'min_overlap')
# Want to iterate over and get the maps between the two
overlapping_numbers = []
for i, id_i in enumerate(self.df[self.id].values):
for j, id_j in enumerate(self.df[self.id].values):
if i != j:
if edge_map.get(id_j):
if edge_map[id_j].get(id_i):
continue
else:
overlapping_genes = len(gene_ids[i] & gene_ids[j])
if overlapping_genes >= min_overlap:
edge_map[id_i][id_j] = overlapping_genes
overlapping_numbers.append(overlapping_genes)
else:
overlapping_genes = len(gene_ids[i] & gene_ids[j])
if overlapping_genes >= min_overlap:
edge_map[id_i][id_j] = overlapping_genes
overlapping_numbers.append(overlapping_genes)
edges = []
for node1 in edge_map:
for node2 in edge_map[node1]:
edges.append((node1, node2))
seen_nodes = []
edge_groups = defaultdict(list)
for node_from, node_to_lst in edge_map.items():
if node_from not in seen_nodes:
# Now we want to traverse the graph visiting each node
for node in node_to_lst:
if node not in edge_groups[node_from] and node not in seen_nodes:
edge_groups[node_from].append(node)
seen_nodes.append(node)
if edge_map.get(node) and node not in seen_nodes:
for node2 in edge_map.get(node):
edge_groups[node_from].append(node2)
seen_nodes.append(node2)
seen_nodes.append(node_from)
edge_groups[node_from].append(node_from)
G.add_edges_from(edges)
nodes = G.nodes()
# Check that all nodes have been added and if not add them
nodes_to_add = [node_id for node_id in self.df[self.id].values if node_id not in nodes]
for node in nodes_to_add:
G.add_node(node)
edge_groups[node].append(node) # So that we actually draw it!
# Now we want a list of node sizes and colours
#mins = np.min(self.df[self.size].values)
#maxs = np.max(self.df[self.size].values)
#norms = maxs - mins
counts =[100*(c/max_count) for c in self.df[self.size].values] # [10 + (100 * (max_count - c)/(max_count - min_count)) for c in self.df[self.size].values]
self.df["norm_sized"] = [int(100*(c/max_count)) for c in self.df[self.size].values]
print(self.df[self.size])
colour = []
for p in self.df[self.color].values:
if p < 0.0001:
colour.append("#065f46")
elif p < 0.001:
colour.append("#059669")
elif p < 0.01:
colour.append("#34d399")
elif p < 0.05:
colour.append("#a7f3d0")
# Colour the edges by the number of genes shared between the nodes
edge_values = [edge_map[edge[0]][edge[1]] for edge in edges]
lut = dict(zip(set(edge_values), sns.dark_palette("#d1d5db", len(set(edge_values)), reverse=True)))
edge_cmap = ListedColormap(sns.dark_palette("#d1d5db", len(set(edge_values)), reverse=True))
edge_colours = [] #pd.DataFrame(edge_values)[0].map(lut).values
edge_alphas = []
for c in edge_values:
if c < 10:
edge_colours.append("#bfbfbf")
elif c < 20:
edge_colours.append("#a6a6a6")
elif c < 30:
edge_colours.append("#808080")
elif c < 40:
edge_colours.append("#595959")
elif c < 50:
edge_colours.append("#333333")
else:
edge_colours.append("#0d0d0d")
edge_colours.append(lut[c])
# Need to create a layout when doing
# separate calls to draw nodes and edges
pos = nx.spring_layout(G, k=1) #nx.kamada_kawai_layout(G) # nx.spring_layout(G, k=2) #
nx.draw_networkx_nodes(G, pos, node_color=colour, node_size=self.df["norm_sized"].values) #self.df[self.size].values)
labels = dict(zip(self.df[self.id].values, self.df[self.label].values))
nx.draw_networkx_edges(G, pos, edgelist=edges, alpha=0.8, edge_color=edge_colours, width=0.5, arrows=False) #, edge_color=edge_colours, arrows=False)
# Plot the small labels and then for each "cluster" plot the smallest GO ID this should
# correspond to the "top" term.
# https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.clique.find_cliques.html#networkx.algorithms.clique.find_cliques
cliques = nx.find_cliques(G)
labels_to_draw = {}
gene_numbers = dict(zip(self.df[self.id].values, self.df[self.size].values))
small_labels = {}
for go in labels:
if not labels_to_draw.get(go):
small_labels[go] = labels[go]
nx.draw_networkx_labels(G, pos, small_labels, font_size=self.axis_font_size, font_color='black',
font_family='sans-serif', verticalalignment='bottom', clip_on=False)
plt.axis("off")
def plot_cluster_ORA(filename, gene_ratio='GeneRatio', count_column='Count', padj='p.adjust', overlap_column='geneID',
id_column='ID', label_column='Description', gene_ratio_min=0.05, padj_max=0.05, title='',
label_font_size=9, figsize=(3, 3), axis_font_size=6, min_count=20, max_count=200, min_overlap=4,
save_fig=True):
"""
Parameters
----------
filename
gene_ratio
count_column
padj
overlap_column
id_column
label_column
gene_ratio_min
padj_max
title
label_font_size
figsize
axis_font_size
min_count
max_count
min_overlap
save_fig
Returns
-------
"""
df = pd.read_csv(f'{filename}')
# Convert gene ratio to a number
gr = df[gene_ratio].values
gene_ratios = []
for g in gr:
g = g.split('/')
g0 = float(g[0])
g1 = float(g[1])
gene_ratios.append(g0 / g1)
df[gene_ratio] = gene_ratios
df = df[df[gene_ratio] > gene_ratio_min]
df = df[df[padj] < padj_max]
if len(df) > 1:
eplot = Emapplot(df, size_column=count_column, color_column=padj, id_column=id_column,
label_column=label_column, overlap_column=overlap_column, overlap_sep='/', title=title,
config={'figsize': figsize, 'label_font_size': label_font_size,
'axis_font_size': axis_font_size})
eplot.build_graph(min_overlap=2, min_count=5, max_count=200)
plt.title(title, fontsize=18, fontweight='bold')
plt.gca().set_clip_on = False
if save_fig:
plt.savefig(f'{filename.replace(".csv", "")}_Network.svg', bbox_inches='tight', transparent=True)
plt.show()
x, y = np.ogrid[:300, :300]
plt.rcParams['svg.fonttype'] = 'none' # Ensure text is saved as text
plt.rcParams['figure.figsize'] = figsize
mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)
wordfeqs = defaultdict(int)
for g in df[overlap_column].values:
for w in g.split('/'):
w = w.replace(' ', '.')
wordfeqs[w] += 1
total_words = len(wordfeqs)
for w in wordfeqs:
wordfeqs[w] = wordfeqs[w] / total_words
# Compute the frequency of each word (since there are duplicates sometimes...)
wordcloud = WordCloud(background_color="white", mask=mask, repeat=False).generate_from_frequencies(wordfeqs)
wordcloud_svg = wordcloud.to_svg(embed_font=True)
if save_fig:
f = open(f'{filename.replace(".csv", "")}_WordCloud.svg', "w+")
f.write(wordcloud_svg)
f.close()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
In [6]:
import os
files = os.listdir(ora_dir)
cluster_files = [c for c in files if 'ClusterGoSummary' in c]
for c in cluster_files:
if '.svg' not in c and '_RCM' in c:
if 'TPDE' in c and 'TMDS' in c:
plot_cluster_ORA(os.path.join(ora_dir, c), figsize=(1.5, 1.5))
elif ('MDS' in c or 'TPDS' in c) and 'TMDE' in c:
plot_cluster_ORA(os.path.join(ora_dir, c), figsize=(1.5, 1.5))
else:
plot_cluster_ORA(os.path.join(ora_dir, c), figsize=(2.5, 2.5))
2 min_overlap 0 25 1 22 2 22 3 25 4 23 5 24 6 19 7 16 8 16 9 9 11 16 12 16 13 16 21 9 22 9 26 9 42 11 46 10 49 12 60 13 2406 10 2408 15 2649 9 2654 11 Name: Count, dtype: int64
2 min_overlap 0 9 1 9 2 11 3 7 4 7 .. 2155 10 2156 6 2157 5 2158 5 2160 5 Name: Count, Length: 100, dtype: int64
2 min_overlap 0 43 1 43 2 43 3 27 4 35 .. 3803 19 3805 20 3810 21 3828 22 3862 20 Name: Count, Length: 72, dtype: int64
2 min_overlap 0 43 1 37 3 38 10 31 37 33 42 32 47 30 57 30 61 32 4305 41 4308 32 4309 32 4325 32 4724 37 4733 35 4734 35 4736 32 Name: Count, dtype: int64
2 min_overlap 0 21 1 19 2 33 3 27 4 24 5 34 6 27 7 30 8 18 9 17 10 14 11 14 15 17 26 14 42 14 47 14 52 17 61 13 64 14 116 13 121 13 122 14 123 14 3150 24 3151 22 Name: Count, dtype: int64
2 min_overlap 0 5 1 2 2 3 3 2 4 2 5 2 6 4 833 2 834 2 Name: Count, dtype: int64
2 min_overlap 0 21 1 21 2 21 3 15 4 10 .. 2792 10 3029 20 3030 22 3036 13 3045 10 Name: Count, Length: 64, dtype: int64
2 min_overlap 0 25 1 25 2 25 3 19 4 10 .. 2099 5 2100 7 2102 5 2105 5 2108 5 Name: Count, Length: 87, dtype: int64
2 min_overlap 0 6 1 8 2 8 3 8 4 6 .. 1555 3 1556 2 1557 4 1558 3 1559 3 Name: Count, Length: 117, dtype: int64
2 min_overlap 0 44 1 49 2 50 3 36 5 35 9 30 50 29 58 34 64 31 4034 36 4036 33 4462 31 4464 30 4466 30 Name: Count, dtype: int64
In [ ]:
In [ ]: