Notebook RCM Part 1 PanCan Figure 2

Figure2

Plot the SiRCle clusters nicely¶

Here we just want to plot the SiRCle clusters and plot the results from the TF analysis

In [4]:

import os

cancer = 'PanCan'
input_dir = 'Input_RCM'
output_dir = 'Output_Data'
supp_dir = 'Required_Refs'
fig_dir = 'Output_Figures'
regLabel = 'RG2_Changes_filtered'

ora_dir = os.path.join(output_dir, 'ORA')

Make visualisation asthetic¶

In [5]:

###############################################################################
#                                                                             #
#    This program is free software: you can redistribute it and/or modify     #
#    it under the terms of the GNU General Public License as published by     #
#    the Free Software Foundation, either version 3 of the License, or        #
#    (at your option) any later version.                                      #
#                                                                             #
#    This program is distributed in the hope that it will be useful,          #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
#    GNU General Public License for more details.                             #
#                                                                             #
#    You should have received a copy of the GNU General Public License        #
#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
#                                                                             #
###############################################################################

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
from wordcloud import WordCloud

###############################################################################
#                                                                             #
#    This program is free software: you can redistribute it and/or modify     #
#    it under the terms of the GNU General Public License as published by     #
#    the Free Software Foundation, either version 3 of the License, or        #
#    (at your option) any later version.                                      #
#                                                                             #
#    This program is distributed in the hope that it will be useful,          #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
#    GNU General Public License for more details.                             #
#                                                                             #
#    You should have received a copy of the GNU General Public License        #
#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
#                                                                             #
###############################################################################
import matplotlib as mpl

import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import networkx as nx
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.patches import Rectangle
from matplotlib.legend_handler import HandlerBase
from matplotlib.colors import ListedColormap

from sciviso import Vis

# https://stackoverflow.com/questions/55501860/how-to-put-multiple-colormap-patches-in-a-matplotlib-legend
class HandlerColormap(HandlerBase):
    def __init__(self, cmap, num_stripes=8, **kw):
        HandlerBase.__init__(self, **kw)
        self.cmap = cmap
        self.num_stripes = num_stripes
    def create_artists(self, legend, orig_handle,
                       xdescent, ydescent, width, height, fontsize, trans):
        stripes = []
        for i in range(self.num_stripes):
            s = Rectangle([xdescent + i * width / self.num_stripes, ydescent],
                          width / self.num_stripes,
                          height,
                          fc=self.cmap((2 * i + 1) / (2 * self.num_stripes)),
                          transform=trans, linewidth=0)
            stripes.append(s)
        return stripes


class Emapplot(Vis):

    def __init__(self, df: pd.DataFrame, size_column='Count', color_column='p.adjust', id_column='ID',
                 label_column='Description', overlap_column='gene_id', overlap_sep='/', title='',
                 config={}):
        super().__init__(df)
        self.title=title
        self.size = size_column
        self.color = color_column
        self.id = id_column
        self.label = label_column
        self.overlap_column = overlap_column
        self.overlap_sep = overlap_sep
        if config:
            self.load_style(config)

    def build_graph(self, min_count, max_count, min_overlap):
        """
        Builds a graph from the dataframe from R
        :return:
        """
        G = nx.Graph()
        node_cmap = 'viridis'
        edge_cmap = 'Greys'

        edge_map = defaultdict(dict)
        gene_ids = self.df[self.overlap_column].values
        gene_ids = [set(genes.split(self.overlap_sep)) for genes in gene_ids] # Turn it into a list
        all_genes = 0
        for g in gene_ids:
            all_genes += len(g)
        #min_overlap = int(0.01*all_genes) if int(0.01*all_genes) < 20 else 20
        print(min_overlap, 'min_overlap')
        # Want to iterate over and get the maps between the two
        overlapping_numbers = []
        for i, id_i in enumerate(self.df[self.id].values):
            for j, id_j in enumerate(self.df[self.id].values):
                if i != j:
                    if edge_map.get(id_j):
                        if edge_map[id_j].get(id_i):
                            continue
                        else:
                            overlapping_genes = len(gene_ids[i] & gene_ids[j])
                            if overlapping_genes >= min_overlap:
                                edge_map[id_i][id_j] = overlapping_genes
                                overlapping_numbers.append(overlapping_genes)
                    else:
                        overlapping_genes = len(gene_ids[i] & gene_ids[j])
                        if overlapping_genes >= min_overlap:
                            edge_map[id_i][id_j] = overlapping_genes
                            overlapping_numbers.append(overlapping_genes)
        edges = []
        for node1 in edge_map:
            for node2 in edge_map[node1]:
                edges.append((node1, node2))

        seen_nodes = []
        edge_groups = defaultdict(list)
        for node_from, node_to_lst in edge_map.items():
            if node_from not in seen_nodes:
                # Now we want to traverse the graph visiting each node
                for node in node_to_lst:
                    if node not in edge_groups[node_from] and node not in seen_nodes:
                        edge_groups[node_from].append(node)
                        seen_nodes.append(node)
                        if edge_map.get(node) and node not in seen_nodes:
                            for node2 in edge_map.get(node):
                                edge_groups[node_from].append(node2)
                                seen_nodes.append(node2)
                seen_nodes.append(node_from)
                edge_groups[node_from].append(node_from)

        G.add_edges_from(edges)
        nodes = G.nodes()
        # Check that all nodes have been added and if not add them
        nodes_to_add = [node_id for node_id in self.df[self.id].values if node_id not in nodes]
        for node in nodes_to_add:
            G.add_node(node)
            edge_groups[node].append(node) # So that we actually draw it!
        # Now we want a list of node sizes and colours
        #mins = np.min(self.df[self.size].values)
        #maxs = np.max(self.df[self.size].values)
        #norms = maxs - mins
        counts =[100*(c/max_count) for c in self.df[self.size].values] # [10 + (100 * (max_count - c)/(max_count - min_count)) for c in self.df[self.size].values]
        self.df["norm_sized"] = [int(100*(c/max_count)) for c in self.df[self.size].values]
        print(self.df[self.size])
        colour = []
        for p in self.df[self.color].values:
            if p < 0.0001:
               colour.append("#065f46")
            elif p < 0.001:
                colour.append("#059669")
            elif p < 0.01:
                colour.append("#34d399")
            elif p < 0.05:
                colour.append("#a7f3d0")
        # Colour the edges by the number of genes shared between the nodes
        edge_values = [edge_map[edge[0]][edge[1]] for edge in edges]


        lut = dict(zip(set(edge_values), sns.dark_palette("#d1d5db", len(set(edge_values)), reverse=True)))
        edge_cmap = ListedColormap(sns.dark_palette("#d1d5db", len(set(edge_values)), reverse=True))
        edge_colours = [] #pd.DataFrame(edge_values)[0].map(lut).values
        edge_alphas = []
        for c in edge_values:
            if c < 10:
                edge_colours.append("#bfbfbf")
            elif c < 20:
                edge_colours.append("#a6a6a6")
            elif c < 30:
                edge_colours.append("#808080")
            elif c < 40:
                edge_colours.append("#595959")
            elif c < 50:
                edge_colours.append("#333333")
            else:
                edge_colours.append("#0d0d0d")
            edge_colours.append(lut[c])
        # Need to create a layout when doing
        # separate calls to draw nodes and edges
        pos = nx.spring_layout(G,  k=1) #nx.kamada_kawai_layout(G) # nx.spring_layout(G,  k=2) #
        nx.draw_networkx_nodes(G, pos, node_color=colour, node_size=self.df["norm_sized"].values) #self.df[self.size].values)

        labels = dict(zip(self.df[self.id].values, self.df[self.label].values))
        nx.draw_networkx_edges(G, pos, edgelist=edges, alpha=0.8, edge_color=edge_colours, width=0.5, arrows=False) #, edge_color=edge_colours, arrows=False)
        # Plot the small labels and then for each "cluster" plot the smallest GO ID this should
        # correspond to the "top" term.
        # https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.clique.find_cliques.html#networkx.algorithms.clique.find_cliques
        cliques = nx.find_cliques(G)
        labels_to_draw = {}
        gene_numbers = dict(zip(self.df[self.id].values, self.df[self.size].values))


        small_labels = {}
        for go in labels:
            if not labels_to_draw.get(go):
                small_labels[go] = labels[go]
        nx.draw_networkx_labels(G, pos, small_labels, font_size=self.axis_font_size, font_color='black',
                                font_family='sans-serif', verticalalignment='bottom', clip_on=False)

        plt.axis("off")

def plot_cluster_ORA(filename, gene_ratio='GeneRatio', count_column='Count', padj='p.adjust', overlap_column='geneID',
                     id_column='ID', label_column='Description', gene_ratio_min=0.05, padj_max=0.05, title='',
                     label_font_size=9, figsize=(3, 3), axis_font_size=6, min_count=20, max_count=200, min_overlap=4,
                     save_fig=True):
    """

    Parameters
    ----------
    filename
    gene_ratio
    count_column
    padj
    overlap_column
    id_column
    label_column
    gene_ratio_min
    padj_max
    title
    label_font_size
    figsize
    axis_font_size
    min_count
    max_count
    min_overlap
    save_fig

    Returns
    -------

    """
    df = pd.read_csv(f'{filename}')
    # Convert gene ratio to a number
    gr = df[gene_ratio].values
    gene_ratios = []
    for g in gr:
        g = g.split('/')
        g0 = float(g[0])
        g1 = float(g[1])
        gene_ratios.append(g0 / g1)
    df[gene_ratio] = gene_ratios
    df = df[df[gene_ratio] > gene_ratio_min]
    df = df[df[padj] < padj_max]
    if len(df) > 1:
        eplot = Emapplot(df, size_column=count_column, color_column=padj, id_column=id_column,
                         label_column=label_column, overlap_column=overlap_column, overlap_sep='/', title=title,
                         config={'figsize': figsize, 'label_font_size': label_font_size,
                                 'axis_font_size': axis_font_size})
        eplot.build_graph(min_overlap=2, min_count=5, max_count=200)
        plt.title(title, fontsize=18, fontweight='bold')
        plt.gca().set_clip_on = False

        if save_fig:
            plt.savefig(f'{filename.replace(".csv", "")}_Network.svg', bbox_inches='tight', transparent=True)
        plt.show()

        x, y = np.ogrid[:300, :300]
        plt.rcParams['svg.fonttype'] = 'none'  # Ensure text is saved as text
        plt.rcParams['figure.figsize'] = figsize
        mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
        mask = 255 * mask.astype(int)
        wordfeqs = defaultdict(int)
        for g in df[overlap_column].values:
            for w in g.split('/'):
                w = w.replace(' ', '.')
                wordfeqs[w] += 1
        total_words = len(wordfeqs)
        for w in wordfeqs:
            wordfeqs[w] = wordfeqs[w] / total_words
        # Compute the frequency of each word (since there are duplicates sometimes...)
        wordcloud = WordCloud(background_color="white", mask=mask, repeat=False).generate_from_frequencies(wordfeqs)
        wordcloud_svg = wordcloud.to_svg(embed_font=True)
        if save_fig:
            f = open(f'{filename.replace(".csv", "")}_WordCloud.svg', "w+")
            f.write(wordcloud_svg)
            f.close()
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.show()

In [6]:

import os

files = os.listdir(ora_dir)
cluster_files = [c for c in files if 'ClusterGoSummary' in c]

for c in cluster_files:
    if '.svg' not in c and '_RCM' in c:
        if 'TPDE' in c and 'TMDS' in c:
            plot_cluster_ORA(os.path.join(ora_dir, c), figsize=(1.5, 1.5))
        elif ('MDS' in c or 'TPDS' in c) and 'TMDE' in c:
            plot_cluster_ORA(os.path.join(ora_dir, c), figsize=(1.5, 1.5))
        else:
            plot_cluster_ORA(os.path.join(ora_dir, c), figsize=(2.5, 2.5))

2 min_overlap
0       25
1       22
2       22
3       25
4       23
5       24
6       19
7       16
8       16
9        9
11      16
12      16
13      16
21       9
22       9
26       9
42      11
46      10
49      12
60      13
2406    10
2408    15
2649     9
2654    11
Name: Count, dtype: int64

No description has been provided for this image

2 min_overlap
0        9
1        9
2       11
3        7
4        7
        ..
2155    10
2156     6
2157     5
2158     5
2160     5
Name: Count, Length: 100, dtype: int64

2 min_overlap
0       43
1       43
2       43
3       27
4       35
        ..
3803    19
3805    20
3810    21
3828    22
3862    20
Name: Count, Length: 72, dtype: int64

2 min_overlap
0       43
1       37
3       38
10      31
37      33
42      32
47      30
57      30
61      32
4305    41
4308    32
4309    32
4325    32
4724    37
4733    35
4734    35
4736    32
Name: Count, dtype: int64

2 min_overlap
0       21
1       19
2       33
3       27
4       24
5       34
6       27
7       30
8       18
9       17
10      14
11      14
15      17
26      14
42      14
47      14
52      17
61      13
64      14
116     13
121     13
122     14
123     14
3150    24
3151    22
Name: Count, dtype: int64

2 min_overlap
0      5
1      2
2      3
3      2
4      2
5      2
6      4
833    2
834    2
Name: Count, dtype: int64

2 min_overlap
0       21
1       21
2       21
3       15
4       10
        ..
2792    10
3029    20
3030    22
3036    13
3045    10
Name: Count, Length: 64, dtype: int64

2 min_overlap
0       25
1       25
2       25
3       19
4       10
        ..
2099     5
2100     7
2102     5
2105     5
2108     5
Name: Count, Length: 87, dtype: int64

2 min_overlap
0       6
1       8
2       8
3       8
4       6
       ..
1555    3
1556    2
1557    4
1558    3
1559    3
Name: Count, Length: 117, dtype: int64

2 min_overlap
0       44
1       49
2       50
3       36
5       35
9       30
50      29
58      34
64      31
4034    36
4036    33
4462    31
4464    30
4466    30
Name: Count, dtype: int64

In [ ]: