Notebook RCM Part 1 ccRCC Figure 2

Figure2

Plot the SiRCle clusters nicely¶

Here we plot the SiRCle clusters and the results from the TF analysis.

In [1]:

import os

# --- Run labels -------------------------------------------------------------
cancer = "ClearCellRenalCellCarcinoma"
regLabel = "RG2_Changes_filtered"

# --- Directories (relative paths, resolved against the working directory) ---
input_dir = "Input_RCM"
supp_dir = "Required_Refs"
output_dir = "Output_Data"
fig_dir = "Output_Figures"

# Folder holding the ORA result tables; the plotting cell below lists it.
ora_dir = os.path.join(output_dir, "ORA")

Make visualisation aesthetic¶

In [2]:

###############################################################################
#                                                                             #
#    This program is free software: you can redistribute it and/or modify     #
#    it under the terms of the GNU General Public License as published by     #
#    the Free Software Foundation, either version 3 of the License, or        #
#    (at your option) any later version.                                      #
#                                                                             #
#    This program is distributed in the hope that it will be useful,          #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
#    GNU General Public License for more details.                             #
#                                                                             #
#    You should have received a copy of the GNU General Public License        #
#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
#                                                                             #
###############################################################################

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
from wordcloud import WordCloud

###############################################################################
#                                                                             #
#    This program is free software: you can redistribute it and/or modify     #
#    it under the terms of the GNU General Public License as published by     #
#    the Free Software Foundation, either version 3 of the License, or        #
#    (at your option) any later version.                                      #
#                                                                             #
#    This program is distributed in the hope that it will be useful,          #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
#    GNU General Public License for more details.                             #
#                                                                             #
#    You should have received a copy of the GNU General Public License        #
#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
#                                                                             #
###############################################################################
import matplotlib as mpl

import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import networkx as nx
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.patches import Rectangle
from matplotlib.legend_handler import HandlerBase
from matplotlib.colors import ListedColormap

from sciviso import Vis

# https://stackoverflow.com/questions/55501860/how-to-put-multiple-colormap-patches-in-a-matplotlib-legend
class HandlerColormap(HandlerBase):
    """Legend handler that draws a colormap as a row of equal-width stripes.

    :param cmap: callable mapping a float to a colour; it is sampled at the
        midpoint of each stripe, i.e. at (2*i + 1) / (2*num_stripes).
    :param num_stripes: how many stripes the legend patch is split into.
    :param kw: forwarded unchanged to ``HandlerBase.__init__``.
    """

    def __init__(self, cmap, num_stripes=8, **kw):
        super().__init__(**kw)
        self.num_stripes = num_stripes
        self.cmap = cmap

    def create_artists(self, legend, orig_handle,
                       xdescent, ydescent, width, height, fontsize, trans):
        """Return one borderless Rectangle per stripe, laid out left to right."""
        n_stripes = self.num_stripes
        return [
            Rectangle([xdescent + idx * width / n_stripes, ydescent],
                      width / n_stripes,
                      height,
                      fc=self.cmap((2 * idx + 1) / (2 * n_stripes)),
                      transform=trans, linewidth=0)
            for idx in range(n_stripes)
        ]


class Emapplot(Vis):
    """Enrichment-map style network of enrichment terms.

    Each row of the dataframe becomes a node; two nodes are joined by an edge
    when the gene lists in ``overlap_column`` share at least ``min_overlap``
    genes. Node size follows ``size_column``, node colour follows
    ``color_column`` (binned adjusted p-value) and edge colour follows the
    number of shared genes.

    NOTE(review): ``self.df``, ``self.load_style`` and ``self.axis_font_size``
    are not defined here; they are presumably provided by ``sciviso.Vis`` --
    confirm against that package.
    """

    # (exclusive upper bound on the p-value, node colour); darker = smaller p.
    _NODE_COLOUR_BINS = (
        (0.0001, "#065f46"),
        (0.001, "#059669"),
        (0.01, "#34d399"),
        (0.05, "#a7f3d0"),
    )
    # Used for p >= 0.05 (and NaN) so that every node always receives a colour.
    _NODE_COLOUR_FALLBACK = "#ecfdf5"

    # (exclusive upper bound on the shared-gene count, edge colour); darker = more shared genes.
    _EDGE_COLOUR_BINS = (
        (10, "#bfbfbf"),
        (20, "#a6a6a6"),
        (30, "#808080"),
        (40, "#595959"),
        (50, "#333333"),
    )
    _EDGE_COLOUR_FALLBACK = "#0d0d0d"

    def __init__(self, df: pd.DataFrame, size_column='Count', color_column='p.adjust', id_column='ID',
                 label_column='Description', overlap_column='gene_id', overlap_sep='/', title='',
                 config=None):
        """
        :param df: one row per enrichment term.
        :param size_column: column used for the node size.
        :param color_column: column used for the node colour (compared against p-value bins).
        :param id_column: column holding the node identifiers.
        :param label_column: column holding the text drawn next to each node.
        :param overlap_column: column holding the ``overlap_sep``-separated gene list of each term.
        :param overlap_sep: separator used inside ``overlap_column``.
        :param title: stored on the instance; not drawn by ``build_graph``.
        :param config: optional style dictionary passed to ``load_style``; ignored when empty or None.
        """
        super().__init__(df)
        self.title = title
        self.size = size_column
        self.color = color_column
        self.id = id_column
        self.label = label_column
        self.overlap_column = overlap_column
        self.overlap_sep = overlap_sep
        if config:
            self.load_style(config)

    @staticmethod
    def _bin_colour(value, bins, fallback):
        """Return the colour of the first bin whose upper bound exceeds ``value``, else ``fallback``."""
        for upper_bound, colour in bins:
            if value < upper_bound:
                return colour
        return fallback

    def build_graph(self, min_count, max_count, min_overlap, seed=None, verbose=True):
        """
        Build the term-overlap graph from the dataframe and draw it on the current matplotlib axes.

        :param min_count: unused; kept so existing callers keep working.
        :param max_count: count that maps to a node size of 100 (sizes scale linearly with the count).
        :param min_overlap: minimum number of shared genes for two terms to be connected.
        :param seed: optional seed forwarded to ``nx.spring_layout``; the default (None) keeps
            the previous non-deterministic layout.
        :param verbose: when True (default) print ``min_overlap`` and the size column, as before.
        :return: None. As a side effect a ``norm_sized`` column is written to ``self.df``.
        """
        node_ids = list(self.df[self.id].values)
        gene_sets = [set(genes.split(self.overlap_sep)) for genes in self.df[self.overlap_column].values]
        if verbose:
            print(min_overlap, 'min_overlap')

        # Overlap is symmetric, so each unordered pair is visited once (i < j) and the
        # edge is stored under the term that comes first in the dataframe.
        edge_map = defaultdict(dict)
        for i in range(len(node_ids)):
            for j in range(i + 1, len(node_ids)):
                shared_genes = len(gene_sets[i] & gene_sets[j])
                if shared_genes >= min_overlap:
                    edge_map[node_ids[i]][node_ids[j]] = shared_genes
        edges = [(source, target) for source, targets in edge_map.items() for target in targets]

        G = nx.Graph()
        # Add every term up front so that terms without any edge are still drawn.
        G.add_nodes_from(node_ids)
        G.add_edges_from(edges)

        # Node size: the count rescaled so that max_count corresponds to 100.
        self.df["norm_sized"] = [int(100 * (c / max_count)) for c in self.df[self.size].values]
        if verbose:
            print(self.df[self.size])

        # Exactly one colour per node / per edge, in the same order as node_ids / edges.
        node_colours = [self._bin_colour(p, self._NODE_COLOUR_BINS, self._NODE_COLOUR_FALLBACK)
                        for p in self.df[self.color].values]
        edge_colours = [self._bin_colour(edge_map[source][target], self._EDGE_COLOUR_BINS,
                                         self._EDGE_COLOUR_FALLBACK)
                        for source, target in edges]

        # A shared layout is needed because nodes, edges and labels are drawn in separate calls.
        pos = nx.spring_layout(G, k=1, seed=seed)
        # nodelist pins the drawing order to the dataframe order; without it networkx draws in
        # G.nodes() order and the colours and sizes end up on the wrong nodes.
        nx.draw_networkx_nodes(G, pos, nodelist=node_ids, node_color=node_colours,
                               node_size=self.df["norm_sized"].values)
        nx.draw_networkx_edges(G, pos, edgelist=edges, alpha=0.8, edge_color=edge_colours, width=0.5,
                               arrows=False)

        labels = dict(zip(node_ids, self.df[self.label].values))
        nx.draw_networkx_labels(G, pos, labels, font_size=self.axis_font_size, font_color='black',
                                font_family='sans-serif', verticalalignment='bottom', clip_on=False)

        plt.axis("off")

def plot_cluster_ORA(filename, gene_ratio='GeneRatio', count_column='Count', padj='p.adjust', overlap_column='geneID',
                     id_column='ID', label_column='Description', gene_ratio_min=0.05, padj_max=0.05, title='',
                     label_font_size=9, figsize=(3, 3), axis_font_size=6, min_count=20, max_count=200, min_overlap=2,
                     save_fig=True):
    """
    Plot an enrichment network and a gene word cloud for one ORA result table.

    The table is filtered on gene ratio and adjusted p-value; when more than one
    term survives, an ``Emapplot`` network and a word cloud of the genes in the
    surviving terms are shown (and optionally saved next to the input file).
    Nothing is plotted when one or zero terms survive.

    Parameters
    ----------
    filename : path of the CSV file to read; also the stem of the output file names.
    gene_ratio : column holding the gene ratio as a "k/n" string; overwritten with k/n as a float.
    count_column : column used for the node size.
    padj : column holding the adjusted p-value (node colour and filter).
    overlap_column : column holding the '/'-separated genes of each term.
    id_column : column holding the term identifier.
    label_column : column holding the term label drawn in the network.
    gene_ratio_min : rows are kept only when their gene ratio is strictly greater than this.
    padj_max : rows are kept only when their adjusted p-value is strictly smaller than this.
    title : title of the network figure.
    label_font_size : passed to the plot style configuration.
    figsize : size of both figures; set before either figure is drawn.
    axis_font_size : passed to the plot style configuration.
    min_count : forwarded to ``Emapplot.build_graph``.
    max_count : forwarded to ``Emapplot.build_graph`` (count that maps to the largest node size).
    min_overlap : forwarded to ``Emapplot.build_graph`` (minimum shared genes for an edge).
        These three values used to be ignored in favour of hard-coded ones; the default
        of ``min_overlap`` is 2, the value that was effectively in use.
    save_fig : when True write ``<stem>_Network.svg`` and ``<stem>_WordCloud.svg``.

    Returns
    -------
    None
    """
    df = pd.read_csv(f'{filename}')

    # Convert the "k/n" gene-ratio strings to numbers.
    gene_ratios = []
    for ratio_text in df[gene_ratio].values:
        parts = ratio_text.split('/')
        gene_ratios.append(float(parts[0]) / float(parts[1]))
    df[gene_ratio] = gene_ratios

    # .copy(): the plot class adds a column to the frame, which must not be a view of the full table.
    df = df[(df[gene_ratio] > gene_ratio_min) & (df[padj] < padj_max)].copy()
    if len(df) <= 1:
        return

    # Set the style before anything is drawn or saved, so it applies to the network figure too.
    plt.rcParams['svg.fonttype'] = 'none'  # Ensure text is saved as text
    plt.rcParams['figure.figsize'] = figsize
    out_prefix = filename.replace(".csv", "")

    plt.figure(figsize=figsize)
    eplot = Emapplot(df, size_column=count_column, color_column=padj, id_column=id_column,
                     label_column=label_column, overlap_column=overlap_column, overlap_sep='/', title=title,
                     config={'figsize': figsize, 'label_font_size': label_font_size,
                             'axis_font_size': axis_font_size})
    eplot.build_graph(min_overlap=min_overlap, min_count=min_count, max_count=max_count)
    plt.title(title, fontsize=18, fontweight='bold')
    plt.gca().set_clip_on(False)

    if save_fig:
        plt.savefig(f'{out_prefix}_Network.svg', bbox_inches='tight', transparent=True)
    plt.show()

    # Circular mask for the word cloud: 255 outside a circle of radius 130, 0 inside.
    x, y = np.ogrid[:300, :300]
    mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
    mask = 255 * mask.astype(int)

    # Count how many of the remaining terms each gene occurs in.
    wordfeqs = defaultdict(int)
    for genes in df[overlap_column].values:
        for word in genes.split('/'):
            wordfeqs[word.replace(' ', '.')] += 1
    # Normalise by the total number of occurrences so the values are true frequencies.
    total_words = sum(wordfeqs.values())
    for word in wordfeqs:
        wordfeqs[word] = wordfeqs[word] / total_words

    wordcloud = WordCloud(background_color="white", mask=mask, repeat=False).generate_from_frequencies(wordfeqs)
    wordcloud_svg = wordcloud.to_svg(embed_font=True)
    if save_fig:
        with open(f'{out_prefix}_WordCloud.svg', "w") as svg_file:
            svg_file.write(wordcloud_svg)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [3]:

# Plot a network and a word cloud for every cluster ORA summary table in ora_dir.
# sorted(): os.listdir returns entries in arbitrary order, which made the order of
# the figures depend on the file system.
files = sorted(os.listdir(ora_dir))
cluster_files = [c for c in files if 'ClusterGoSummary' in c]

for c in cluster_files:
    # Skip SVG files and anything that is not an RCM table.
    if '.svg' in c or '_RCM' not in c:
        continue
    # NOTE(review): 'MDS' also matches 'TMDS'; confirm which cluster names are meant here.
    use_small_figure = ('TPDE' in c and 'TMDS' in c) or (('MDS' in c or 'TPDS' in c) and 'TMDE' in c)
    plot_cluster_ORA(os.path.join(ora_dir, c), figsize=(1.5, 1.5) if use_small_figure else (2.5, 2.5))

2 min_overlap
0       38
1       24
2       21
3       21
4       39
5       19
6       27
7       19
8       27
9       20
11      23
12      22
13      20
14      21
15      27
16      20
17      28
20      30
22      27
24      20
25      24
27      26
29      28
30      20
31      27
32      27
34      27
3500    24
3501    23
3860    19
3881    22
3887    20
Name: Count, dtype: int64

No description has been provided for this image

2 min_overlap
0       27
3       15
4       28
5       16
8       22
14      22
15      23
16      18
18      15
19      20
20      24
21      20
23      16
26      17
50      19
52      19
82      19
133     16
3434    21
3435    21
3444    15
3445    18
Name: Count, dtype: int64

2 min_overlap
0       131
1       117
2        83
3        82
4        83
5        98
6        75
7        82
8        85
9        71
10       55
11       55
12      103
13       83
14       92
15       64
16       63
17       97
19       92
20       76
21       72
24       71
28       86
31       80
33       59
35       74
37       68
48       53
50       50
52       50
54       66
55       64
58       56
59       66
67       64
71       65
72       52
73       51
94       58
171      50
185      49
4205    264
4206    260
4207    181
4208     80
4209     80
4210     88
4211     66
4212     61
4215     60
4216     86
4217     77
4218     61
4224     51
4659     68
4665     63
Name: Count, dtype: int64

2 min_overlap
6       32
7       34
8       34
23      36
34      34
44      35
93      32
4099    33
4523    52
4524    53
4525    53
4526    49
4527    41
4528    40
4534    32
4535    40
Name: Count, dtype: int64

2 min_overlap
0       25
1       27
2       34
3       36
5       25
28      27
36      29
38      25
40      26
47      28
58      26
62      28
69      26
4504    27
4505    25
4506    25
Name: Count, dtype: int64

2 min_overlap
0       20
1       14
3       12
4        8
5       13
6        8
7        8
8        8
9       15
10      14
11      13
12      13
15      13
16      13
18      12
20      10
21       9
28      10
29      11
34      11
36      10
2133    16
2134    16
2135    14
2136    15
2137    16
2142    11
2152    11
2396     8
2397     9
2398     8
2400    13
2405     8
2406     9
Name: Count, dtype: int64

2 min_overlap
0       83
1       66
2       62
3       50
4       62
5       37
6       31
7       27
8       30
9       39
10      40
15      32
17      32
18      38
19      36
21      30
24      34
29      24
30      34
35      32
36      30
39      30
49      25
78      23
136     23
3421    58
3422    42
3423    44
3426    24
3430    24
3431    24
3436    29
3443    24
3795    25
3801    23
3802    28
3803    35
3804    24
3837    25
Name: Count, dtype: int64

2 min_overlap
0       56
1       59
2       45
3       58
4       48
        ..
4307    24
4672    31
4673    31
4675    31
4689    25
Name: Count, Length: 182, dtype: int64

2 min_overlap
0       19
1       17
2       16
3       16
4       21
        ..
2788    11
2795    11
3050    15
3051    11
3053    15
Name: Count, Length: 86, dtype: int64

2 min_overlap
0       101
1        95
2        86
3        70
4        71
       ... 
5308     59
5309     58
5317     47
5321     48
5322     48
Name: Count, Length: 64, dtype: int64

In [ ]: