Generates the Network of Fig 3e of the histoCAT paper

A neightbourhood analysis was performed on the dataset using the histoCAT toolbox. The results were written out as .mat files containing p-values for significance of interactions.

There is one folder "all_pvalues" that contains pvalues for all images. Further there are folders containing pvalues for certain clusters of images as determined by manually cutting the dendrogram in the heatmap of Fig. 3d

By Vito Zanotelli, Bodemiller Group, MIT License

In [1]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import scipy as sp
%matplotlib notebook

Define the path to the used data

In [2]:
# folder contains all the p_values as .mat files
folder = 'input/all_pValues/'

# basedir contains subfolders that have .mat files containing p_values
basedir = 'input/'

# plotdir is the directory that contains the output
plotdir = 'output'
# plot_prefix: a prefix to be used for the output
plot_prefix = 'final_'
# the filetype of the plots
filetype = '.pdf'

Load the file with all pvalues

In [3]:
files = [fn for fn in os.listdir(folder) if fn.endswith('.mat')]

filedat = pd.DataFrame([(fn, fn.split('_')[0], fn.split('_')[2], fn.split('_')[3].strip('.mat')) for fn in files])
filedat.columns = [['filename', 'sample', 'n_neigh', 'grade']]

filedat['mat'] = filedat['filename'].map(lambda x: sio.loadmat(os.path.join(folder,x))['pMAT'])

Load the files with pvalues per cluster

In [4]:
subdirs = os.walk(basedir).next()[1]
for subdir in subdirs:
    filedat['is_' + subdir] = filedat['filename'].isin(os.listdir(os.path.join(basedir, subdir)))

The matlab output matrices have 4 columns:

  • pos p value: pvalue for positive interaction (pValue for test "do Cells of type 'from celltype' have on average more 'to celltype' cells as negithbour as by a random distribution")
  • neg p value: pvalue for negative ('avoidance') interaction (pValue for test "do Cells of type 'from celltype' have on average less 'to celltype' cells as neigthbour as by a random distribution")
  • 'from celltype': identified as phenograph cluster number
  • 'to celltype': identified as phenograph cluster number

Define helper functions to calculate summary statistics from the neigthbourhood analysis matrices

In [5]:
dim = max(filedat['mat'].map(lambda x: x[:,2].max()))+1
full_shape = (dim, dim)
In [6]:
def pmat_2_full(pmat, direction, f_shape):
    """
    Calculate the full matrices from the sparse matrices saved.
    Note that missing values will be 0. This is okay here as the minimal p-values are allways 0.001
    """
    if direction >0:
        idir = 0
    else:
        idir = 1
    mat = sp.sparse.coo_matrix((pmat[:,idir], (pmat[:,2], pmat[:,3])), shape=f_shape).toarray()
    return mat

def get_present_counts(fullmatlist):
    """
    Calculate for each interaction how often it is non-missing in a list os matrices. 
    """
    
    mat = np.stack(fullmatlist, axis=2)
    n_presentmat = np.sum(mat >0, axis=2)
    return n_presentmat

def get_mean_present_p(fullmatlist):
    """
    Calculate the mean of the non-zero p-values.
    """
    mat = np.stack(fullmatlist, axis=2)
    mat = mat.copy()
    mat[mat == 0] = np.NAN
    mnmat = np.nanmean(mat, axis=2)
    return mnmat

def get_frac_signif(fullmatlist, p_co=0.01):
    """
    Calculate the mean of the non-zero p-values.
    """
    mat = np.stack(fullmatlist, axis=2)
    mat = mat.copy()
    mat[mat == 0] = np.NAN
    mnmat = np.nanmean(mat<=p_co, axis=2)
    return mnmat


def get_sum_signif(fullmatlist, p_co=0.01):
    """
    Calculate the mean of the non-zero p-values.
    """
    mat = np.stack(fullmatlist, axis=2)
    mat = mat.copy()
    mat[mat == 0] = np.NAN
    
    return np.nansum(mat<p_co, axis=2)

def get_n(fullmatlist):
    return len(fullmatlist)
In [7]:
filedat['fullmats_pos'] = filedat['mat'].map(lambda x: pmat_2_full(x, 1, full_shape))
filedat['fullmats_neg'] = filedat['mat'].map(lambda x: pmat_2_full(x, -1, full_shape))

Calculate the summary matrices:

In [8]:
p= 0.01
apply_fkts = lambda x: pd.Series(
    {('pos','ct_present'): get_present_counts(x['fullmats_pos'].tolist()),
    ('pos','mn_present'): get_mean_present_p(x['fullmats_pos'].tolist()),
    ('pos','frac_sign'): get_frac_signif(x['fullmats_pos'].tolist(), p_co=p),
     ('pos','n_sig'): get_sum_signif(x['fullmats_pos'].tolist(), p_co=p),
     ('pos','n_imgs'): get_n(x['fullmats_pos'].tolist()),
     ('neg','ct_present'): get_present_counts(x['fullmats_neg'].tolist()),
    ('neg','mn_present'): get_mean_present_p(x['fullmats_neg'].tolist()),
    ('neg','frac_sign'): get_frac_signif(x['fullmats_neg'].tolist(), p_co=p),
    ('neg','n_sig'): get_sum_signif(x['fullmats_neg'].tolist(), p_co=p),
    ('neg','n_imgs'): get_n(x['fullmats_neg'].tolist())
    })
In [9]:
stats = pd.DataFrame(data=subdirs, columns=['cluster'])
stats =stats.set_index('cluster', drop=False)
stats = stats.apply(lambda x: apply_fkts(filedat.loc[filedat['is_'+x['cluster']],:]),
                                                     axis=1)
/home/vitoz/.virtualenvs/python2_env/local/lib/python2.7/site-packages/numpy/lib/nanfunctions.py:703: RuntimeWarning: Mean of empty slice
  warnings.warn("Mean of empty slice", RuntimeWarning)

Plot the data as networks:

Concept

Make a graph where edge thickness corresponds to how often an interaction exist at all (fraction of times both celltypes are present together in an image). Color should correspond to which fractions of interactions are significant (white to gray)

In [10]:
import pygraphviz as pgv
# install graphviz library and libgraphviz
# install with pip as
# pip2 install pygraphviz --install-option="--include-path=/usr/include/graphviz" --install-option="--library-path=/usr/lib/graphviz/"

def col2hex(col):
    return '#%02x%02x%02x' % (col[0]*255, col[1]*255, col[2]*255)
In [11]:
cu_fracpres=0.9 # which fraction of the images does the connection needs to be present
cu_fracsig=0.3 # if the connection is present, what fraction does it need to be significant
cmap = plt.get_cmap('Greys')
clusters = ['WardsGrade1', 'WardsGrade3']
In [12]:
node_cols = pd.read_csv('input/pheno_colors.csv')
In [13]:
node_cols_dict = {str(clust): '#'+col for clust, col in  zip(node_cols['Cluster'], node_cols['HEX'])}
In [14]:
np.all(stats.loc[clusters[0], ('pos','ct_present')] == stats.loc[clusters[0], ('neg','ct_present')])
Out[14]:
True

Define a helper function to do the plots

In [15]:
def plot_groups(cu_fracsig, cu_fracpres, clusters, cmap_pos=None, cmap_neg=None, node_cols_dict=None):
    """
    Plots selected clusters on a common layout using pygraphviz
    
    Args:  cu_fracsig:  cutoff for significance
           cu_fracpres: cutoff for fraction of images were the relation needs to be present
           clusters:    the selected clusters for plotting
           cmap_pos:    colormap for positive edges
           cmap_neg:    colormap for negative edges
           node_cols_dict: colors for nodes
    """
    if cmap_pos is None:
        cmap_pos = plt.get_cmap('Reds')
    if cmap_neg is None:
        cmap_neg = plt.get_cmap('Blues')
    fracsig_pos = dict()
    fracsig_neg = dict()
    fracpres = dict()
    for cluster in clusters:
        fracsig_pos[cluster] = stats.loc[cluster, ('pos','frac_sign')].copy()
        fracsig_neg[cluster] = stats.loc[cluster, ('neg','frac_sign')].copy()
        # the presence of clusters in the images can be determined both from the 'pos' or 'neg' statistics
        # as it is independent of the p-value of an interaction.
        fracpres[cluster] = stats.loc[cluster, ('pos','ct_present')].copy()/float(stats.loc[cluster, ('pos','n_imgs')])

        fracpres[cluster][fracpres[cluster] < cu_fracpres] = 0
        fracpres[cluster][np.maximum(fracsig_pos[cluster], fracsig_neg[cluster]) < cu_fracsig] = 0 
        fracsig_pos[cluster][fracpres[cluster] == 0] = 0
        fracsig_neg[cluster][fracpres[cluster] == 0] = 0


    fracpres_all = np.stack(fracpres.values(),axis=2).max(axis=2)
    fracpos_all = np.stack(fracsig_pos.values(),axis=2).max(axis=2)
    G_all = pgv.AGraph(strict=False, directed=True, dpi=160, overlap=False, splines=False, bgcolor="white")
    for row in range(fracpres_all.shape[0]):
        for col in range(fracpres_all.shape[1]):
            if fracpres_all[row, col] > 0:
                pos = fracpos_all[row, col]
                weight=pos           
                G_all.add_edge(row,col,key=str(row)+str(col), weight=weight)
    G_all.layout(prog='neato')
    pos_dict = {n: n.attr['pos'] for n in G_all.nodes_iter()}
    for cluster in clusters:
        G = pgv.AGraph(strict=False, directed=True, dpi=160, overlap=False, splines=False, bgcolor="white")
        for node in G_all.nodes_iter():
            G.add_node(node)

        for row in range(fracpres[cluster].shape[0]):
            for col in range(fracpres[cluster].shape[1]):
                if fracpres[cluster][row, col] > 0:
                    pos = fracsig_pos[cluster][row, col]
                    neg = fracsig_neg[cluster][row, col]
                    if pos > neg:
                        cur_color = col2hex(cmap_pos(pos))
                        weight = pos
                    else:
                        cur_color = col2hex(cmap_neg(neg))
                        weight = -neg
                    G.add_edge(row,col,color=cur_color, penwidth=fracpres[cluster][row, col]*4, key=str(row)+str(col), weight=weight)

        for node in G.nodes_iter():
            node.attr['width'] = 0.3
            node.attr['height'] = 0.3
            node.attr['margin'] = 0
            node.attr['shape'] = 'circle'
            node.attr['fixedsize'] = True
            node.attr['fontsize'] = 10
            node.attr['penwidth'] = 5
            if node_cols_dict is not None:
                node.attr['color'] = node_cols_dict[node]
            node.attr['pos'] = pos_dict[node]


        G.layout(prog='neato',args='-n')
        #['size'] = "40,40"
        G.draw( os.path.join(plotdir, plot_prefix + 'graphs_'+'both'+ '_fracpres' +
                             str(cu_fracpres)+'_fracsig'+str(cu_fracsig)+
                              '_'+cluster+filetype))

Use the function to plot the networks used in Figure 3e

The images will be stored in the defined plotfolder

In [16]:
plot_groups(cu_fracsig=cu_fracsig, cu_fracpres=cu_fracpres, clusters=clusters, node_cols_dict=node_cols_dict)