A neightbourhood analysis was performed on the dataset using the histoCAT toolbox. The results were written out as .mat files containing p-values for significance of interactions.
There is one folder "all_pvalues" that contains pvalues for all images. Further there are folders containing pvalues for certain clusters of images as determined by manually cutting the dendrogram in the heatmap of Fig. 3d
By Vito Zanotelli, Bodemiller Group, MIT License
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import scipy as sp
%matplotlib notebook
Define the path to the used data
# folder contains all the p_values as .mat files
folder = 'input/all_pValues/'
# basedir contains subfolders that have .mat files containing p_values
basedir = 'input/'
# plotdir is the directory that contains the output
plotdir = 'output'
# plot_prefix: a prefix to be used for the output
plot_prefix = 'final_'
# the filetype of the plots
filetype = '.pdf'
Load the file with all pvalues
files = [fn for fn in os.listdir(folder) if fn.endswith('.mat')]
filedat = pd.DataFrame([(fn, fn.split('_')[0], fn.split('_')[2], fn.split('_')[3].strip('.mat')) for fn in files])
filedat.columns = [['filename', 'sample', 'n_neigh', 'grade']]
filedat['mat'] = filedat['filename'].map(lambda x: sio.loadmat(os.path.join(folder,x))['pMAT'])
Load the files with pvalues per cluster
subdirs = os.walk(basedir).next()[1]
for subdir in subdirs:
filedat['is_' + subdir] = filedat['filename'].isin(os.listdir(os.path.join(basedir, subdir)))
The matlab output matrices have 4 columns:
dim = max(filedat['mat'].map(lambda x: x[:,2].max()))+1
full_shape = (dim, dim)
def pmat_2_full(pmat, direction, f_shape):
"""
Calculate the full matrices from the sparse matrices saved.
Note that missing values will be 0. This is okay here as the minimal p-values are allways 0.001
"""
if direction >0:
idir = 0
else:
idir = 1
mat = sp.sparse.coo_matrix((pmat[:,idir], (pmat[:,2], pmat[:,3])), shape=f_shape).toarray()
return mat
def get_present_counts(fullmatlist):
"""
Calculate for each interaction how often it is non-missing in a list os matrices.
"""
mat = np.stack(fullmatlist, axis=2)
n_presentmat = np.sum(mat >0, axis=2)
return n_presentmat
def get_mean_present_p(fullmatlist):
"""
Calculate the mean of the non-zero p-values.
"""
mat = np.stack(fullmatlist, axis=2)
mat = mat.copy()
mat[mat == 0] = np.NAN
mnmat = np.nanmean(mat, axis=2)
return mnmat
def get_frac_signif(fullmatlist, p_co=0.01):
"""
Calculate the mean of the non-zero p-values.
"""
mat = np.stack(fullmatlist, axis=2)
mat = mat.copy()
mat[mat == 0] = np.NAN
mnmat = np.nanmean(mat<=p_co, axis=2)
return mnmat
def get_sum_signif(fullmatlist, p_co=0.01):
"""
Calculate the mean of the non-zero p-values.
"""
mat = np.stack(fullmatlist, axis=2)
mat = mat.copy()
mat[mat == 0] = np.NAN
return np.nansum(mat<p_co, axis=2)
def get_n(fullmatlist):
return len(fullmatlist)
filedat['fullmats_pos'] = filedat['mat'].map(lambda x: pmat_2_full(x, 1, full_shape))
filedat['fullmats_neg'] = filedat['mat'].map(lambda x: pmat_2_full(x, -1, full_shape))
p= 0.01
apply_fkts = lambda x: pd.Series(
{('pos','ct_present'): get_present_counts(x['fullmats_pos'].tolist()),
('pos','mn_present'): get_mean_present_p(x['fullmats_pos'].tolist()),
('pos','frac_sign'): get_frac_signif(x['fullmats_pos'].tolist(), p_co=p),
('pos','n_sig'): get_sum_signif(x['fullmats_pos'].tolist(), p_co=p),
('pos','n_imgs'): get_n(x['fullmats_pos'].tolist()),
('neg','ct_present'): get_present_counts(x['fullmats_neg'].tolist()),
('neg','mn_present'): get_mean_present_p(x['fullmats_neg'].tolist()),
('neg','frac_sign'): get_frac_signif(x['fullmats_neg'].tolist(), p_co=p),
('neg','n_sig'): get_sum_signif(x['fullmats_neg'].tolist(), p_co=p),
('neg','n_imgs'): get_n(x['fullmats_neg'].tolist())
})
stats = pd.DataFrame(data=subdirs, columns=['cluster'])
stats =stats.set_index('cluster', drop=False)
stats = stats.apply(lambda x: apply_fkts(filedat.loc[filedat['is_'+x['cluster']],:]),
axis=1)
import pygraphviz as pgv
# install graphviz library and libgraphviz
# install with pip as
# pip2 install pygraphviz --install-option="--include-path=/usr/include/graphviz" --install-option="--library-path=/usr/lib/graphviz/"
def col2hex(col):
return '#%02x%02x%02x' % (col[0]*255, col[1]*255, col[2]*255)
cu_fracpres=0.9 # which fraction of the images does the connection needs to be present
cu_fracsig=0.3 # if the connection is present, what fraction does it need to be significant
cmap = plt.get_cmap('Greys')
clusters = ['WardsGrade1', 'WardsGrade3']
node_cols = pd.read_csv('input/pheno_colors.csv')
node_cols_dict = {str(clust): '#'+col for clust, col in zip(node_cols['Cluster'], node_cols['HEX'])}
np.all(stats.loc[clusters[0], ('pos','ct_present')] == stats.loc[clusters[0], ('neg','ct_present')])
Define a helper function to do the plots
def plot_groups(cu_fracsig, cu_fracpres, clusters, cmap_pos=None, cmap_neg=None, node_cols_dict=None):
"""
Plots selected clusters on a common layout using pygraphviz
Args: cu_fracsig: cutoff for significance
cu_fracpres: cutoff for fraction of images were the relation needs to be present
clusters: the selected clusters for plotting
cmap_pos: colormap for positive edges
cmap_neg: colormap for negative edges
node_cols_dict: colors for nodes
"""
if cmap_pos is None:
cmap_pos = plt.get_cmap('Reds')
if cmap_neg is None:
cmap_neg = plt.get_cmap('Blues')
fracsig_pos = dict()
fracsig_neg = dict()
fracpres = dict()
for cluster in clusters:
fracsig_pos[cluster] = stats.loc[cluster, ('pos','frac_sign')].copy()
fracsig_neg[cluster] = stats.loc[cluster, ('neg','frac_sign')].copy()
# the presence of clusters in the images can be determined both from the 'pos' or 'neg' statistics
# as it is independent of the p-value of an interaction.
fracpres[cluster] = stats.loc[cluster, ('pos','ct_present')].copy()/float(stats.loc[cluster, ('pos','n_imgs')])
fracpres[cluster][fracpres[cluster] < cu_fracpres] = 0
fracpres[cluster][np.maximum(fracsig_pos[cluster], fracsig_neg[cluster]) < cu_fracsig] = 0
fracsig_pos[cluster][fracpres[cluster] == 0] = 0
fracsig_neg[cluster][fracpres[cluster] == 0] = 0
fracpres_all = np.stack(fracpres.values(),axis=2).max(axis=2)
fracpos_all = np.stack(fracsig_pos.values(),axis=2).max(axis=2)
G_all = pgv.AGraph(strict=False, directed=True, dpi=160, overlap=False, splines=False, bgcolor="white")
for row in range(fracpres_all.shape[0]):
for col in range(fracpres_all.shape[1]):
if fracpres_all[row, col] > 0:
pos = fracpos_all[row, col]
weight=pos
G_all.add_edge(row,col,key=str(row)+str(col), weight=weight)
G_all.layout(prog='neato')
pos_dict = {n: n.attr['pos'] for n in G_all.nodes_iter()}
for cluster in clusters:
G = pgv.AGraph(strict=False, directed=True, dpi=160, overlap=False, splines=False, bgcolor="white")
for node in G_all.nodes_iter():
G.add_node(node)
for row in range(fracpres[cluster].shape[0]):
for col in range(fracpres[cluster].shape[1]):
if fracpres[cluster][row, col] > 0:
pos = fracsig_pos[cluster][row, col]
neg = fracsig_neg[cluster][row, col]
if pos > neg:
cur_color = col2hex(cmap_pos(pos))
weight = pos
else:
cur_color = col2hex(cmap_neg(neg))
weight = -neg
G.add_edge(row,col,color=cur_color, penwidth=fracpres[cluster][row, col]*4, key=str(row)+str(col), weight=weight)
for node in G.nodes_iter():
node.attr['width'] = 0.3
node.attr['height'] = 0.3
node.attr['margin'] = 0
node.attr['shape'] = 'circle'
node.attr['fixedsize'] = True
node.attr['fontsize'] = 10
node.attr['penwidth'] = 5
if node_cols_dict is not None:
node.attr['color'] = node_cols_dict[node]
node.attr['pos'] = pos_dict[node]
G.layout(prog='neato',args='-n')
#['size'] = "40,40"
G.draw( os.path.join(plotdir, plot_prefix + 'graphs_'+'both'+ '_fracpres' +
str(cu_fracpres)+'_fracsig'+str(cu_fracsig)+
'_'+cluster+filetype))
Use the function to plot the networks used in Figure 3e
The images will be stored in the defined plotfolder
plot_groups(cu_fracsig=cu_fracsig, cu_fracpres=cu_fracpres, clusters=clusters, node_cols_dict=node_cols_dict)