In [1]:
import pandas as pd In [ ]:
%load_ext autoreload In [19]:
%autoreload 2 Simple clustergram example showing clustered heatmap defaults¶
In [64]:
# Simple example: build a 10x20 random frame with two planted blocks and draw
# the clustered heatmap with every option left at its default.
import string

import numpy as np
# Imported here so the cell runs on a fresh kernel; the original relied on
# `plotting` having been imported by a later cell.
from pandas.tools import plotting

shape = (10, 20)
np.random.seed(2013)

# ascii_lowercase/ascii_uppercase are locale-independent (string.lowercase is
# not) and exist on both Python 2 and Python 3.
df = pd.DataFrame(np.random.randn(*shape),
                  index=list(string.ascii_lowercase[0:shape[0]]),
                  columns=list(string.ascii_uppercase[0:shape[1]]))

# -- Add some structure in the matrix so we can try to be correct -- #
# Positional slicing is spelled with .iloc; .ix only fell back to positions
# because the labels are strings.

# Add 5 to rows a,b,c,d,e and columns K,L,M,N,O,P,Q,R,S,T
df.iloc[0:5, 10:20] += 5

# Subtract 5 from rows f,g,h,i,j and columns A,B,C,D,E
df.iloc[5:10, 0:5] -= 5

fig, row_dendrogram, col_dendrogram = plotting.heatmap(df)

# NOTE(review): the saved output of this cell recorded an autoreload failure
# for pandas.tools.plotting (AttributeError: 'NoneType' object has no
# attribute '_cache'). Re-run on a fresh kernel and confirm it is gone.
Overwhelming example showing off all parameters¶
Both `df` and `plot_df` are options because `scipy.cluster.hierarchy` only clusters properly if there are no NAs, and I'm not sure how to get it to properly ignore them. So instead, the user provides two different dataframes: one to cluster on (`df`) and one to plot (`plot_df`). Because it's possible that the user may not
In [48]:
%pdb Automatic pdb calling has been turned ON
In [1]:
range(1) Out[1]:
[0]
In [14]:
%load_ext autoreload %autoreload 2 The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
In [75]:
# Full example: exercise every heatmap() parameter on a frame that contains
# one NA, clustering on an NA-filled copy while plotting the original.
import copy
import string

import brewer2mpl
import matplotlib as mpl
import numpy as np
import pandas as pd
from pandas.tools import plotting

shape = (10, 20)
np.random.seed(2013)

# ascii_lowercase/ascii_uppercase are locale-independent (string.lowercase is
# not) and exist on both Python 2 and Python 3.
df = pd.DataFrame(np.random.randn(*shape),
                  index=list(string.ascii_lowercase[0:shape[0]]),
                  columns=list(string.ascii_uppercase[0:shape[1]]))

# -- Add some structure in the matrix so we can try to be correct -- #
# Add 100 to rows a,b,c,d,e and columns K,L,M,N,O,P,Q,R,S,T
df.iloc[0:5, 10:20] += 100

# Overwrite rows f,g,h,i,j and columns A,B,C,D,E with small uniform draws
# (upper bound 0.001) so the data spans several orders of magnitude.
df.iloc[5:10, 0:5] = np.random.uniform(high=0.001, size=(5, 5))

# --- Crazy stuff starts here!!! ---
# Set the df to absolute so we can show log scaling
df = df.abs()

# Add some NAs
df.loc['c', 'C'] = np.nan

# Frame handed to heatmap() as `df`: the NA is replaced by its column mean.
df_na_mean = df.fillna(df.mean())

# Lower- and upper-case vowels; str.upper() works on Python 2 and 3, unlike
# map(string.upper, ...).
vowels = ['a', 'e', 'i', 'o', 'u']
vowels += [vowel.upper() for vowel in vowels]

set1 = brewer2mpl.get_map('Set1', 'qualitative', 9).mpl_colors
grey = set1[8]
pink = set1[7]

# Side colours: pink for vowel labels, grey for everything else.
col_side_colors = [pink if letter in vowels else grey
                   for letter in df.columns]
row_side_colors = [pink if letter in vowels else grey
                   for letter in df.index]

# Work on a private copy so set_under() does not alter the shared
# module-level YlGnBu colormap for every later plot in this kernel.
cmap = copy.deepcopy(mpl.cm.YlGnBu)

# highlight the NA with white
# NOTE(review): set_under() colours values below vmin; presumably heatmap()
# maps the NA cell below vmin -- verify against its implementation.
cmap.set_under('white')

fig, row_dendrogram, col_dendrogram = plotting.heatmap(
    df=df_na_mean,
    title='Awesome heatmap example',
    title_fontsize=32,
    colorbar_label='powers of 10',
    col_side_colors=col_side_colors,
    row_side_colors=row_side_colors,
    color_scale='log',
    cmap=cmap,
    row_linkage_method='single',
    col_linkage_method='average',
    figsize=(20, 10),
    label_rows=[letter + '++' for letter in df.index],
    label_cols=False,
    xlabel_fontsize=8,
    ylabel_fontsize=20,
    cluster_rows=False,
    cluster_cols=True,
    vmin=1e-4,
    vmax=1e2,
    plot_df=df,
    edgecolor='white',
    linewidth=0.01)