Source code for craft.visualise

import math

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import transforms
import numpy
import pandas

[docs]def fit_text(ax, *args, **kwargs): """Write text into `ax` by passing `args` and `kwargs` to ax.text, then change the view limits of `ax` if necessary to include the text. Trickier than it sounds, and the results are a little approximate. """ t = ax.text(*args, **kwargs) tbox = t.get_tightbbox(ax.get_figure().canvas.get_renderer()) tbox_in_data = tbox.inverse_transformed(t.get_transform()) # We should be able to just do: # # ax.update_datalim_bounds(tbox_in_data) # ax.autoscale_view() # # But the axes data limits are not necessarily what we want. For # instance, after doing an ax.imshow with a transform (as we do # for ld_block), the ax.dataLim is in the array's data coordinates # (whereas the ax.viewLim is in the axes' data coordinates). That # might be a bug but in any case we want really to work with the # viewLim, in case the viewLim has previously been deliberately # moved. So we do it in a longer way: # existing viewLim l, b, w, h = ax.viewLim.bounds r = l+w t = b+h # text box bounds in same coordinates tl, tb, tw, th = tbox_in_data.bounds tr = tl+tw tt = tb+th # new view limits # apply margins where necessary. xmargin, ymargin = ax.margins() nl = l if tl > l else tl - (r-tl) * xmargin nr = r if tr < r else tr + (tr-l) * xmargin nb = b if tb > b else tb - (t-tb) * ymargin nt = t if tt < t else tt + (tt-b) * ymargin ax.set_xlim(nl,nr) ax.set_ylim(nb,nt)
[docs]def ld_block(array, indexes = None, names = None, labels = None, text_kwargs = {}, figsize = (8, 5), cmap = 'Reds', colorbar = True): """Create and return an linkage-disequilibrium block chart. `array` is a square numpy array containing LD values (Pearson's correlation coefficient r). Only the upper triangle of the array (above the diagonal) is used. The values actually plotted are r^2. `indexes` is an iterable of index values into the rows and columns of `array`, giving the order and identity of the SNPs to display. If None, the whole array is displayed in array order. `names` is an iterable of names, which should be the same length as `indexes`, or the number of rows in `array`, used to display row/column names above the LD block. If None, no names are displayed. `labels`, if present, is a dictionary of labels with available keys "mid", "left", and "right", for example dict(mid='chr17, left=12398013, right=18290324). If there are no labels then no label bar is drawn. `figsize` is the size of the figure in inches (width, height). `cmap` is the name of a Matplotlib colormap. `colorbar` controls whether to add a colorbar. The default is True. """ assert array.ndim == 2 if labels: fig, (lab_ax,ax) = plt.subplots(2, figsize = figsize, gridspec_kw = dict( height_ratios=(1,10), hspace=0)) else: fig, ax = plt.subplots(figsize=figsize) if indexes is not None: l = list(indexes) # for consumable iterables assert 0 <= min(l) assert max(l) < array.shape[0] array = array[..., l][l, ...] # array contains Pearson's "r" coefficients. We plot r^2. # Note: point-wise multiplication, not matrix multiplication! array = array*array # how many items lds = array.shape[0] # mask out lower triangle and diagonal of the array mask = numpy.tri(lds) array = numpy.ma.array(array, mask=mask) # set the colormap cmap = matplotlib.cm.get_cmap(cmap) cmap.set_bad('w') # so masked values are white # draw the actual block grid and rotate it as needed. # force colormap range to 0-1. im = ax.imshow(array, cmap = cmap, vmin = 0, vmax = 1, transform = (transforms.Affine2D().rotate_deg(-45) + ax.transData)) # set view limits so we see the necessary part of the grid. s2 = math.sqrt(2) ax.set_xlim(0, s2*(lds-1)) ax.set_ylim(-s2*lds/2, 1) # add labels for each item, changing the view limits to fit. if names is not None: kw = dict(rotation='vertical', horizontalalignment='center', verticalalignment='bottom') kw.update(text_kwargs) for i,n in enumerate(names): fit_text(ax, i*s2, 0, n, **kw) # don't draw axes, ax.set_axis_off() # draw the (optional) color bar. if colorbar: cbar = fig.colorbar(im, ax=ax, shrink=0.5) cbar.set_label(label='$R^2$', rotation=0) # draw the label line and labels if required. if labels: lab_ax.spines['right'].set_visible(False) lab_ax.spines['left'].set_visible(False) lab_ax.spines['top'].set_visible(False) lab_ax.tick_params(which='both', bottom=False, left=False, labelbottom=False, labelleft=False) lab_ax.set_xlim(0, 1) lab_ax.set_ylim(0, 1) for k, x, ha, va in [('left' , 0.0, 'left' , 'bottom'), ('mid' , 0.5, 'center', 'bottom'), ('right', 1.0, 'right' , 'bottom'), ]: if k in labels: lab_ax.text(x, 0, str(labels[k]), horizontalalignment=ha, verticalalignment=va) return fig
[docs]def manhattan(df, x_label, index_df = None, alpha = 5e-8, figsize = (8, 5), size = 1, color = 'b', marker = '.', alpha_line_width = 0.5, alpha_line_color = '0.5', alpha_line_style = '--', good_size = 2, good_color = 'g', good_marker = 'D', good_label_column = 'rsid', good_label_rotation = 'vertical', vertical_lines = []): """Draw and return a "Manhattan plot", with values above some critical threshold marked distinctly and labelled. `df` is a Pandas dataframe with two required columns: "pvalue": the P value of each SNP; "position": the position of the SNP (in base-pairs); If automatic labelling of the most significant SNPs is required, (controlled by `index_df` and `alpha` parameters), the DataFrame must also have a column labelled by the `good_label_column` parameter. `index_df`, if present, is a Pandas dataframe like `df`, with the same required columns, identifying the SNPs to be labelled. `x_label` is a label for the x axis (such as "Chromosome 17"). `alpha` is a threshold for distinguishing "good" SNPs. If `index_df` is present then the SNPs in it will be distinguished. If `index_df` is absent, and `alpha` is set, then SNPs with pvalue less than `alpha` are distinguished. Distinguished SNPs are drawn differently (see the `good_` parameters below). If `index_df` is present, or if both `alpha` and `good_label_column` are set `figsize` is the figure size (width, height) in inches. `size` is the point area for the main scatter plot, in square points. `color` is the point color for the main scatter plot. `marker` is the marker style for the main scatter plot. `alpha_line_width` is the width of the horizontal line to be drawn at the alpha level. `alpha_line_color` is the color specifier for the alpha line. `alpha_line_style` is the line style for the alpha line. `good_size` is the point area for "good" SNPs, in square points. `good_color` is the color for "good" SNPs. `good_marker` is the marker style for "good" SNPs. `good_label_column` is the column name in `df` or `index_df` for labels to be drawn for "good" SNPs, or None for no labels. `good_label_rotation` is the rotation for labels for "good" SNPs. """ # calculate -log_10(P) to plot df['minuslog10pvalue'] = -numpy.log10(df.pvalue) # x axis in megabases, to suppress annoying useOffset nonsense # in the MPL tick formatter. df['positionMb'] = df.position / 1e6 fig, ax = plt.subplots(figsize=figsize) # plot the main scatter ax.scatter(df.positionMb, df.minuslog10pvalue, s=size, color=color, marker=marker) # if we have a threshold (alpha), draw the line if alpha: ax.axhline(-numpy.log10(alpha), linestyle=alpha_line_style, linewidth=alpha_line_width, color=alpha_line_color) # if we want additional vertical lines (e.g. to mark out MHC on chr6) for v in vertical_lines: ax.axvline(v / 1e6, linewidth=0.5, color='k') # if we need to distinguish points: if alpha or (index_df is not None): # which points to distinguish: if index_df is None: index_df = df[df.pvalue < alpha] else: index_df['positionMb'] = index_df.position / 1e6 index_df['minuslog10pvalue'] = -numpy.log10(index_df.pvalue) # draw them differently ax.scatter(index_df.positionMb, index_df.minuslog10pvalue, s=good_size, color=good_color, marker=good_marker) # label them if good_label_column: for i, row in index_df.iterrows(): fit_text(ax, row.positionMb, row.minuslog10pvalue +0.1, row[good_label_column], rotation=good_label_rotation, horizontalalignment='left', verticalalignment='bottom') # axes and ticks ax.set_ylabel(r'$-\log_{10}\ (P)$') ax.set_xlabel(f'{x_label} (Mbp)') ax.ticklabel_format(useOffset=False, style='plain') return fig
[docs]def locus(df, cred_snps = None, figsize = (5, 8), size = 1, color = 'b', marker = '.', threshold = 0.8, alpha_line_width = 0.5, alpha_line_color = '0.5', alpha_line_style = '--', good_size = 2, good_color = 'g', good_marker = 'D', good_label_column = 'rsid', good_label_rotation = 'vertical', tracks=None, track_height = 0.5, track_column = 'tracks', track_colors = ['r','g','b','c','m','y','k'], track_alpha = 0.3, track_linelength = 0.7, track_good_linelength = 1.0, track_lines = False, track_line_width=0.5, track_line_color='k', pos_top = False, genes = None, gene_height = 0.5, ): """Draw and return a "locus plot", with values above some critical threshold marked distinctly and labelled. `df` is a Pandas dataframe with two required columns: "pp": the posterior probability of each SNP; "position": the position of the SNP (in base-pairs); "chromosome": the chromosome name or number (should be the same for every SNP). If automatic labelling of credible SNPs is required, (controlled by `cred_snps` or `alpha` parameter), the DataFrame must also have a column labelled by the `good_label_column` parameter. If a tracks pane is required (controlled by `tracks` parameter), the DataFrame must also have a column labelled by the `track_column` parameter. `figsize` is the figure size (width, height) in inches. `size` is the point area for the main scatter plot, in square points. `color` is the point color for the main scatter plot. `marker` is the marker style for the main scatter plot. `cred_snps`, if given, is a list of credible SNP IDs, or a list of lists of credible SNP IDs. If the latter, the SNPs in each list are drawn in a different color (see `good_color`). `threshold` is a threshold for distinguishing credible SNPs. If `cred_snps` is given then `threshold` is only used for drawing the alpha line. `alpha_line_width` is the width of the horizontal line to be drawn at the alpha level. `alpha_line_color` is the color specifier for the alpha line. `alpha_line_style` is the line style for the alpha line. `good_size` is the point area for credible SNPs, in square points. `good_color` is the color for credible SNPs. If `cred_snps` is given, and is a list of list of IDs, this should be a list of colors (and defaults to ['r','g','b','c','m','y','k']). `good_marker` is the marker style for credible SNPs. `good_label_column` is the column name in `df` or `index_df` for labels to be drawn for credible SNPs, or None for no labels. `good_label_rotation` is the rotation for labels for "good" SNPs. `tracks` is a list of track labels for the tracks pane, if required. `track_height` is the height of the tracks pane, relative to the posterior probability pane. `track_column` is the name of a column in the `df` DataFrame. A given SNP is shown on a track if its entry in this column contains the corresponding label from the `tracks` parameter. `track_colors` is a list of Matplotlib color descriptors, to be used for the tracks. `track_alpha` is the opacity of a track marker for a non-credible SNP (credible SNPs have alpha 1.0). `track_linelength` is the length of a track marker for a non-credible SNP. `track_good_linelength` is the length of a track marker for a credible SNP. `track_lines` controls whether to draw vertical lines under the tracks pane, marking the credible SNPs. `track_line_width` is the width of vertical lines under the tracks pane marking credible SNPs. `track_line_color` is the color of vertical lines under the tracks pane marking credible SNPs. `pos_top` controls whether the panes for tracks and genes are above (True) or below (False) the posterior probabilty pane. `genes` is a list of genes for the genes pane, if required. The list member for each gene should be a tuple (start position, end position, name, strand), where `strand` is "+" or "-". `gene_height` is the height of the genes pane, relative to the posterior probability pane. """ df['positionMb'] = df.position / 1e6 total_height = 1 + track_height + gene_height if tracks and genes: height_ratios = [track_height, gene_height] if pos_top: height_ratios.push(1) else: height_ratios.append(1) fig, axes = ( plt.subplots(3, figsize=figsize, gridspec_kw=dict(height_ratios=height_ratios, hspace=0), sharex=True)) if pos_top: posax, trackax, geneax = axes else: geneax, trackax, posax = axes bottomax = axes[-1] elif tracks or genes: pos_index = 0 if pos_top else 1 other_index = 1 - pos_index height2 = track_height if tracks else gene_height heights = [1,1] heights[other_index] = height2 fig, axes = ( plt.subplots(2, figsize=figsize, gridspec_kw=dict(height_ratios=heights, hspace=0), sharex=True)) bottomax = axes[-1] posax = axes[pos_index] ax2 = axes[other_index] if tracks: trackax = ax2 else: geneax = ax2 else: # neither tracks nor genes fig, posax = plt.subplots(figsize=figsize) bottomax = posax posax.scatter(df.positionMb, df.pp, s=size, color=color, marker=marker) posax.set_ylabel('Posterior probability') chromosome = df.chromosome.unique()[0] posax.set_ylim(0,1) bottomax.set_xlabel(f'Chromosome {chromosome}; position (Mbp)') cred_dfs = None if threshold: posax.axhline(threshold, linestyle=alpha_line_style, linewidth=alpha_line_width, color=alpha_line_color) if not cred_snps: cred_df = df[df.pp > threshold] cred_dfs = [cred_df] good_colors = [good_color] if cred_snps: if isinstance(cred_snps[0], list): cred_snps_list = cred_snps good_colors = ['r','g','b','c','m','y','k'] else: cred_snps_list=[cred_snps] good_colors = [good_color] cred_dfs = [df[df[good_label_column].isin(l)] for l in cred_snps_list] if cred_dfs: for i, cred_df in enumerate(cred_dfs): posax.scatter(cred_df.positionMb, cred_df.pp, s=good_size, color=good_colors[i], marker=good_marker) if good_label_column: for j, row in cred_df.iterrows(): fit_text(posax,row.positionMb, row.pp+0.01, row[good_label_column], rotation=good_label_rotation, horizontalalignment="left", verticalalignment='bottom') posax.set_yticks([y for y in posax.get_yticks() if y >= 0 and y <= 1]) if tracks: track_colors = (track_colors + track_colors)[:len(tracks)] alpha = track_alpha if threshold else 1.0 colors = [matplotlib.colors.to_rgba(c, alpha) for c in track_colors] trackax.set_ylabel('tracks') tracks_array = [df[df[track_column].str.contains(t)].positionMb for t in tracks] trackax.eventplot(tracks_array, linelengths=track_linelength, colors=colors) if threshold: if track_lines: for p in cred_df.positionMb: trackax.axvline(p, zorder=-1, linewidth=track_line_width, color=track_line_color) tracks_array = [cred_df[cred_df[track_column].str.contains(t)].positionMb for t in tracks] trackax.eventplot(tracks_array, linelengths=track_good_linelength, colors=track_colors) trackax.set_yticks(range(len(tracks))) trackax.set_yticklabels(tracks) trackax.set_ylabel('') half_linelength = max(track_good_linelength, track_linelength)/2 trackax.set_ylim(-half_linelength-0.1, len(tracks)-1+half_linelength+0.1) if genes is not None: geneax.set_ylabel('genes') # geneax.spines['top'].set_visible(False) geneax.tick_params(which='both', left=False, labelleft=False) xlims = geneax.get_xlim() y = 0.25 for start, end, name, strand in genes: name = name + r'$\rightarrow$' if strand == '+' else r'$\leftarrow$' + name geneax.plot((start/1e6, end/1e6), (y,y)) geneax.text((start + end)/2e6, y+0.05, name, horizontalalignment="center", clip_on=True) y = 1-y geneax.set_xlim(xlims) geneax.set_ylim(0,1) fig.tight_layout() return fig
def test(): fig, ax = plt.subplots() ax.eventplot(((1,5,7,8),(3,4,1,6),(5,2,8))) fit_text(ax, 3, 2, "Spong") fit_text(ax, 2, -3, "Wibble", rotation=-45) fit_text(ax, 1, 5, "Foobar", verticalalignment='bottom', rotation='vertical') return fig