Source code for geneviz.tracks.feature

from collections import namedtuple
import itertools
import operator

import pandas as pd
import numpy as np
import seaborn as sns
import toolz

from intervaltree import IntervalTree
import matplotlib.pyplot as plt

from matplotlib.patches import Rectangle, FancyArrow
from matplotlib.collections import PatchCollection, LineCollection

from geneviz.util.colormap import build_colormap

from .base import Track


[docs]class FeatureTrack(Track): """Track for plotting generic genomic features in a stacked fashion. The FeatureTrack is the main track that is used to plot genomic features. Overlapping features are plotted in a stacked fashion to avoid overplotting. Optionally, features can be grouped together if they belong to the same group, such as exons for a given gene/transcript. Grouped features are connected by junctions to indicate that they belong to a single group. Features can also be colored by their properties, using the hue and palette arugments. Parameters ---------- data : pandas.Dataframe Dataset for plotting. Each row in the DataFrame is expected to correspond with a single feature. The DataFrame should have the following columns: seqname, start, end, strand; which together specify the location and orientation of the corresponding feature. group : str Column (categorical) by which features should be grouped. Grouped features are drawn together (at the same height) and are visually connected by junctions. label : str Column containing text labels that should be used to name features in the plot. If group is specified, the label of the first feature in the group is used for the group. hue : str Column (categorical) that should be used to determine the color of a given feature. hue_order : List[str] Order to plot the categorical hue levels in, otherwise the levels are inferred from the data objects. palette : List[Union[str, Tuple[float, float, float]]] Colors to use for the different levels of the hue variable. Should be specified as a list of colors (strs) or a list of tuples with RGB values (similar to Seaborn color palettes). strand_junctions : bool Boolean that indicates if group junctions should be drawn in a stranded fashion for groups. If False, the features are drawn with their own respective strands. If True, the strand of a group is determined from the first element of the group and the junctions are drawn upwards (for groups on the forward strand) or downwards (for groups on the reverse strand). height : float The height of an individual feature. spacing : float The (vertical) spacing to use between features. color : str Color for all of the elements. Overridded by hue. patch_kws : dict[str, Any] Dict of keyword arguments to pass to RectangleCollection or FancyArrow collection when drawing feature patches. Used to precisely specify the aesthetics of features. line_kws : dict[str, Any] Dict of keyword arguments to pass to LineCollection when drawing the junctions between groups. Used to specify modify the aesthetics of the junctions. label_kws : dict[str, Any] Dict of keyword arguments to pass to ax.annotate when drawing feature/group labels. Used to precisely specify the aesthetics of the labels. """ def __init__(self, data, group=None, label=None, hue=None, hue_order=None, palette=None, strand_junctions=False, height=1.0, spacing=0.05, color='dimgrey', patch_kws=None, line_kws=None, label_kws=None): super().__init__() # TODO: Add legend for hue. # Setup internal dataframe. color_map = build_colormap( data, hue=hue, palette=palette, order=hue_order) self._data = data self._color_map = color_map # Various visual parameters. self._hue = hue self._group = group self._label = label self._height = height self._spacing = spacing self._strand_junctions = strand_junctions # Detailed style kws for different plot aspects. default_patch_kws = { 'facecolor': color, 'edgecolor': color, 'linewidth': 1 } self._patch_kws = toolz.merge(default_patch_kws, patch_kws or {}) default_line_kws = {'color': color, 'linewidth': 1.5} self._line_kws = toolz.merge(default_line_kws, line_kws or {}) self._label_kws = label_kws or {} @classmethod
[docs] def from_position(cls, data, width, **kwargs): """Constructs instance from frame with positions instead of start/ends. Assumes that the DataFrame contains a 'position' column, which defines the exact position of the given feature. This function expands features so that the width of the plotted feature is equal to the given width and features are centered around their position. Parameters ---------- data : pandas.Dataframe Dataset for plotting. Assumed to be the same format as for the main constructor, apart from containing a 'position' column, rather than start/end columns. width : int The width to use for the expanded features. **kwargs Any kwargs are passed to the main constructor. """ data = data.assign( start=data['position'] - (width // 2), end=data['position'] + (width // 2)) return cls(data=data, **kwargs)
[docs] def get_height(self, region, ax): """Returns the height of the dummy track. Parameters ---------- region : Tuple[str, int, int] The genomic region that will be drawn. Specified as a tuple of (chromosome, start, end). ax : matplotlib.Axes Axis that the track will be drawn on. Returns ------- height : int Height of the track. """ data = self._fetch_data(region).assign(height=self._height) stacked = stack( data, group=self._group, label=self._label, label_func=self._get_label_extent, ax=ax, spacing=self._spacing) return stacked['y'].max() + self._height + self._spacing
def _fetch_data(self, region): """Fetches features within a given region.""" return self._data.query( ('chromosome == {!r} and end >= {} and start <= {}') .format(*region)) # yapf: disable
[docs] def draw(self, region, ax): """Draws the track on the given axis. Parameters ---------- region : Tuple[str, int, int] Genomic region to draw. ax : matplotlib.Axes Axis to draw track on. """ # Fetch data within region. data = self._fetch_data(region).assign(height=self._height) stacked = stack( data, group=self._group, label=self._label, label_func=self._get_label_extent, ax=ax, spacing=self._spacing) # Draw features. if self._hue is None: patches = (self._feature_patch(tup) for tup in stacked.itertuples()) ax.add_collection(PatchCollection(patches, **self._patch_kws)) else: for hue, grp in stacked.groupby(self._hue): patches = (self._feature_patch(tup) for tup in grp.itertuples()) patch_kws = toolz.merge(self._patch_kws, {'facecolor': self._color_map[hue]}) ax.add_collection(PatchCollection(patches, **patch_kws)) # Draw junctions/labels. if self._group is not None: segments = [] for _, grp in stacked.groupby(self._group): # Collect junction segments. segments += list(self._junction_segments(grp)) # Draw labels for groups. if self._label is not None: self._draw_label_group(grp, ax=ax) ax.add_collection(LineCollection(segments, **self._line_kws)) else: # Draw labels for single features. if self._label is not None: for tup in stacked.itertuples(): self._draw_label_single(tup, ax) # Set ylim and style axes. ax.set_ylim(0, stacked['y'].max() + self._height + self._spacing) ax.set_yticks([])
def _feature_patch(self, tup): if self._strand_junctions or pd.isnull(tup.strand): # Return rectangle. return Rectangle( xy=(tup.start, tup.y), width=tup.end - tup.start, height=tup.height) else: # Return directed arrow. if tup.strand == 1: x, dx = tup.start, tup.end - tup.start else: x, dx = tup.end, tup.start - tup.end arrow_kws = dict( head_width=0.9 * tup.height, head_length=abs(tup.start - tup.end) * 0.5, width=0.5 * tup.height, length_includes_head=True) return FancyArrow( x=x, dx=dx, y=tup.y + (0.5 * tup.height), dy=0, **arrow_kws) def _junction_segments(self, grp): grp = grp.sort_values(by='start') first = grp.iloc[0] y = first.y + (0.5 * first.height) if self._strand_junctions and not pd.isnull(first.strand): # Plot stranded junction with offset middle. mid_offset = 0.25 * first.height for start, end in zip(grp.end[:-1], grp.start[1:]): mid = (start + end) / 2 y_mid = y + (mid_offset * first.strand) yield ((start, y), (mid, y_mid), (end, y)) else: # Plot unstranded junction as straight line. for start, end in zip(grp.end[:-1], grp.start[1:]): yield ((start, y), (end, y)) def _draw_label_group(self, grp, ax): # Check if we are drawing in reverse. xlim = ax.get_xlim() reverse = xlim[1] < xlim[0] # Use end as anchor for reverse situation. first = grp.iloc[0] x = grp.end.max() if reverse else grp.start.min() y = first.y + (0.5 * first.height) # Draw label. self._draw_label(x=x, y=y, label=first[self._label], ax=ax) def _draw_label_single(self, tup, ax): # Check if we are drawing in reverse. xlim = ax.get_xlim() reverse = xlim[1] < xlim[0] # Use end as anchor for reverse situation. x = tup.end if reverse else tup.start y = tup.y + (0.5 * tup.height) # Draw label. self._draw_label(x=x, y=y, label=getattr(tup, self._label), ax=ax) def _draw_label(self, x, y, label, ax): return ax.annotate( xy=(x, y), xycoords='data', xytext=(-5, 0), textcoords='offset points', s=label, horizontalalignment='right', verticalalignment='center', clip_on=True, **self._label_kws) def _get_label_extent(self, x, y, label, ax): txt = self._draw_label(x=x, y=y, label=label, ax=ax) plt.draw() bbox = txt.get_window_extent() dbox = bbox.transformed(ax.transData.inverted()) txt.remove() return dbox
def stack(data, group=None, label=None, label_func=None, ax=None, spacing=0.05): """Stacks features in given dataframe using the FFDH algorithm. Parameters ---------- data : pandas.DataFrame DataFrame containing the features to be stacked. group : str Name of categorical column used to group features. label : str Name of categorical column containing feature labels. label_func : Function Function that is used to draw feature labels. This is used to assess the size of each label in data coordinates, which is required when accounting for the label sizes when stacking features. ax : matplotlib.Axes Axes on which feature labels will be drawn. Also used to assess feature label size. spacing : float Amount of vertical spacing to use between features. Returns ------- pandas.DataFrame Annotated version of the input dataframe, containing an extra column 'y' that denotes the y-offset assigned to each feature. """ # TODO: Refactor out label function? (Not a core feature of stack). if group is not None: agg_funcs = {'start': min, 'end': max, 'height': max} if label is not None: agg_funcs[label] = 'first' grouped = (data.groupby(group).agg(agg_funcs)) heights = _stack( grouped, label=label, label_func=label_func, ax=ax, spacing=spacing) heights.index.name = group heights = heights.reset_index() return pd.merge(data, heights[[group, 'y']], on=group, how='outer') else: heights = _stack(data, label=label, label_func=label_func, ax=ax) return pd.concat([data, heights], axis=1) def _stack(data, label=None, label_func=None, ax=None, spacing=0.05): if label is not None: data = _augment_with_labels(data, label, label_func, ax) levels, level_heights = _pack_ffdh(data.itertuples()) level_offsets = np.cumsum(level_heights, dtype=np.float) - level_heights[0] level_offsets += np.arange(1, len(level_offsets) + 1) * spacing rows = [] for objs, offset in zip(levels, level_offsets): for obj in objs: rows.append((obj.Index, offset)) index, y_values = zip(*rows) heights = pd.DataFrame({'y': y_values}, index=index) return heights def _pack_ffdh(objects): """Implementation of the First-Fit Decreasing Height packing algorithm. Packs objects into multiple levels, whilst trying to minimize the total used height. Uses a greedy approach that sorts objects by decreasing height, and then packs them accordingly. See http://cgi.csc.liv.ac.uk/~epa/surveyhtml.html for a description of the algorithm and other options for packing algorithms. Parameters ---------- objects : List[Any] Objects to pack. Any object can be used, as long as the objects have 'height', 'start' and 'end' properties, which define the height and range of the object. Returns ------- Tuple[List[List[Any]], List[int]] Returns a tuple of level assignments (containing the objects) assigned to each level and a list containing the heights of each level. """ # Sort boxes by decreasing height and size. objects = sorted( objects, key=lambda obj: (obj.height, obj.end - obj.start)) objects = objects[::-1] levels = [] level_heights = [] for obj in objects: placed = False for level in levels: if not level.overlaps(obj.start, obj.end): level.addi(obj.start, obj.end, obj) placed = True break if not placed: new_level = IntervalTree.from_tuples([(obj.start, obj.end, obj)]) levels.append(new_level) level_heights.append(obj.height) levels_list = [[interval[2] for interval in level] for level in levels] return levels_list, level_heights def _augment_with_labels(data, label, label_func, ax): # Augment positions. anchor_col = 'end' if _reversed_axis(ax) else 'start' positions = [ label_func( x=x, y=0, label=lab, ax=ax).x0 for lab, x in zip(data[label], data[anchor_col]) ] return data.assign(**{anchor_col: positions}) def _reversed_axis(ax): """Checks if x-axis is reversed.""" xlim = ax.get_xlim() return xlim[1] < xlim[0]
[docs]class RugTrack(Track): """Track that plots density ticks for features. Parameters ---------- data : pandas.Dataframe Dataset for plotting. Each row in the DataFrame is expected to correspond with a single feature. The DataFrame should have the following columns: seqname, position, strand; which together specify the location and orientation of the corresponding feature. hue : str Column (categorical) that should be used to determine the color of a given feature. hue_order : List[str] Order to plot the categorical hue levels in, otherwise the levels are inferred from the data objects. palette : List[Union[str, Tuple[float, float, float]]] Colors to use for the different levels of the hue variable. Should be specified as a list of colors (strs) or a list of tuples with RGB values (similar to Seaborn color palettes). height : float The height of the track. line_kws : dict[str, Any] Dict of keyword arguments to pass to LineCollection when drawing the ticks. Used to specify modify the aesthetics of the ticks. """ def __init__(self, data, hue=None, hue_order=None, palette=None, height=1.0, line_kws=None): super().__init__() self._data = data self._height = height self._hue = hue self._color_map = build_colormap( data, hue=hue, palette=palette, order=hue_order) self._line_kws = line_kws or {}
[docs] def get_height(self, region, ax): """Returns the height of the track. Parameters ---------- region : Tuple[str, int, int] The genomic region that will be drawn. Specified as a tuple of (chromosome, start, end). ax : matplotlib.Axes Axis that the track will be drawn on. Returns ------- height : int Height of the track. """ return self._height
[docs] def draw(self, region, ax): """Draws the track on the given axis. Parameters ---------- region : Tuple[str, int, int] Genomic region to draw. ax : matplotlib.Axes Axis to draw track on. """ data = self._data.query( 'chromosome == {!r} and position > {} and position < {}' .format(*region)) # yapf: disable if self._hue is not None: for hue, grp in data.groupby(self._hue): self._draw_lines(grp, ax, color=self._color_map[hue]) else: self._draw_lines(data, ax) ax.yaxis.set_visible(False)
def _draw_lines(self, data, ax, color=None): segments = (((tup.position, 0), (tup.position, 1)) for tup in data.itertuples()) lines = LineCollection(segments, color=color, **self._line_kws) ax.add_collection(lines)