Source code for pybiomart.dataset

from __future__ import absolute_import, division, print_function

# pylint: disable=wildcard-import,redefined-builtin,unused-wildcard-import
from builtins import *
# pylint: enable=wildcard-import,redefined-builtin,unused-wildcard-import
from future.utils import native_str

from io import StringIO
from xml.etree import ElementTree

import pandas as pd

# pylint: disable=import-error
from .base import ServerBase, BiomartException, DEFAULT_SCHEMA

# pylint: enable=import-error


class Dataset(ServerBase):
    """Class representing a biomart dataset.

    This class is responsible for handling queries to biomart
    datasets. Queries can select a subset of attributes and can be
    filtered using any available filters.

    The valid attributes are available through the ``attributes``
    property. If no attributes are given, the dataset's default
    attributes are used. The valid filters are available through the
    ``filters`` property. The type of value that can be specified for a
    given filter depends on the filter, as some filters accept single
    values whilst others can take lists of values.

    Args:
        name (str): Id of the dataset.
        display_name (str): Display name of the dataset.
        host (str): Url of host to connect to.
        path (str): Path on the host to access to the biomart service.
        port (int): Port to use for the connection.
        use_cache (bool): Whether to cache requests.
        virtual_schema (str): The virtual schema of the dataset.

    Examples:
        Directly connecting to a dataset:
            >>> dataset = Dataset(name='hsapiens_gene_ensembl',
            ...                   host='http://www.ensembl.org')

        Querying the dataset:
            >>> dataset.query(attributes=['ensembl_gene_id',
            ...                           'external_gene_name'],
            ...               filters={'chromosome_name': ['1', '2']})

        Listing available attributes:
            >>> dataset.attributes
            >>> dataset.list_attributes()

        Listing available filters:
            >>> dataset.filters
            >>> dataset.list_filters()

    """

    def __init__(self,
                 name,
                 display_name='',
                 host=None,
                 path=None,
                 port=None,
                 use_cache=True,
                 virtual_schema=DEFAULT_SCHEMA):
        super().__init__(host=host, path=path, port=port, use_cache=use_cache)

        self._name = name
        self._display_name = display_name
        self._virtual_schema = virtual_schema

        # Filled lazily, on first access of the corresponding property.
        self._filters = None
        self._attributes = None
        self._default_attributes = None

    @property
    def name(self):
        """Name of the dataset (used as dataset id)."""
        return self._name

    @property
    def display_name(self):
        """Display name of the dataset."""
        return self._display_name

    @property
    def filters(self):
        """Dict mapping filter names to the dataset's Filter objects.

        The configuration is fetched on first access and then reused.
        """
        if self._filters is None:
            self._filters, self._attributes = self._fetch_configuration()
        return self._filters

    @property
    def attributes(self):
        """Dict mapping attribute names to the dataset's Attribute objects.

        The configuration is fetched on first access and then reused.
        """
        if self._attributes is None:
            self._filters, self._attributes = self._fetch_configuration()
        return self._attributes

    @property
    def default_attributes(self):
        """Dict with the subset of attributes flagged as default."""
        if self._default_attributes is None:
            self._default_attributes = {
                name: attr
                for name, attr in self.attributes.items()
                if attr.default is True
            }
        return self._default_attributes

    def list_attributes(self):
        """Lists available attributes in a readable DataFrame format.

        Returns:
            pd.DataFrame: Frame listing available attributes, with the
                columns 'name', 'display_name' and 'description'.
        """

        def _row_gen(attributes):
            for attr in attributes.values():
                yield (attr.name, attr.display_name, attr.description)

        return pd.DataFrame.from_records(
            _row_gen(self.attributes),
            columns=['name', 'display_name', 'description'])

    def list_filters(self):
        """Lists available filters in a readable DataFrame format.

        Returns:
            pd.DataFrame: Frame listing available filters, with the
                columns 'name', 'type' and 'description'.
        """

        def _row_gen(filters):
            for filter_ in filters.values():
                yield (filter_.name, filter_.type, filter_.description)

        return pd.DataFrame.from_records(
            _row_gen(self.filters), columns=['name', 'type', 'description'])

    def _fetch_configuration(self):
        """Fetches the dataset configuration and parses it.

        Returns:
            tuple[dict, dict]: The filters and the attributes of the
                dataset, each as a dict keyed by name.

        Raises:
            BiomartException: If the response reports a problem
                retrieving the configuration.
        """
        # Get datasets using biomart.
        response = self.get(type='configuration', dataset=self._name)

        # Check response for problems.
        if 'Problem retrieving configuration' in response.text:
            raise BiomartException('Failed to retrieve dataset configuration, '
                                   'check the dataset name and schema.')

        # Get filters and attributes from xml.
        xml = ElementTree.fromstring(response.content)

        filters = {f.name: f for f in self._filters_from_xml(xml)}
        attributes = {a.name: a for a in self._attributes_from_xml(xml)}

        return filters, attributes

    @staticmethod
    def _filters_from_xml(xml):
        """Yields a Filter for every FilterDescription node in xml."""
        for node in xml.iter('FilterDescription'):
            attrib = node.attrib
            # Pass the description along so that list_filters() has
            # something to show; it is empty when the node has none.
            yield Filter(
                name=attrib['internalName'],
                type=attrib.get('type', ''),
                description=attrib.get('description', ''))

    @staticmethod
    def _attributes_from_xml(xml):
        """Yields an Attribute for every AttributeDescription node in xml."""
        for page_index, page in enumerate(xml.iter('AttributePage')):
            for desc in page.iter('AttributeDescription'):
                attrib = desc.attrib

                # Default attributes can only be from the first page.
                default = (page_index == 0 and
                           attrib.get('default', '') == 'true')

                yield Attribute(
                    name=attrib['internalName'],
                    display_name=attrib.get('displayName', ''),
                    description=attrib.get('description', ''),
                    default=default)

    def query(self,
              attributes=None,
              filters=None,
              only_unique=True,
              use_attr_names=False):
        """Queries the dataset to retrieve the contained data.

        Args:
            attributes (list[str]): Names of attributes to fetch in query.
                Attribute names must correspond to valid attributes. See
                the attributes property for a list of valid attributes.
                Defaults to the dataset's default attributes.
            filters (dict[str,any]): Dictionary of filters --> values
                to filter the dataset by. Filter names and values must
                correspond to valid filters and filter values. See the
                filters property for a list of valid filters.
            only_unique (bool): Whether to return only rows containing
                unique values (True) or to include duplicate rows (False).
            use_attr_names (bool): Whether to use the attribute names
                as column names in the result (True) or the attribute
                display names (False).

        Returns:
            pandas.DataFrame: DataFrame containing the query results.

        Raises:
            BiomartException: If an unknown attribute or filter is
                requested, or if the response contains 'Query ERROR'.
            ValueError: If a boolean filter is given an invalid value.

        """

        # Example query from Ensembl biomart:
        #
        # <?xml version="1.0" encoding="UTF-8"?>
        # <!DOCTYPE Query>
        # <Query  virtualSchemaName = "default" formatter = "TSV" header = "0"
        #  uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
        #   <Dataset name = "hsapiens_gene_ensembl" interface = "default" >
        #       <Filter name = "chromosome_name" value = "1,2"/>
        #       <Filter name = "end" value = "10000000"/>
        #       <Filter name = "start" value = "1"/>
        #       <Attribute name = "ensembl_gene_id" />
        #       <Attribute name = "ensembl_transcript_id" />
        #   </Dataset>
        # </Query>

        # Setup query element.
        root = ElementTree.Element('Query')
        root.set('virtualSchemaName', self._virtual_schema)
        root.set('formatter', 'TSV')
        root.set('header', '1')
        root.set('uniqueRows', native_str(int(only_unique)))
        root.set('datasetConfigVersion', '0.6')

        # Add dataset element.
        dataset = ElementTree.SubElement(root, 'Dataset')
        dataset.set('name', self.name)
        dataset.set('interface', 'default')

        # Default to default attributes if none requested.
        if attributes is None:
            attributes = list(self.default_attributes.keys())
        else:
            # The names are walked twice (to build the nodes and to build
            # the column map), so materialize one-shot iterables first.
            attributes = list(attributes)

        # Add attribute elements. Only the lookup is guarded, so that an
        # unrelated KeyError is never reported as an unknown attribute.
        for name in attributes:
            try:
                attr = self.attributes[name]
            except KeyError:
                raise BiomartException(
                    'Unknown attribute {}, check dataset attributes '
                    'for a list of valid attributes.'.format(name))
            self._add_attr_node(dataset, attr)

        if filters is not None:
            # Add filter elements.
            for name, value in filters.items():
                try:
                    filter_ = self.filters[name]
                except KeyError:
                    raise BiomartException(
                        'Unknown filter {}, check dataset filters '
                        'for a list of valid filters.'.format(name))
                self._add_filter_node(dataset, filter_, value)

        # Fetch response.
        response = self.get(query=ElementTree.tostring(root))

        # Raise exception if an error occurred.
        if 'Query ERROR' in response.text:
            raise BiomartException(response.text)

        # Parse results into a DataFrame.
        result = pd.read_csv(StringIO(response.text), sep='\t')

        if use_attr_names:
            # Rename columns with attribute names instead of display names.
            column_map = {
                self.attributes[attr].display_name: attr
                for attr in attributes
            }
            result.rename(columns=column_map, inplace=True)

        return result

    @staticmethod
    def _add_attr_node(root, attr):
        """Adds attribute xml node to root."""
        attr_el = ElementTree.SubElement(root, 'Attribute')
        attr_el.set('name', attr.name)

    @staticmethod
    def _add_filter_node(root, filter_, value):
        """Adds filter xml node to root.

        Args:
            root: XML element the Filter node is appended to.
            filter_ (Filter): Filter to add.
            value: Value to filter by. For boolean filters this is
                True, 'included' or 'only' to include matches, and
                False or 'excluded' to exclude them (strings are
                matched case-insensitively). For other filters, lists
                and tuples are joined with commas and anything else
                is converted with str().

        Raises:
            ValueError: If a boolean filter gets any other value.
        """
        filter_el = ElementTree.SubElement(root, 'Filter')
        filter_el.set('name', filter_.name)

        # Set filter value depending on type.
        if filter_.type == 'boolean':
            # Boolean case. Booleans are handled before anything is
            # treated as a string: calling .lower() on False used to
            # raise an AttributeError before False was ever considered.
            if value is True:
                excluded = '0'
            elif value is False:
                excluded = '1'
            else:
                try:
                    keyword = value.lower()
                except AttributeError:
                    # Not a string-like value, reported as invalid below.
                    keyword = None

                if keyword in ('included', 'only'):
                    excluded = '0'
                elif keyword == 'excluded':
                    excluded = '1'
                else:
                    raise ValueError('Invalid value for boolean filter ({})'
                                     .format(value))
            filter_el.set('excluded', excluded)
        elif isinstance(value, (list, tuple)):
            # List case.
            filter_el.set('value', ','.join(map(str, value)))
        else:
            # Default case.
            filter_el.set('value', str(value))

    def __repr__(self):
        return ('<biomart.Dataset name={!r}, display_name={!r}>'
                .format(self._name, self._display_name))
class Attribute(object):
    """Describes one attribute of a biomart dataset.

    Attributes:
        name (str): Attribute name.
        display_name (str): Attribute display name.
        description (str): Attribute description.
        default (bool): Whether the attribute is a default attribute.

    """

    def __init__(self, name, display_name='', description='', default=False):
        """Creates the attribute.

        Args:
            name (str): Attribute name.
            display_name (str): Attribute display name.
            description (str): Attribute description.
            default (bool): Whether the attribute is a default
                attribute of the corresponding datasets.

        """
        self._name, self._display_name = name, display_name
        self._description, self._default = description, default

    @property
    def name(self):
        """Name of the attribute."""
        return self._name

    @property
    def display_name(self):
        """Display name of the attribute."""
        return self._display_name

    @property
    def description(self):
        """Description of the attribute."""
        return self._description

    @property
    def default(self):
        """Whether this is a default attribute."""
        return self._default

    def __repr__(self):
        template = ('<biomart.Attribute name={!r}, display_name={!r}, '
                    'description={!r}>')
        shown = (self._name, self._display_name, self._description)
        return template.format(*shown)


class Filter(object):
    """Describes one filter of a biomart dataset.

    Attributes:
        name (str): Filter name.
        type (str): Type of the filter (boolean, int, etc.).
        description (str): Filter description.

    """

    def __init__(self, name, type, description=''):
        """Creates the filter.

        Args:
            name (str): Filter name.
            type (str): Type of the filter (boolean, int, etc.).
            description (str): Filter description.

        """
        self._name, self._type, self._description = name, type, description

    @property
    def name(self):
        """Filter name."""
        return self._name

    @property
    def type(self):
        """Filter type."""
        return self._type

    @property
    def description(self):
        """Filter description."""
        return self._description

    def __repr__(self):
        template = '<biomart.Filter name={!r}, type={!r}>'
        # Read through the properties rather than the private fields.
        return template.format(self.name, self.type)