Source code for pybiomart.mart

from __future__ import absolute_import, division, print_function

# pylint: disable=wildcard-import,redefined-builtin,unused-wildcard-import
from builtins import *
# pylint: enable=wildcard-import,redefined-builtin,unused-wildcard-import

from io import StringIO

import pandas as pd

# pylint: disable=import-error
from .base import ServerBase, DEFAULT_SCHEMA
from .dataset import Dataset
# pylint: enable=import-error


class Mart(ServerBase):
    """Class representing a biomart mart.

    Used to represent specific mart instances on the server. Provides
    functionality for listing and loading the datasets that are available
    in the corresponding mart.

    Args:
        name (str): Name of the mart.
        database_name (str): ID of the mart on the host.
        display_name (str): Display name of the mart.
        host (str): Url of host to connect to.
        path (str): Path on the host to access to the biomart service.
        port (int): Port to use for the connection.
        use_cache (bool): Whether to cache requests.
        virtual_schema (str): Virtual schema associated with the mart.
        extra_params: Extra parameters for the mart. The value is stored
            unchanged on the instance; this class does not interpret it.

    Examples:
        Listing datasets:
            >>> server = Server(host='http://www.ensembl.org')
            >>> mart = server['ENSEMBL_MART_ENSEMBL']
            >>> mart.list_datasets()

        Selecting a dataset:
            >>> dataset = mart['hsapiens_gene_ensembl']

    """

    # Column names applied to the header-less, tab-separated dataset listing
    # parsed in ``_fetch_datasets``. Only 'name', 'display_name' and
    # 'virtual_schema' are read by this class; the 'unknown*' entries are
    # placeholders for columns whose meaning is not identified here.
    RESULT_COLNAMES = ['type', 'name', 'display_name', 'unknown', 'unknown2',
                       'unknown3', 'unknown4', 'virtual_schema', 'unknown5']

    def __init__(self, name, database_name, display_name,
                 host=None, path=None, port=None, use_cache=True,
                 virtual_schema=DEFAULT_SCHEMA, extra_params=None):
        super().__init__(host=host, path=path,
                         port=port, use_cache=use_cache)

        self._name = name
        self._database_name = database_name
        self._display_name = display_name

        self._virtual_schema = virtual_schema
        self._extra_params = extra_params

        # Populated lazily by the ``datasets`` property on first access.
        self._datasets = None

    def __getitem__(self, name):
        """Return the dataset registered under ``name``.

        Raises:
            KeyError: If this mart has no dataset with the given name.

        """
        return self.datasets[name]

    @property
    def name(self):
        """Name of the mart (used as id)."""
        return self._name

    @property
    def display_name(self):
        """Display name of the mart."""
        return self._display_name

    @property
    def database_name(self):
        """Database name of the mart on the host."""
        return self._database_name

    @property
    def datasets(self):
        """Datasets in this mart, as a dict keyed by dataset name.

        The mapping is fetched from the server on first access and the
        same dict is returned on every later access.
        """
        if self._datasets is None:
            self._datasets = self._fetch_datasets()
        return self._datasets

    def list_datasets(self):
        """Lists available datasets in a readable DataFrame format.

        Returns:
            pd.DataFrame: Frame listing available datasets, with one row
                per dataset and the columns 'name' and 'display_name'.

        """
        records = [(dataset.name, dataset.display_name)
                   for dataset in self.datasets.values()]
        return pd.DataFrame.from_records(
            records, columns=['name', 'display_name'])

    def _fetch_datasets(self):
        """Query the server for this mart's datasets.

        Returns:
            dict: Dataset objects keyed by their ``name`` attribute.

        """
        # Get datasets using biomart.
        response = self.get(type='datasets', mart=self._name)

        # Read result frame from response. The listing carries no header
        # row, so the column names are supplied explicitly.
        result = pd.read_csv(StringIO(response.text), sep='\t',
                             header=None, names=self.RESULT_COLNAMES)

        # Convert result to a dict of datasets.
        datasets = (self._dataset_from_row(row)
                    for _, row in result.iterrows())

        return {d.name: d for d in datasets}

    def _dataset_from_row(self, row):
        """Build a Dataset from one row of the parsed dataset listing.

        The dataset reuses this mart's connection settings (host, path,
        port, use_cache) and takes its virtual schema from the row itself
        rather than from the mart.
        """
        return Dataset(name=row['name'],
                       display_name=row['display_name'],
                       host=self.host,
                       path=self.path,
                       port=self.port,
                       use_cache=self.use_cache,
                       virtual_schema=row['virtual_schema'])

    def __repr__(self):
        return (('<biomart.Mart name={!r}, display_name={!r},'
                 ' database_name={!r}>')
                .format(self._name, self._display_name,
                        self._database_name))