"""GenePlexus API."""
import os
import os.path as osp
import warnings
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
import pystow
import yaml
from . import _geneplexus
from . import util
from ._config import config
from ._config import logger
from ._config.logger_util import set_stream_level
from .download import download_select_data
from .exception import CustomDataError
class GenePlexus:
    """The GenePlexus API class.

    Holds the data location, network, feature, and gene set collection (GSC)
    settings, and exposes the analysis steps as methods. Each step stores its
    results as attributes on the instance and also returns them.

    A typical call order is :meth:`load_genes`, :meth:`fit_and_predict`, then
    any of :meth:`make_sim_dfs`, :meth:`make_small_edgelist`, and
    :meth:`alter_validation_df`; the later steps read attributes that the
    earlier ones set.

    """

    def __init__(
        self,
        file_loc: Optional[str] = None,
        net_type: config.NET_TYPE = "STRING",
        features: config.FEATURE_TYPE = "Embedding",
        gsc: config.GSC_TYPE = "GO",
        input_genes: Optional[List[str]] = None,
        auto_download: bool = False,
        log_level: config.LOG_LEVEL_TYPE = "WARNING",
    ):
        """Initialize the GenePlexus object.

        Args:
            file_loc: Location of data files, if not specified, set to default
                data path ``~/.data/geneplexus``
            net_type: Type of network to use.
            features: Type of features of the network to use.
            gsc: Type of gene set collection to use for generating negatives.
            input_genes: Input gene list, can be mixed type. Can also be set
                later if not specified at init time by simply calling
                :meth:`load_genes` (default: :obj:`None`).
            auto_download: Automatically download necessary files if set.
            log_level: Logging level.

        Raises:
            ValueError: If a custom network or custom GSC is selected and its
                base file is missing from ``file_loc``.
            CustomDataError: If custom data is selected and the processed
                files checked by :meth:`check_custom` are missing.

        """
        set_stream_level(logger, log_level)
        self._is_custom: bool = False

        # file_loc has to be assigned first: the gsc and net_type setters
        # below read self.file_loc while validating their values.
        self.file_loc = file_loc  # type: ignore
        self.features = features
        self.gsc = gsc
        self.net_type = net_type
        self.log_level = log_level
        self.auto_download = auto_download
        self.input_genes: List[str] = []

        self.check_custom()

        if self.auto_download:
            if self._is_custom:
                # Custom network data is never downloaded; tell the user
                # the option had no effect.
                warnings.warn(
                    f"Skipping auto download for custom network {self.net_type}. "
                    "Unset auto_download option to suppress this message.",
                    UserWarning,
                    stacklevel=2,
                )
            else:
                download_select_data(
                    self.file_loc,
                    "All",
                    self.net_type,
                    self.features,
                    ["GO", "DisGeNet"],
                    log_level=log_level,
                )

        if input_genes is not None:
            self.load_genes(input_genes)

    @property
    def _params(self) -> List[str]:
        """Names of the attributes written out by :meth:`dump_config`."""
        return [
            "file_loc",
            "net_type",
            "features",
            "gsc",
            "auto_download",
            "log_level",
            "input_genes",
        ]

    def dump_config(self, outdir: str):
        """Save parameters configuration to a config file.

        The attributes listed in ``_params`` are written as YAML to
        ``config.yaml`` inside ``outdir``.

        Args:
            outdir: Directory in which ``config.yaml`` is written.

        """
        params_dict = {name: getattr(self, name) for name in self._params}
        path = osp.join(outdir, "config.yaml")
        with open(path, "w") as f:
            yaml.dump(params_dict, f)
            logger.info(f"Config saved to {path}")

    @property
    def file_loc(self) -> str:
        """File location.

        Use default data location ~/.data/geneplexus if not set.

        """
        return self._file_loc

    @file_loc.setter
    def file_loc(self, file_loc: Optional[str]):
        if file_loc is None:
            self._file_loc = str(pystow.join("geneplexus"))
        else:
            self._file_loc = util.normexpand(file_loc)
        logger.info(f"Data direcory set to {self._file_loc}")

    @property
    def net_type(self) -> config.NET_TYPE:
        """Network to use."""
        return self._net_type  # type: ignore

    @net_type.setter
    def net_type(self, net_type: config.NET_TYPE):
        util.check_param("network", net_type, util.get_all_net_types(self.file_loc))

        is_custom = net_type not in config.ALL_NETWORKS
        if is_custom:
            # A custom network must ship its node ordering file.
            node_order_fn = f"NodeOrder_{net_type}.txt"
            if node_order_fn not in os.listdir(self.file_loc):
                raise ValueError(f"Missing file {node_order_fn} for custom network {net_type}")
            logger.info(f"Using custom network {net_type!r}")

        # Recompute the flag on every assignment so that switching from a
        # custom network back to a standard one does not leave it set.
        self._is_custom = is_custom
        self._net_type = net_type

    @property
    def features(self) -> config.FEATURE_TYPE:
        """Features to use."""
        return self._features

    @features.setter
    def features(self, features: config.FEATURE_TYPE):
        util.check_param("feature", features, config.ALL_FEATURES)
        self._features = features

    @property
    def gsc(self) -> config.GSC_TYPE:
        """Geneset collection."""
        return self._gsc

    @gsc.setter
    def gsc(self, gsc: config.GSC_TYPE):
        self._standard_gsc = self._custom_gsc = None
        util.check_param("GSC", gsc, util.get_all_gscs(self.file_loc))

        if gsc not in config.ALL_GSCS:
            # A custom GSC must ship its original gene set file.
            orig_gsc_fn = f"GSCOriginal_{gsc}.json"
            if orig_gsc_fn not in os.listdir(self.file_loc):
                raise ValueError(f"Missing file {orig_gsc_fn} for custom GSC {gsc}")
            logger.info(f"Using custom GSC {gsc!r}")

        self._gsc = gsc

    def check_custom(self):
        """Check custom network and gsc options.

        Nothing is checked when both the network and the GSC are standard
        ones. Otherwise the following files are required in ``file_loc``:

        * ``Data_{features}_{net_type}.npy``
        * ``GSC_{gsc}_{net_type}_GoodSets.json``
        * ``GSC_{gsc}_{net_type}_universe.txt``

        Raises:
            CustomDataError: If any of the required files is missing.

        """
        if self._net_type in config.ALL_NETWORKS and self._gsc in config.ALL_GSCS:
            logger.debug("Skipping custom data checks, using standard data.")
            return

        # Require feature file, gsc file, and gsc universe file
        data_files = os.listdir(self.file_loc)
        features_fname = f"Data_{self.features}_{self.net_type}.npy"
        gsc_fname = f"GSC_{self.gsc}_{self.net_type}_GoodSets.json"
        universe_fname = f"GSC_{self.gsc}_{self.net_type}_universe.txt"

        if features_fname not in data_files:
            raise CustomDataError(
                f"Missing custom network feature data file {features_fname}, "
                "set up using geneplexus.custom.edgelist_loc first.",
            )
        if gsc_fname not in data_files or universe_fname not in data_files:
            raise CustomDataError(
                f"Missing custom GSC data files {gsc_fname} and/or {universe_fname}, "
                "set up using geneplexus.custom.subset_gsc_to_network first.",
            )

    def load_genes(self, input_genes: List[str]):
        """Load gene list, convert to Entrez, and set up positives/negatives.

        :attr:`GenePlexus.input_genes` (List[str]): Input gene list.

        Args:
            input_genes: Input gene list, can be mixed type.

        See also:
            Use :meth:`geneplexus.util.read_gene_list` to load a gene list
            from a file.

        """
        self._load_genes(input_genes)
        self._convert_to_entrez()
        self._get_pos_and_neg_genes()

    def _load_genes(self, input_genes: List[str]):
        """Load gene list into the GenePlexus object.

        Note:
            Implicitly converts genes to upper case.

        """
        self.input_genes = [item.upper() for item in input_genes]

    def _convert_to_entrez(self):
        """Convert the loaded genes to Entrez.

        :attr:`GenePlexus.df_convert_out` (DataFrame)
            A table where the first column contains the original gene IDs, the
            second column contains the corresponding converted Entrez gene IDs.
            The rest of the columns are indicators of whether a given gene is
            present in any one of the networks.
        :attr:`GenePlexus.table_summary` (List[Dict[str, int]])
            List of network stats summary dictionaries. Each dictionary has
            three keys: **Network**, **NetworkGenes**, and **PositiveGenes**
            (the number intersection between the input genes and the network
            genes).
        :attr:`GenePlexus.input_count` (int)
            Number of input genes.

        Returns:
            The ``df_convert_out`` table.

        """
        self.convert_ids, df_convert_out = _geneplexus._initial_id_convert(self.input_genes, self.file_loc)
        self.df_convert_out, self.table_summary, self.input_count = _geneplexus._make_validation_df(
            df_convert_out,
            self.file_loc,
        )
        return self.df_convert_out

    def _get_pos_and_neg_genes(self):
        """Set up positive and negative genes given the network.

        :attr:`GenePlexus.pos_genes_in_net` (array of str)
            Array of input gene Entrez IDs that are present in the network.
        :attr:`GenePlexus.genes_not_in_net` (array of str)
            Array of input gene Entrez IDs that are absent in the network.
        :attr:`GenePlexus.net_genes` (array of str)
            Array of network gene Entrez IDs.
        :attr:`GenePlexus.negative_genes` (array of str)
            Array of negative gene Entrez IDs derived using the input genes and
            the background gene set collection (GSC).

        Returns:
            Tuple of ``(pos_genes_in_net, negative_genes, net_genes)``.

        """
        self.pos_genes_in_net, self.genes_not_in_net, self.net_genes = _geneplexus._get_genes_in_network(
            self.file_loc,
            self.net_type,
            self.convert_ids,
        )
        self.negative_genes = _geneplexus._get_negatives(
            self.file_loc,
            self.net_type,
            self.gsc,
            self.pos_genes_in_net,
        )
        return self.pos_genes_in_net, self.negative_genes, self.net_genes

    def fit_and_predict(
        self,
        logreg_kwargs: Optional[Dict[str, Any]] = None,
        min_num_pos: int = 15,
        num_folds: int = 3,
        null_val: float = -10,
        random_state: Optional[int] = 0,
        cross_validate: bool = True,
    ):
        """Fit a model and predict gene scores.

        Args:
            logreg_kwargs: Scikit-learn logistic regression settings (see
                :class:`~sklearn.linear_model.LogisticRegression`). If not set,
                then use the default logistic regression settings (l2 penalty,
                10,000 max iterations, lbfgs solver).
            min_num_pos: Minimum number of positives required for performing
                cross validation evaluation.
            num_folds: Number of cross validation folds.
            null_val: Null values to fill if cross validation was not able to
                be performed.
            random_state: Random state for reproducible shuffling stratified
                cross validation. Set to None for random.
            cross_validate: Whether or not to perform cross validation to
                evaluate the prediction performance on the gene set. If set to
                ``False``, then skip cross validation and return null_val as cv
                scores.

        :attr:`GenePlexus.mdl_weights` (array of float)
            Trained model parameters.
        :attr:`GenePlexus.probs` (array of float)
            Genome-wide gene prediction scores. A high value indicates the
            relevance of the gene to the input gene list.
        :attr:`GenePlexus.avgps` (array of float)
            Cross validation results. Performance is measured using
            log2(auprc/prior).
        :attr:`GenePlexus.df_probs` (DataFrame)
            A table with 7 columns: **Entrez** (the gene Entrez ID), **Symbol**
            (the gene Symbol), **Name** (the gene Name), **Probability** (the
            probability of a gene being part of the input gene list),
            **Known/Novel** (whether the gene is in the input gene list),
            **Class-Label** (positive, negative, or neutral), **Rank** (rank of
            relevance of the gene to the input gene list).

        Returns:
            Tuple of ``(mdl_weights, df_probs, avgps)``.

        """
        self.mdl_weights, self.probs, self.avgps = _geneplexus._run_sl(
            self.file_loc,
            self.net_type,
            self.features,
            self.pos_genes_in_net,
            self.negative_genes,
            self.net_genes,
            logreg_kwargs=logreg_kwargs,
            min_num_pos=min_num_pos,
            num_folds=num_folds,
            null_val=null_val,
            random_state=random_state,
            cross_validate=cross_validate,
        )
        self.df_probs = _geneplexus._make_prob_df(
            self.file_loc,
            self.net_genes,
            self.probs,
            self.pos_genes_in_net,
            self.negative_genes,
        )
        return self.mdl_weights, self.df_probs, self.avgps

    def make_sim_dfs(self):
        """Compute similarities between the input genes and GO or DisGeNet.

        The similarities are computed based on the model trained on the input
        gene set and models pre-trained on known GO and DisGeNet gene sets.

        :attr:`GenePlexus.df_sim_GO` (DataFrame)
            A table with 4 columns: **ID** (the GO term ID), **Name** (name of
            the GO term), **Similarity** (similarity between the input model
            and a model trained on the GO term gene set), **Rank** (rank of
            similarity between the input model and a model trained on the GO
            term gene set).
        :attr:`GenePlexus.df_sim_Dis` (DataFrame)
            A table with 4 columns: **ID** (the DO term ID), **Name** (name of
            the DO term), **Similarity** (similarity between the input model
            and a model trained on the DO term gene set), **Rank** (rank of
            similarity between the input model and a model trained on the DO
            term gene set).
        :attr:`GenePlexus.weights_GO`
            Dictionary of pretrained model weights for GO. A key is a GO term,
            and the value is a dictionary with three keys: **Name** (name of
            the GO term), **Weights** (pretrained model weights), **PosGenes**
            (positive genes for this GO term).
        :attr:`GenePlexus.weights_Dis`
            Dictionary of pretrained model weights for DisGeNet. A key is a DO
            term, and the value is a dictionary with three keys: **Name** (name
            of the DO term), **Weights** (pretrained model weights),
            **PosGenes** (positive genes for this DO term).

        Returns:
            Tuple of ``(df_sim_GO, df_sim_Dis, weights_GO, weights_Dis)``.

        """
        self.df_sim_GO, self.df_sim_Dis, self.weights_GO, self.weights_Dis = _geneplexus._make_sim_dfs(
            self.file_loc,
            self.mdl_weights,
            self.gsc,
            self.net_type,
            self.features,
        )
        return self.df_sim_GO, self.df_sim_Dis, self.weights_GO, self.weights_Dis

    def make_small_edgelist(self, num_nodes: int = 50):
        """Make a subgraph induced by the top predicted genes.

        :attr:`GenePlexus.df_edge` (DataFrame)
            Table of edge list corresponding to the subgraph induced by the top
            predicted genes (in Entrez gene ID).
        :attr:`GenePlexus.isolated_genes` (List[str])
            List of top predicted genes (in Entrez gene ID) that are isolated
            from other top predicted genes in the network.
        :attr:`GenePlexus.df_edge_sym` (DataFrame)
            Table of edge list corresponding to the subgraph induced by the top
            predicted genes (in gene symbol).
        :attr:`GenePlexus.isolated_genes_sym` (List[str])
            List of top predicted genes (in gene symbol) that are isolated from
            other top predicted genes in the network.

        Args:
            num_nodes: Number of top genes to include.

        Returns:
            Tuple of ``(df_edge, isolated_genes, df_edge_sym,
            isolated_genes_sym)``.

        """
        self.df_edge, self.isolated_genes, self.df_edge_sym, self.isolated_genes_sym = _geneplexus._make_small_edgelist(
            self.file_loc,
            self.df_probs,
            self.net_type,
            num_nodes=num_nodes,
        )
        return self.df_edge, self.isolated_genes, self.df_edge_sym, self.isolated_genes_sym

    def alter_validation_df(self):
        """Make table about presence of input genes in the network.

        :attr:`df_convert_out_subset`
        :attr:`positive_genes`

        Returns:
            Tuple of ``(df_convert_out_subset, positive_genes)``.

        """
        self.df_convert_out_subset, self.positive_genes = _geneplexus._alter_validation_df(
            self.df_convert_out,
            self.table_summary,
            self.net_type,
        )
        return self.df_convert_out_subset, self.positive_genes