Source code for geneplexus.custom

"""Helper functions for setting up custom networks and GSCs."""
import json
import os.path as osp

import numpy as np

from ._config import logger


def edgelist_to_nodeorder(
    edgelist_loc: str,
    data_dir: str,
    net_name: str,
    sep: str = "\t",
    skiplines: int = 0,
) -> None:
    """Convert :term:`edgelist` to node order.

    The node order (NodeOrder) file is used to map gene IDs to rows in the
    data representation matrix. The unique IDs found in the first two columns
    of the edgelist are written in sorted order, one per line, to
    ``NodeOrder_{net_name}.txt`` inside ``data_dir``.

    Args:
        edgelist_loc: Location of the edgelist
        data_dir: The directory to save the file
        net_name: The name of the network
        sep: The separation used in the edgelist file (default tab)
        skiplines: The number of lines to skip for header

    """
    logger.info("Making the NodeOrder File")
    nodeset = set()
    with open(edgelist_loc) as f:
        for idx, line in enumerate(f):
            if idx < skiplines:
                continue
            tokens = line.strip().split(sep)[:2]
            # Drop empty tokens: a blank line (e.g. a trailing newline at the
            # end of the file) splits into [""], which would otherwise be
            # recorded as a node with an empty ID.
            nodeset.update(tok for tok in tokens if tok)

    outfile = osp.join(data_dir, f"NodeOrder_{net_name}.txt")
    logger.info(f"Saving NodeOrder file to {outfile}")
    np.savetxt(outfile, sorted(nodeset), fmt="%s")
def edgelist_to_matrix(
    edgelist_loc: str,
    data_dir: str,
    net_name: str,
    features: str,
    alpha: float = 0.85,
    sep: str = "\t",
    skiplines: int = 0,
) -> None:
    """Convert :term:`edgelist` to an adjacency matrix or influence matrix.

    The edgelist is read as an undirected graph: every edge is written to both
    ``[i, j]`` and ``[j, i]``. Edges without a third column get weight 1.0.
    The influence matrix is ``alpha * inv(I - (1 - alpha) * W)`` where ``W``
    is the adjacency matrix with each column divided by its sum.

    Note:
        The NodeOrder file needs to be a single column text file. If not
        supplying custom GSC, the file needs to be in Entrez ID space.

    Args:
        edgelist_loc: Location of the edgelist
        data_dir: The directory to save the file
        net_name: The name of the network
        features: Features for the networks (Adjacency or Influence, All)
        alpha: Restart parameter.
        sep: The separation used in the edgelist file (default tab)
        skiplines: The number of lines to skip for header

    Raises:
        ValueError: If ``alpha`` is outside [0, 1], or if an edgelist line has
            fewer than two or more than three columns, or a weight that
            cannot be parsed as a float.
        KeyError: If an edge references a node missing from the NodeOrder
            file.

    """
    if alpha < 0 or alpha > 1:
        raise ValueError(f"Restart parameter (alpha) must be between 0 and 1, got {alpha!r}")

    make_adjacency = features in ("Adjacency", "All")
    make_influence = features in ("Influence", "All")
    if not (make_adjacency or make_influence):
        # Nothing would be saved below; tell the user instead of silently
        # doing all the work and producing no output.
        logger.warning(f"Unrecognized features {features!r}; no matrix will be saved")

    # Load in the NodeOrder file and make node index map.
    # ndmin=1 keeps a single-node file from collapsing into a 0-d array,
    # which supports neither len() nor iteration.
    nodeorder_loc = osp.join(data_dir, f"NodeOrder_{net_name}.txt")
    nodelist = np.loadtxt(nodeorder_loc, dtype=str, ndmin=1)
    node_to_ind = {node: ind for ind, node in enumerate(nodelist)}
    n_nodes = len(nodelist)

    # Make adjacency matrix
    logger.info("Making the adjacency matrix")
    adj_mat = np.zeros((n_nodes, n_nodes), dtype=float)
    with open(edgelist_loc) as f:
        for idx, line in enumerate(f):
            if idx < skiplines:
                continue
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (typically a trailing newline).
                continue
            terms = stripped.split(sep)
            if len(terms) < 2:
                raise ValueError(f"Too few columns in edgelist file (line {idx + 1}): {stripped!r}")
            if len(terms) > 3:
                raise ValueError("Too many columns in edgelist file")
            node1, node2 = terms[:2]
            if (node1 not in node_to_ind) or (node2 not in node_to_ind):
                raise KeyError(f"Nodes in Edgelist but not in NodeOrder file ({node1!r} or {node2!r})")
            i, j = node_to_ind[node1], node_to_ind[node2]
            weight = 1.0 if len(terms) == 2 else float(terms[2])
            adj_mat[i, j] = adj_mat[j, i] = weight

    # Optionally make influence matrix
    if make_influence:
        logger.info("Making the influence matrix")
        col_sums = adj_mat.sum(axis=0)
        # Nodes listed in NodeOrder but absent from the edgelist have a zero
        # column sum; leave those columns as zeros instead of producing NaNs
        # from 0/0, which would poison the matrix inverse.
        adj_mat_norm = np.divide(
            adj_mat,
            col_sums,
            out=np.zeros_like(adj_mat),
            where=col_sums != 0,
        )
        id_mat = np.identity(n_nodes)
        F_mat = alpha * np.linalg.inv(id_mat - (1 - alpha) * adj_mat_norm)

    # Save the data
    logger.info("Saving the data")
    if make_adjacency:
        np.save(osp.join(data_dir, f"Data_Adjacency_{net_name}.npy"), adj_mat)
    if make_influence:
        np.save(osp.join(data_dir, f"Data_Influence_{net_name}.npy"), F_mat)
def subset_gsc_to_network(
    data_dir: str,
    net_name: str,
    gsc_name: str,
    max_size: int = 200,
    min_size: int = 10,
) -> None:
    """Subset :term:`GSC` to only include genes in the network.

    Each gene set in ``GSCOriginal_{gsc_name}.json`` is intersected with the
    nodes in ``NodeOrder_{net_name}.txt``. Sets whose intersection size falls
    within ``[min_size, max_size]`` are written to
    ``GSC_{gsc_name}_{net_name}_GoodSets.json``, and the sorted union of their
    genes is written to ``GSC_{gsc_name}_{net_name}_universe.txt``.

    Note:
        Use the :meth:`geneplexus.download.download_select_data` function to
        get the preprocessed GO and DisGeNet files first.

    Args:
        data_dir: The directory to save the file
        net_name: The name of the network
        gsc_name: The name of the GSC
        max_size: Maximum geneset size.
        min_size: Minimum geneset size.

    """
    logger.info("Subsetting the GSC (this may take a few minutes)")

    # Load in the NodeOrder file. ndmin=1 keeps a single-node file from
    # collapsing into a 0-d array.
    nodeorder_loc = osp.join(data_dir, f"NodeOrder_{net_name}.txt")
    nodelist = np.loadtxt(nodeorder_loc, dtype=str, ndmin=1)
    network_genes = set(nodelist.tolist())

    # Load the original GSC
    with open(osp.join(data_dir, f"GSCOriginal_{gsc_name}.json"), encoding="utf-8") as handle:
        gsc_orig = json.load(handle)

    # Subset the GSC based on the network. The universe is accumulated in a
    # set and sorted once at the end rather than re-sorted for every gene set.
    universe_genes = set()
    gsc_subset = {}
    for term_id, term_info in gsc_orig.items():
        # Gene IDs are compared as strings, matching how NodeOrder is loaded.
        genes = sorted(network_genes.intersection(map(str, term_info["Genes"])))
        if min_size <= len(genes) <= max_size:
            gsc_subset[term_id] = {"Name": term_info["Name"], "Genes": genes}
            universe_genes.update(genes)

    logger.info("Saving the data")
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
    # pinned to UTF-8 instead of depending on the platform default.
    with open(osp.join(data_dir, f"GSC_{gsc_name}_{net_name}_GoodSets.json"), "w", encoding="utf-8") as f:
        json.dump(gsc_subset, f, ensure_ascii=False, indent=4)
    np.savetxt(
        osp.join(data_dir, f"GSC_{gsc_name}_{net_name}_universe.txt"),
        sorted(universe_genes),
        fmt="%s",
    )