Source code for knowledgebases

from abc import abstractmethod
import math
from opentargets import OpenTargetsClient
import contextlib, sys
import pandas as pd
import benchutils as util
import logging
import time, requests, json
from datetime import datetime
from lxml.html import fromstring

@contextlib.contextmanager
def suppress_stdout(suppress=True):
    """Context manager that temporarily redirects ``sys.stdout`` to the null device.

    :param suppress: if True (default), everything written to ``sys.stdout`` inside the ``with`` block is discarded. If False, the block runs with stdout untouched.
    :type suppress: bool
    :return: nothing is bound by the ``with`` statement.
    :rtype: None
    """
    import os  # local import: only needed for the portable null-device path

    if not suppress:
        # still has to yield exactly once, otherwise contextlib raises a RuntimeError
        yield
        return

    std_ref = sys.stdout
    # the with-statement closes the null-device handle again once we are done
    with open(os.devnull, 'w') as devnull:
        sys.stdout = devnull
        try:
            yield
        finally:
            # restore stdout even if the wrapped block raised
            sys.stdout = std_ref
# Run the bioservices/pypath imports with stdout redirected
# (presumably because these packages print to stdout on import - TODO confirm).
with suppress_stdout():
    from bioservices import KEGG, REST
    import pypath.core.network as network
    import pypath.core.entity as entity
    import pypath.internals.input_formats as input_formats

# logging.disable(0) lifts any process-wide logging suppression, so all levels are emitted again
logging.disable(0)

# Template for the warning that is logged whenever a knowledge base query yields nothing.
# Placeholders: {kb} = knowledge base name, {lbl} = the queried labels.
WARNING_NOTHING_RETRIEVED_MESSAGE = "WARNING: No prior knowledge retrieved from {kb} for {lbl} - continue with empty set. This could be because there actually is no related information available in the knowledge base. If this message occurs with a HTTP status code >=500, there is probably a connection issue on server side. Consider retrying later."

############################### WEB SERVICES ###############################
class ENRICHR(REST):
    """Queries some of the API endpoints of the EnrichR web service (https://maayanlab.cloud/Enrichr/help#api).

    :param config: configuration parameters for EnrichR web service (as specified in config file).
    :type config: dict
    """

    # Column names assigned to the rows returned by the "enrich" endpoint.
    # "Rank" is dropped before the result is handed to the caller.
    _ENRICH_COLUMNS = ["Rank", "Term name", "P-value", "Z-score", "Combined score", "Overlapping genes", "Adjusted p-value", "Old p-value", "Old adjusted p-value"]

    def __init__(self):
        self.config = util.getConfig("Enrichr")
        # temporarily disable logging to only show errors, otherwise our output will get spammed by
        # bioservice logs (they reset the logging level internally when creating a new instance)
        logging.disable(50)
        try:
            super().__init__("ENRICHR", url=self.config["webservice_uri"])
            logging.getLogger("bioservices:ENRICHR").setLevel(logging.ERROR)
        finally:
            # always re-enable logging, even if the REST constructor fails
            logging.disable(0)

    def addlist(self, geneList):
        """Queries EnrichR to annotate a given list of genes. Returns a userListID, which can be used to retrieve the actual results in a second query.

        :param geneList: list of genes to annotate
        :type geneList: list of str
        :return: json response containing a userListID.
        :rtype: dict of str
        """
        # one gene per line, every line (including the last one) terminated by a newline
        genesStr = "".join(str(gene) + "\n" for gene in geneList)
        payload = {
            'list': (None, genesStr)
        }
        response = self.http_post('addList', files=payload, frmt="json")
        return response

    def export(self, params):
        """Download file of enrichment results. Requires a userListId that was retrieved from a prior query.

        :param params: query parameters (userListId: Identifier returned from addList endpoint, filename: Name of text file download, backgroundType: Gene set library for which to download results)
        :type params: dict of str
        :return: text file containing enrichment results.
        :rtype: str
        """
        # work on a copy so that the caller's dict is not modified as a side effect
        query_params = dict(params)
        query_params["stream"] = "true"
        response = self.http_get("export", params=query_params)
        return response

    def genemap(self, params):
        """Finds all terms, their descriptions, and optional categorizations, for a given gene identifier.

        :param params: query parameters (gene: Gene to use in search for terms, json (optional): Set "true" to return JSON rather than plaintext, setup (optional): Set "true" to get category information for the libraries)
        :type params: dict of str
        :return: json object of all terms containing the specified gene and their descriptions.
        :rtype: dict of str
        """
        response = self.http_get("genemap", params=params)
        return response

    def enrich(self, params):
        """Returns all terms that are available in the library (specified by backgroundType param) and enriched in the given set of genes (specified by userListId param).

        :param params: query parameters (userListId: Identifier returned from addList endpoint; backgroundType: Gene set library to enrich against)
        :type params: dict of str
        :return: dataframe object of all enriched terms (unsorted, unfiltered). Empty (but with all columns) if nothing was retrieved.
        :rtype: :class:`pandas.DataFrame`
        """
        reported_columns = self._ENRICH_COLUMNS[1:]
        response = self.http_get("enrich", params=params)

        # the response is expected to be a dict with a single entry (the list of result rows);
        # anything else (e.g. None or a status code, which other callers in this module check for)
        # is treated like "no results"
        rows = None
        if isinstance(response, dict) and response:
            rows = next(iter(response.values()))

        res = pd.DataFrame.from_dict(rows) if rows else pd.DataFrame()
        if res.empty:
            # create empty dataframe if we did not receive any results
            return pd.DataFrame(columns=reported_columns)

        res.columns = self._ENRICH_COLUMNS
        return res[reported_columns]
class UMLS_AUTH(REST):
    """Singleton class. Python code encapsulates it in a way that is not shown in Sphinx, so have a look at the descriptions in the source code.

    Authentication service to get access to the UMLS database (which we need for retrieving CUI disease codes for querying DisGeNET).
    You first have to get a ticket-granting ticket (tgt, valid for 8 hours) with the help of an API key.
    With the tgt, you can then request a service ticket for every new query to the UMLS database.
    The service ticket must then be used for the query.
    The task of this class is to generate a valid tgt and subsequent service ticket.
    Documentation on the authentication process: https://documentation.uts.nlm.nih.gov/rest/authentication.html

    :param config: configuration parameters for UMLS web service as specified in config file.
    :type config: dict
    :param tgt_timestamp: timestamp of the tgt. If it is older than 8 hours, we need to request a new tgt.
    :type tgt_timestamp: :class:`datetime.datetime`
    :param tgt: id of the ticket-granting ticket (valid for 8 hours). With this ticket, we can then query the actual UMLS web service.
    :type tgt: str
    :param service: uri for the service login
    :type service: str
    """

    class __UMLS_AUTH(REST):
        # lifetime of a ticket-granting ticket in seconds (8 hours)
        TGT_LIFETIME_SECONDS = 28800

        def __init__(self):
            self.config = util.getConfig("UMLS")
            self.tgt_timestamp = None
            self.tgt = None
            self.service = self.config["loginservice_uri"]
            # temporarily disable logging to only show errors, otherwise our output will get spammed by
            # bioservice logs (they reset the logging level internally when creating a new instance)
            logging.disable(50)
            try:
                super().__init__("UMLS_AUTH", url=self.config["login_uri"])
                logging.getLogger("bioservices:UMLS_AUTH").setLevel(logging.ERROR)
            finally:
                # always re-enable logging, even if the REST constructor fails
                logging.disable(0)

        def get_tgt(self):
            """Get a ticket-granting ticket (tgt) from the authentication service.

            :return: a valid tgt (valid for 8 hours).
            :rtype: str
            """
            requested_at = datetime.now()
            params = {'apikey': self.config["apikey"]}
            r = self.http_post(self.config["auth_endpoint"], data=params)
            response = fromstring(r)
            ## extract the entire URL needed from the HTML form (action attribute) returned - looks similar to
            ## https://utslogin.nlm.nih.gov/cas/v1/tickets/TGT-36471-aYqNLN2rFIJPXKzxwdTNC5ZT7z3B3cTAKfSc5ndHQcUxeaDOLN-cas
            ## we make a POST call to this URL in the get_st method
            # only extract the very last part of the uri
            tgt_str = response.xpath('//form/@action')[0]
            tgt = tgt_str.split("/")[-1]
            # only remember the timestamp once a ticket was actually obtained - otherwise a failed
            # renewal would make an expired ticket look fresh on the next call
            self.tgt_timestamp = requested_at
            return tgt

        # authentication function: get the service ticket - valid for up to 8 minutes
        def get_st(self):
            """Get a service ticket from the authentication service (valid for 8 minutes), which can then be used in the actual query.
            In order to get a service ticket, a valid ticket-granting ticket (tgt) must be provided.
            If the last tgt is outdated, generate a new one.

            :return: service ticket.
            :rtype: str
            """
            # check if a valid tgt exists (expires after 8 hours)
            tgt_missing = self.tgt is None or self.tgt_timestamp is None
            if tgt_missing or (datetime.now() - self.tgt_timestamp).total_seconds() > self.TGT_LIFETIME_SECONDS:
                # request new tgt
                self.tgt = self.get_tgt()

            params = {'service': self.service}
            st = self.http_post(self.config["auth_endpoint"] + self.tgt, data=params)
            return st

    # the one shared instance that does the actual work
    instance = None

    def __init__(self):
        if not UMLS_AUTH.instance:
            UMLS_AUTH.instance = UMLS_AUTH.__UMLS_AUTH()

    def __getattr__(self, name):
        # only called for attributes not found on the wrapper itself; forward them to the shared instance
        return getattr(self.instance, name)
# web service for retrieving UMLS CUI codes for labels (DisGeNET requires CUIs)
class UMLS(REST):
    """Retrieves UMLS CUI codes for labels, which can then be used for querying DisGeNET.

    :param config: configuration parameters for UMLS web service (as specified in configuration file).
    :type config: dict
    :param auth: authentication component to generate a valid service ticket (required for every query).
    :type auth: :class:`UMLS_AUTH`
    """

    def __init__(self):
        self.config = util.getConfig("UMLS")
        self.auth = UMLS_AUTH()
        # temporarily disable logging to only show errors, otherwise our output will get spammed by
        # bioservice logs (they reset the logging level internally when creating a new instance)
        logging.disable(50)
        try:
            super().__init__("UMLS", url=self.config["webservice_uri"])
            logging.getLogger("bioservices:UMLS").setLevel(logging.ERROR)
        finally:
            # always re-enable logging, even if the REST constructor fails
            logging.disable(0)

    def getCUIs(self, labels):
        """Get CUIs for the given labels.

        Labels for which the request fails or returns nothing usable are skipped (a warning is logged).

        :param labels: list of identifiers for which to retrieve CUIs, e.g. disease names.
        :type labels: list of str
        :return: list of CUIs (without duplicates, in no particular order).
        :rtype: list of str
        """
        cuis = set()
        for label in labels:
            # every request needs its own service ticket; the search string is handed over as a
            # query parameter so that it gets URL-encoded (labels may contain spaces, '&', '#', ...)
            params = {'ticket': self.auth.get_st(), 'string': label, 'searchType': 'exact'}
            response = self.http_get("/search/current", params=params)

            if response is None:
                util.logWarning(WARNING_NOTHING_RETRIEVED_MESSAGE.format(kb="DisGeNET", lbl=" ,".join(labels)))
                continue

            # an int response is treated as a status code; report server-side errors (>= 500) as warnings
            if isinstance(response, int):
                if response >= 500:
                    util.logWarning("WARNING: Error code " + str(response) + " returned from DisGeNET for " + " ,".join(labels))
                else:
                    util.logDebug("DEBUG: Code " + str(response) + " returned from DisGeNET for " + " ,".join(labels))
                util.logWarning(WARNING_NOTHING_RETRIEVED_MESSAGE.format(kb="DisGeNET", lbl=" ,".join(labels)))
                continue

            # get CUIs out of response
            try:
                results = response["result"]["results"]
            except (KeyError, TypeError):
                # response does not have the expected structure - treat like an empty result
                util.logWarning(WARNING_NOTHING_RETRIEVED_MESSAGE.format(kb="DisGeNET", lbl=" ,".join(labels)))
                continue

            # NOTE(review): the UMLS search endpoint reports "no match" as a single placeholder
            # result with ui "NONE" - skip it so it is not passed on as a CUI. Confirm against the API docs.
            cuis.update(result["ui"] for result in results if result["ui"] != "NONE")

        return list(cuis)
class DISGENET(REST):
    """Queries the DisGeNET web service for a given set of labels and retrieves association scores for all genes related to the query labels.
    DisGeNET API documentation: https://www.disgenet.org/api/

    :param umls: web service for transforming disease names to CUIs (required for query).
    :type umls: :class:`UMLS`
    """

    def __init__(self):
        self.umls = UMLS()
        # temporarily disable logging to only show errors, otherwise our output will get spammed by
        # bioservice logs (they reset the logging level internally when creating a new instance)
        logging.disable(50)
        try:
            super().__init__("DisGeNET", url=util.config["DisGeNET"]["webservice_url"])
            logging.getLogger("bioservices:DisGeNET").setLevel(logging.ERROR)
        finally:
            # always re-enable logging, even if the REST constructor fails
            logging.disable(0)

    def getVersion(self):
        """Get the current version of the DisGeNET API endpoint.

        :return: web service version infos.
        :rtype: json dict
        """
        ret = self.http_get("/version")
        return ret

    def _emptyResult(self, labels):
        """Logs that nothing was retrieved for the given labels and returns an empty result DataFrame."""
        util.logWarning(WARNING_NOTHING_RETRIEVED_MESSAGE.format(kb="DisGeNET", lbl=" ,".join(labels)))
        # NOTE(review): the second column is named after the configured score field here, whereas a
        # non-empty result renames it to "score" - kept as is, confirm what callers expect.
        return pd.DataFrame(columns=["gene_symbol", util.config["DisGeNET"]["associationScore"]])

    def query(self, labels):
        """Conducts the actual query to retrieve gene-disease association scores for a given list of disease labels.
        Transforms the disease labels into CUIs before with the UMLS web service.
        Stores the results in an intermediate file and maps the gene identifiers to the configured final format if that is not HGNC.

        :param labels: list of disease labels for which to retrieve gene-disease associations.
        :type labels: list of str
        :return: DataFrame with gene-disease association scores (empty if nothing could be retrieved).
        :rtype: :class:`pandas.DataFrame`
        """
        cuis = self.umls.getCUIs(labels)
        if not cuis:
            # without any CUI there is nothing to ask DisGeNET for - skip the request and its retries
            return self._emptyResult(labels)

        # CUIs are passed as a comma-separated list, with the comma already URL-encoded
        cui_string = "%2C".join(cuis)

        # add the api key for DisGeNET
        api_key = util.getConfigValue("DisGeNET", "apikey")
        authentication = {"Authorization": "Bearer %s" % api_key}
        requestString = "/gda/disease/" + cui_string

        ret = None
        self.TIMEOUT = 30
        # retry query until we have to wait too long. DisGeNET sometimes seems to be quite slow
        while self.TIMEOUT < 250 and ret is None:
            util.logDebug("DEBUG: Trying to connect to DisGeNET. Timeout set to " + str(self.TIMEOUT))
            # disable timeout warnings to be printed to the command line
            logging.disable(100)
            try:
                ret = self.http_get(requestString, headers=authentication)
            finally:
                # re-enable logging even if the request raised
                logging.disable(0)
            self.TIMEOUT += 30

        # an int response is treated as a status code; report server-side errors (>= 500) as warnings
        if isinstance(ret, int):
            if ret >= 500:
                util.logWarning("WARNING: Error code " + str(ret) + " returned from DisGeNET for " + " ,".join(labels))
            else:
                util.logDebug("DEBUG: Code " + str(ret) + " returned from DisGeNET for " + " ,".join(labels))
            return self._emptyResult(labels)

        # bring the result into a readable format
        results = pd.DataFrame(ret)
        try:
            reduced_results = results.loc[:, ["gene_symbol", util.config["DisGeNET"]["associationScore"]]]
        except KeyError:
            # the expected columns are missing, e.g. because the result is empty
            return self._emptyResult(labels)
        reduced_results.columns = ["gene_symbol", "score"]

        # save the results in an intermediate file
        dir_name = util.config["DisGeNET"]["outputDir"] + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + "/"
        util.createDirectory(dir_name)
        filename = "queryResults.csv"
        reduced_results.to_csv(dir_name + filename, sep=",", index=False)

        # map gene IDs to the desired format if necessary
        desiredIDFormat = util.config["Dataset"]["finalGeneIDFormat"]
        if desiredIDFormat != "HGNC":
            mapped_dir = dir_name + "mapped_" + "HGNC_" + desiredIDFormat + "/"
            outputFile = mapped_dir + filename
            util.createDirectory(mapped_dir)
            reduced_results = util.mapRanking(reduced_results, "HGNC", desiredIDFormat, outputFile)

        return reduced_results
class GCONVERT():
    """Client for the g:Convert web service, which maps a list of identifiers to a desired format. g:Convert makes use of the Ensembl build.
    g:Convert API documentation: https://biit.cs.ut.ee/gprofiler/page/apis

    :param url: API url as specified in the configuration file.
    :type url: str
    """

    def __init__(self):
        self.name = "GCONVERT"
        self.url = util.config["gConvert"]["webservice_url"]

    def query(self, items, originalFormat, desiredFormat):
        """Map a list of identifiers to the desired format.

        :param items: list of identifiers to be mapped
        :type items: list of str
        :param originalFormat: current format of the identifiers (only used to name the column of the incoming identifiers)
        :type originalFormat: str
        :param desiredFormat: desired identifier format
        :type desiredFormat: str
        :return: DataFrame containing the identifier mapping (one column per format).
        :rtype: :class:`pandas.DataFrame`
        """
        request_body = {
            'organism': 'hsapiens',
            'target': desiredFormat,
            'query': items,
        }
        answer = requests.post(url=self.url, json=request_body)

        # collect (converted, incoming) per result entry, reading the keys in that order
        pairs = [(entry["converted"], entry["incoming"]) for entry in answer.json()['result']]
        converted_ids = [converted for converted, _ in pairs]
        incoming_ids = [incoming for _, incoming in pairs]

        return pd.DataFrame({originalFormat: incoming_ids, desiredFormat: converted_ids})
class PATHWAYCOMMONSWS(REST):
    """Queries the PathwayCommons web service.
    Bioservices' existing implementation to query PathwayCommons was not used because it contained outdated values for _valid_formats for pathway retrieval, so we used the original code and adapted it to work correctly.
    """

    def __init__(self):
        self.easyXMLConversion = False
        self._default_extension = "json"
        # temporarily disable logging to only show errors, otherwise our output will get spammed by
        # bioservice logs (they reset the logging level internally when creating a new instance)
        logging.disable(50)
        try:
            super().__init__("PathwayCommonsWS", url=util.config["PathwayCommons"]["webservice_url"])
            logging.getLogger("bioservices:PathwayCommonsWS").setLevel(logging.ERROR)
        finally:
            # always re-enable logging, even if the REST constructor fails
            logging.disable(0)

    _valid_direction = ["BOTHSTREAM", "DOWNSTREAM", "UPSTREAM"]

    def getVersion(self):
        """Queries the "/version" endpoint of the web service.

        :return: response of the version endpoint.
        """
        ret = self.http_get("/version")
        return ret

    # just a get/set to the default extension
    def _set_default_ext(self, ext):
        """Sets the extension (response format) used for search requests.

        :param ext: extension to use, either "json" or "xml".
        :type ext: str
        """
        self.devtools.check_param_in_list(ext, ["json", "xml"])
        self._default_extension = ext

    def _get_default_ext(self):
        """Returns the extension (response format) currently used for search requests.

        :return: the current default extension.
        :rtype: str
        """
        return self._default_extension

    default_extension = property(_get_default_ext, _set_default_ext,
                                 doc="set extension of the requests (default is json). Can be 'json' or 'xml'")

    def search(self, q, page=0, datasource=None, organism=None, type=None):
        """Text search in PathwayCommons using Lucene query syntax

        Some of the parameters are BioPAX properties, others are composite relationships.
        All index fields are (case-sensitive): comment, ecnumber, keyword, name, pathway, term, xrefdb, xrefid, dataSource, and organism.
        The pathway field maps to all participants of pathways that contain the keyword(s) in any of its text fields.
        Finally, keyword is a transitive aggregate field that includes all searchable keywords of that element and its child elements.
        All searches can also be filtered by data source and organism.
        It is also possible to restrict the domain class using the 'type' parameter.
        This query can be used standalone or to retrieve starting points for graph searches.

        :param str q: requires a keyword, name, external identifier, or a Lucene query string.
        :param int page: (N>=0, default is 0), search result page number. A negative value is not sent to the service (a warning is logged instead).
        :param str datasource: filter by data source (use names or URIs of pathway data sources or of any existing Provenance object). If multiple data source values are specified, a union of hits from specified sources is returned. datasource=[reactome,pid] returns hits associated with Reactome or PID.
        :param str organism: The organism can be specified either by official name, e.g. "homo sapiens" or by NCBI taxonomy id, e.g. "9606". Similar to data sources, if multiple organisms are declared a union of all hits from specified organisms is returned. For example organism=[9606, 10016] returns results for both human and mice.
        :param str type: BioPAX class filter
        :return: search response (converted via easyXML if the default extension is "xml").
        """
        if self.default_extension == "xml":
            url = "search.xml?q=%s" % q
        elif self.default_extension == "json":
            url = "search.json?q=%s" % q

        params = {}
        if page >= 0:
            params['page'] = page
        else:
            util.logWarning("WARNING: PathwayCommons search result page should be >=0")

        # optional filters are only sent when they were provided
        if datasource:
            params['datasource'] = datasource
        if type:
            params['type'] = type
        if organism:
            params['organism'] = organism

        res = self.http_get(url, frmt=self.default_extension, params=params)
        if self.default_extension == "xml":
            res = self.easyXML(res)
        return res

    def get(self, uri, frmt="BIOPAX"):
        """Retrieves full pathway information for a set of elements

        Elements can be for example pathway, interaction or physical entity given the RDF IDs.
        Get commands only retrieve the BioPAX elements that are directly mapped to the ID.
        Use the :meth:`traverse` query to traverse BioPAX graph and obtain child/owner elements.

        :param uri: valid/existing BioPAX element's URI (RDF ID; for utility classes that were "normalized", such as entity references and controlled vocabularies, it is usually an Identifiers.org URL). Multiple IDs can be provided using a list, e.g. uri=['http://identifiers.org/uniprot/Q06609', 'http://identifiers.org/uniprot/Q549Z0']. See also about MIRIAM and Identifiers.org.
        :type uri: str or list of str
        :param str frmt: output format (default "BIOPAX"; any other value is passed to the service as the "format" parameter)
        :return: a complete BioPAX representation for the record pointed to by the given URI is returned. Other output formats are produced by converting the BioPAX record on demand and can be specified by the optional format parameter. Please be advised that with some output formats it might return "no result found" error if the conversion is not applicable for the BioPAX result. For example, BINARY_SIF output usually works if there are some interactions, complexes, or pathways in the retrieved set and not only physical entities.
        :raises TypeError: if uri is neither a str nor a list.
        """
        # build the query string: one "uri" parameter per given URI
        if isinstance(uri, str):
            url = "get?uri=" + uri
        elif isinstance(uri, list):
            url = "get?uri=" + uri[0]
            url += "".join("&uri=" + u for u in uri[1:])
        else:
            # fail with a clear message instead of an UnboundLocalError further down
            raise TypeError("uri must be a str or a list of str, got %s" % __builtins_type_name(uri))

        if frmt != "BIOPAX":
            url += "&format=%s" % frmt

        res = self.http_get(url)
        return res


def __builtins_type_name(obj):
    """Returns the class name of obj (helper for error messages; needed because ``type`` is shadowed by a parameter name in this class's API)."""
    return obj.__class__.__name__
class KnowledgeBaseFactory():  # singleton class
    """Singleton class. Python code encapsulates it in a way that is not shown in Sphinx, so have a look at the descriptions in the source code.

    Creates knowledge bases based on the provided name and creates all corresponding objects, e.g. web service endpoints.
    Every knowledge base implementation must be registered here, otherwise it will not be accessible.
    """

    class __KnowledgeBaseFactory():
        def createKnowledgeBase(self, knowledgebase):
            """Creates knowledge base based on a given name.
            Logs an error and terminates the program (exit status 1) if the name is not registered.

            :param knowledgebase: name of the knowledge base to be created.
            :type knowledgebase: str
            :return: knowledge base object.
            :rtype: :class:`KnowledgeBase` or inheriting classes
            """
            # registry of all available knowledge bases; lambdas keep the class lookup lazy, so
            # only the requested knowledge base (and its dependencies) is resolved and created
            creators = {
                "DisGeNET": lambda: Disgenet(),
                # KEGG additionally needs a pathway mapper
                "KEGG": lambda: Kegg(KEGGPathwayParser()),
                "Enrichr": lambda: Enrichr(),
                "OpenTargets": lambda: OpenTargets(),
                "PathwayCommons": lambda: Pathwaycommons(),
                "Biomart": lambda: BioMART(),
                "gConvert": lambda: Gconvert(),
            }

            creator = creators.get(knowledgebase)
            if creator is None:
                util.logError("ERROR: The listed knowledge base is not available. See the documentation for available knowledge bases.")
                # sys.exit instead of the interactive-only exit() helper; non-zero status signals the failure
                sys.exit(1)
            return creator()

    # the one shared instance that does the actual work
    instance = None

    def __init__(self):
        if not KnowledgeBaseFactory.instance:
            KnowledgeBaseFactory.instance = KnowledgeBaseFactory.__KnowledgeBaseFactory()

    def __getattr__(self, name):
        # only called for attributes not found on the wrapper itself; forward them to the shared instance
        return getattr(self.instance, name)
############################### KNOWLEDGE BASES ###############################
class KnowledgeBase:
    """Base class of all knowledge base implementations.

    A new knowledge base must inherit from this class and implement :meth:`KnowledgeBase.getRelevantGenes()`, :meth:`KnowledgeBase.getGeneScores()`, and :meth:`KnowledgeBase.getRelevantPathways()`.
    On creation, the output directory given in the configuration (key "outputDir") is created.

    :param name: name of the knowledge base
    :type name: str
    :param config: configuration parameter of the knowledge base as specified in the config file.
    :type config: dict
    :param webservice: web service querying object
    :type webservice: :class:`bioservices.REST` or inheriting classes.
    :param hasGeneInformation: true if the knowledge base provides gene association information, false otherwise
    :type hasGeneInformation: bool
    :param hasPathwayInformation: true if the knowledge base also provides pathway information, false otherwise
    :type hasPathwayInformation: bool
    """

    def __init__(self, name, kb_config, webservice, geneInfo, pathwayInfo):
        # capability flags
        self.hasPathwayInformation = pathwayInfo
        self.hasGeneInformation = geneInfo
        # identity and query object
        self.webservice = webservice
        self.name = name
        # configuration; its output directory is set up right away
        self.config = kb_config
        util.createDirectory(self.config["outputDir"])
        super().__init__()

    @abstractmethod
    def getRelevantGenes(self, labels):
        """Abstract. Retrieves the genes associated to the given labels, e.g. disease names.

        :param labels: list of labels for which to retrieve the genes.
        :type labels: list of str
        :return: list of associated genes.
        :rtype: list of str
        """

    @abstractmethod
    def getGeneScores(self, labels):
        """Abstract. Retrieves genes together with their association scores for the given disease names.

        :param labels: list of disease names for which to get gene-disease-association scores.
        :type labels: list of str
        :return: DataFrame of genes and their association scores.
        :rtype: :class:`pandas.DataFrame`
        """

    @abstractmethod
    def getRelevantPathways(self, labels):
        """Abstract. Retrieves the pathways related to the given labels, e.g. disease names.

        :param labels: list of labels for which to find related pathways.
        :type labels: list of str
        :return: dict of pathway names and pathway representations.
        :rtype: dict with :class:`pypath.Network` as values
        """

    def getName(self):
        """Name of this knowledge base.

        :return: knowledge base name.
        :rtype: str
        """
        kb_name = self.name
        return kb_name

    def hasPathways(self):
        """Tells whether this knowledge base provides pathway information, i.e. whether :meth:`KnowledgeBase.getRelevantPathways()` is implemented.

        :return: true if knowledge base provides pathway information, false otherwise.
        :rtype: bool
        """
        provides_pathways = self.hasPathwayInformation
        return provides_pathways

    def hasGenes(self):
        """Tells whether this knowledge base provides gene information, i.e. whether :meth:`KnowledgeBase.getRelevantGenes()` and :meth:`KnowledgeBase.getGeneScores()` are implemented.

        :return: true if knowledge base provides gene information, false otherwise.
        :rtype: bool
        """
        provides_genes = self.hasGeneInformation
        return provides_genes
class Enrichr(KnowledgeBase):
    """Special knowledge base not intended to be used by feature selection approaches.
    Instead, it is used for evaluation purposes to annotate and enrich rankings.

    :param name: name of the knowledge base
    :type name: str
    :param config: configuration parameter of the knowledge base as specified in the config file.
    :type config: dict
    :param webservice: web service querying object
    :type webservice: :class:`bioservices.REST` or inheriting classes.
    :param hasGeneInformation: true if the knowledge base provides gene association information, false otherwise
    :type hasGeneInformation: bool
    :param hasPathwayInformation: true if the knowledge base also provides pathway information, false otherwise
    :type hasPathwayInformation: bool
    """

    def __init__(self):
        super().__init__("Enrichr", util.getConfig("Enrichr"), ENRICHR(), False, False)

    def downloadEnrichedTerms(self, userIdList, filePrefix):
        """Downloads enriched terms from a former query into the file ``<filePrefix>_enrichedTerms.txt`` (tab-separated).
        Keeps only terms with an adjusted p-value < 0.05, then sorts by combined score in descending order.

        :param userIdList: userIdList to retrieve enrichment/annotation results from the original query.
        :type userIdList: str
        :param filePrefix: prefix to use in filename.
        :type filePrefix: str
        """
        geneSetLibrary = self.config["geneSetLibrary"]
        # NOTE(review): this timestamped directory is created, but the result file below is written
        # relative to filePrefix, not into it - kept as is, confirm whether that is intended.
        outputDir = self.config["outputDir"] + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + "/"
        util.createDirectory(outputDir)
        outputFile = filePrefix + "_enrichedTerms.txt"

        params = {"userListId": str(userIdList), "backgroundType": geneSetLibrary}
        response = self.webservice.enrich(params)

        # first, keep only enriched terms with an adjusted p-value < 0.05
        # second, order by combined score in descending order
        # see also this best practices recommendation: https://www.researchgate.net/post/Enrichr_what_value_of_combined_score_is_significant
        final_terms = response[(response["Adjusted p-value"] < 0.05)]
        final_terms = final_terms.sort_values(by="Combined score", ascending=False)
        final_terms.to_csv(outputFile, sep="\t", index=False)

    def getRelevantGenes(self, labels):
        """Is not implemented for EnrichR.

        :param labels: list of labels for which to retrieve the genes.
        :type labels: list of str
        :raises NotImplementedError: always, as this knowledge base is not intended to be used for such analyses.
        """
        raise NotImplementedError("EnrichR is currently only used for subsequent annotation and not intended to be used during analysis.")

    def getGeneScores(self, labels):
        """Is not implemented for EnrichR.

        :param labels: list of labels for which to get association scores.
        :type labels: list of str
        :raises NotImplementedError: always, as this knowledge base is not intended to be used for such analyses.
        """
        raise NotImplementedError("EnrichR is currently only used for subsequent annotation and not intended to be used during analysis.")

    def getRelevantPathways(self, labels):
        """Is not implemented for EnrichR.

        :param labels: list of labels for which to find related pathways.
        :type labels: list of str
        :raises NotImplementedError: always, as this knowledge base is not intended to be used for such analyses.
        """
        raise NotImplementedError("EnrichR is currently only used for subsequent annotation and not intended to be used during analysis.")

    def enrichGeneset(self, geneList, filePrefix):
        """Sends a list of identifiers (here, genes) to EnrichR web service and stores all term enrichments in a file.

        :param geneList: list of gene names for which to retrieve enrichments.
        :type geneList: list of str
        :param filePrefix: prefix to use in file name (to store enrichments).
        :type filePrefix: str
        """
        # submit list for analysis
        data = self.webservice.addlist(geneList)
        userListID = data["userListId"]

        # download files of enriched terms
        self.downloadEnrichedTerms(userListID, filePrefix)

    def annotateGene(self, gene):
        """Annotates a gene with terms of the configured gene set library.

        :param gene: gene name.
        :type gene: str
        :return: list of all annotations to the provided gene (empty if none were found).
        :rtype: list of str
        """
        params = {"json": "true", "setup": "true", "gene": gene}
        response = self.webservice.genemap(params)

        try:
            annotations = response["gene"]
            geneSetLibrary = self.config["geneSetLibrary"]
            annotation = annotations[geneSetLibrary]
        except (KeyError, TypeError):
            # no annotation was found: a key is missing, or the response is not subscriptable
            # (e.g. None or a status code, which other callers in this module check for)
            annotation = []

        return annotation

    def annotateGenes(self, geneList, filePrefix):
        """Annotates a list of genes with relevant terms and writes them to ``<filePrefix>_annotatedGenes.txt``.
        Genes without any annotation are part of the returned dict, but not written to the file.

        :param geneList: list of gene names to annotate.
        :type geneList: list of str
        :param filePrefix: prefix to use when storing results in a file.
        :type filePrefix: str
        :return: dict of gene names and lists of their annotations.
        :rtype: dict
        """
        # annotate each gene individually
        annotations = {}
        for gene in geneList:
            annotations[gene] = self.annotateGene(gene)

        # NOTE(review): this timestamped directory is created, but the result file below is written
        # relative to filePrefix, not into it - kept as is, confirm whether that is intended.
        outputDir = self.config["outputDir"] + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + "/"
        util.createDirectory(outputDir)
        outputFile = filePrefix + "_annotatedGenes.txt"

        # write gene annotations to file
        with open(outputFile, 'w') as f:
            f.write("attributeName\tannotation\n")
            for gene, annotation in annotations.items():
                anno = ",".join(annotation)
                if anno != "":
                    f.write(str(gene) + "\t" + anno + "\n")

        return annotations
class BioMART():
    """Maps identifiers or data sets with identifiers to the desired format by using BiomaRt.
    Wrapper class that internally invokes BiomaRt's R code.
    Very unstable, so currently not used. However, it can be exchanged in :meth:`benchutils.retrieveMappings()` function.
    """

    # shared message of the three analysis methods this class does not support
    _NOT_SUPPORTED = "Biomart is currently only used for identifier mapping and not intended to be used during analysis."

    def mapItems(self, itemList, originalFormat, desiredFormat):
        """Map a list of identifiers to the desired format.
        Writes the identifiers to an input file, invokes the external R script "IdentifierMapping.R" and reads the mapping from the file it produced.

        :param itemList: list of identifiers to be mapped
        :type itemList: list of str
        :param originalFormat: original identifier format.
        :type originalFormat: str
        :param desiredFormat: format to which to map identifiers.
        :type desiredFormat: str
        :return: mapping data frame of identifiers (with original and desired format)
        :rtype: :class:`pandas.DataFrame`
        """
        # every run gets its own timestamped working directory with an input/ and an output/ folder
        runDir = util.getConfigValue("Biomart", "outputDir") + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + "/"
        inputFile = runDir + "input/" + "mapping.csv"
        resultFile = runDir + "output/" + "mapping.csv"
        for directory in (runDir, runDir + "input/", runDir + "output/"):
            util.createDirectory(directory)

        # put itemList into input file, one identifier per line
        with open(inputFile, "w") as handle:
            handle.writelines("%s\n" % item for item in itemList)

        rConfig = util.getConfig("R")
        util.runRCommand(rConfig, "IdentifierMapping.R", [originalFormat, desiredFormat, inputFile, resultFile])

        return pd.read_csv(resultFile)

    def getRelevantGenes(self, labels):
        """Is not implemented for BiomaRt.

        :param labels: list of labels for which to retrieve the genes.
        :type labels: list of str
        :raises NotImplementedError: always, as this knowledge base is not intended to be used for such analyses.
        """
        raise NotImplementedError(self._NOT_SUPPORTED)

    def getGeneScores(self, labels):
        """Is not implemented for BiomaRt.

        :param labels: list of labels for which to get association scores.
        :type labels: list of str
        :raises NotImplementedError: always, as this knowledge base is not intended to be used for such analyses.
        """
        raise NotImplementedError(self._NOT_SUPPORTED)

    def getRelevantPathways(self, labels):
        """Is not implemented for BiomaRt.

        :param labels: list of labels for which to find related pathways.
        :type labels: list of str
        :raises NotImplementedError: always, as this knowledge base is not intended to be used for such analyses.
        """
        raise NotImplementedError(self._NOT_SUPPORTED)
class Gconvert(KnowledgeBase):
    """Maps identifiers or data sets containing identifiers to the desired format by using the g:Convert web service.

    :param name: name of the knowledge base
    :type name: str
    :param config: configuration parameter of the knowledge base as specified in the config file.
    :type config: dict
    :param webservice: web service querying object.
    :type webservice: :class:`bioservices.REST` or inheriting classes
    :param hasGeneInformation: true if the knowledge base provides gene association information, false otherwise
    :type hasGeneInformation: bool
    :param hasPathwayInformation: true if the knowledge base also provides pathway information, false otherwise
    :type hasPathwayInformation: bool
    """

    # shared message for all analysis methods that g:Convert does not support
    _ANALYSIS_NOT_SUPPORTED = "g:Convert is currently only used for identifier mapping and not intended to be used during analysis."

    def __init__(self):
        config = util.getConfig("gConvert")
        super().__init__("gConvert", config, GCONVERT(), False, False)

    def mapItems(self, itemList, originalFormat, desiredFormat):
        """Map a list of identifiers to the desired format.

        :param itemList: list of identifiers to be mapped.
        :type itemList: list of str
        :param originalFormat: current format of the identifiers.
        :type originalFormat: str
        :param desiredFormat: desired format to which to map identifiers.
        :type desiredFormat: str
        :return: DataFrame table containing mappings of the identifiers from original to desired format.
        :rtype: :class:`pandas.DataFrame`
        """
        mapping = self.webservice.query(itemList, originalFormat, desiredFormat)
        # rows whose target identifier is the literal string 'nan' could not be mapped
        isUnmapped = mapping[desiredFormat] == 'nan'
        unmappedIndex = mapping.loc[isUnmapped].index
        return mapping.drop(unmappedIndex)

    def getRelevantGenes(self, labels):
        """Is not implemented for g:Convert.

        :param labels: list of gene names to be mapped
        :type labels: list of str
        :raises NotImplementedError: always, as this knowledge base is not intended to be used for such analyses.
        """
        raise NotImplementedError(self._ANALYSIS_NOT_SUPPORTED)

    def getGeneScores(self, labels):
        """Is not implemented for g:Convert.

        :param labels: list of gene names to be mapped
        :type labels: list of str
        :raises NotImplementedError: always, as this knowledge base is not intended to be used for such analyses.
        """
        raise NotImplementedError(self._ANALYSIS_NOT_SUPPORTED)

    def getRelevantPathways(self, labels):
        """Is not implemented for g:Convert.

        :param labels: list of labels for which to find related pathways.
        :type labels: list of str
        :raises NotImplementedError: always, as this knowledge base is not intended to be used for such analyses.
        """
        raise NotImplementedError(self._ANALYSIS_NOT_SUPPORTED)
class OpenTargets(KnowledgeBase):
    """Knowledge base implementation of OpenTargets.
    Uses the OpenTargetsClient Python implementation provided by OpenTargets to query the web service API.

    :param name: name of the knowledge base
    :type name: str
    :param config: configuration parameter of the knowledge base as specified in the config file.
    :type config: dict
    :param webservice: web service querying implementation.
    :type webservice: :class:`opentargets.OpenTargetsClient`
    :param hasGeneInformation: true if the knowledge base provides gene association information, false otherwise
    :type hasGeneInformation: bool
    :param hasPathwayInformation: true if the knowledge base also provides pathway information, false otherwise
    :type hasPathwayInformation: bool
    """

    def __init__(self):
        # silence all logging while the client is created; always restore it, even if creation fails
        logging.disable(logging.CRITICAL)
        try:
            super().__init__("OpenTargets", util.getConfig("OpenTargets"), OpenTargetsClient(), True, False)
        finally:
            logging.disable(logging.NOTSET)

    def getAssociations(self, labels):
        """Get all relevant information for a given set of labels, sorted by their association scores in descending order.
        The associations of all labels are merged; if a gene is returned for several labels, its highest score is kept.
        Writes web service results into an intermediate file and maps the identifiers to have the correct format for further processing.

        :param labels: list of labels, e.g. disease names.
        :type labels: list of str
        :return: DataFrame containing all related genes and their association scores.
        :rtype: :class:`pandas.DataFrame`
        """
        # collected across ALL labels, so results of earlier labels are not overwritten by later ones
        geneIDs = []
        assocScores = []
        for term in labels:
            try:
                a_for_disease = self.webservice.get_associations_for_disease(term)
            except Exception as err:
                # best effort: a failing query for one label must not abort the remaining labels
                util.logWarning("WARNING: " + repr(err))
                util.logWarning(WARNING_NOTHING_RETRIEVED_MESSAGE.format(kb="OpenTargets", lbl=term))
                continue

            # an int return value is treated as an HTTP status code instead of a result
            if isinstance(a_for_disease, int):
                if a_for_disease >= 500:
                    util.logWarning("WARNING: Error code " + str(a_for_disease) + " returned from OpenTargets for " + term)
                else:
                    util.logDebug("DEBUG: Code " + str(a_for_disease) + " returned from OpenTargets for " + term)
                util.logWarning(WARNING_NOTHING_RETRIEVED_MESSAGE.format(kb="OpenTargets", lbl=term))
                continue

            for association in a_for_disease:
                # the ID is currently attached by an EFO ID - remove it
                geneIDs.append(association["id"].split("-")[0])
                assocScores.append(association['association_score']['overall'])

        associated_genes = pd.DataFrame({"gene_symbol": geneIDs, "score": assocScores})

        # write results into intermediate file
        outputDir = util.getConfigValue("OpenTargets", "outputDir") + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + "/"
        util.createDirectory(outputDir)
        outputFile = "genes.csv"
        associated_genes.to_csv(outputDir + outputFile, sep=",", index=False)

        # check if gene ID format from expression data set matches OpenTargets' Ensembl Gene ID format
        desiredIDFormat = util.config["Dataset"]["finalGeneIDFormat"]
        if desiredIDFormat != "ENSG":
            mappedDir = outputDir + "mapped_" + "ENSG_" + desiredIDFormat + "/"
            util.createDirectory(mappedDir)
            associated_genes = util.mapRanking(associated_genes, "ENSG", desiredIDFormat, mappedDir + outputFile)

        # order by score first, so that dropping duplicates keeps the best score of every gene
        associated_genes = associated_genes.sort_values('score', ascending=False)
        associated_genes = associated_genes.drop_duplicates(subset="gene_symbol")
        return associated_genes

    def getRelevantGenes(self, labels):
        """Get all genes that are somehow associated to the given labels, e.g. disease names.

        :param labels: list of identifiers, e.g. disease names, for which to find associated genes.
        :type labels: list of str
        :return: list of associated genes.
        :rtype: list of str
        """
        genes = self.getAssociations(labels)
        return list(genes.loc[:, "gene_symbol"].unique())

    def getGeneScores(self, labels):
        """Get all genes and their association scores that are related to the given labels, e.g. disease names.

        :param labels: list of identifiers, e.g. disease names, for which to find associated genes.
        :type labels: list of str
        :return: DataFrame of associated genes and their association scores, in descending order.
        :rtype: :class:`pandas.DataFrame`
        """
        return self.getAssociations(labels)

    def getRelevantPathways(self, labels):
        """As OpenTargets currently does not provide pathway information, this feature is not implemented for OpenTargets.

        :param labels: list of labels for which to find related pathways.
        :type labels: list of str
        :raises NotImplementedError: always, as this knowledge base is not intended to be used for such analyses.
        """
        raise NotImplementedError("OpenTargets cannot return pathways at the moment (although they also provide references to pathways, but this feature is not supported in the API.")
class Kegg(KnowledgeBase):
    """Knowledge base implementation for KEGG.
    Uses the KEGG web service implementation provided by bioservices.
    Requires an instance of :class:`KEGGPathwayParser` to be able to map retrieved pathways into the internal pathway format.

    :param name: name of the knowledge base
    :type name: str
    :param config: configuration parameter of the knowledge base as specified in the config file.
    :type config: dict
    :param webservice: web service querying implementation.
    :type webservice: :class:`bioservices.KEGG`
    :param hasGeneInformation: true if the knowledge base provides gene association information, false otherwise
    :type hasGeneInformation: bool
    :param hasPathwayInformation: true if the knowledge base also provides pathway information, false otherwise
    :type hasPathwayInformation: bool
    :param pathwayparser: pathway mapping class that transforms KEGG pathways in KGML format into the internally used pathway format.
    :type pathwayparser: :class:`KEGGPathwayParser`
    """

    def __init__(self, pathwayparser):
        self.pathwayparser = pathwayparser
        # temporarily disable logging to only show errors, otherwise our output will get spammed by bioservice logs
        # (they reset the logging level internally when creating a new instance); always re-enable it afterwards
        logging.disable(logging.CRITICAL)
        try:
            super().__init__("KEGG", util.getConfig("KEGG"), KEGG(), True, True)
            logging.getLogger("bioservices:KEGG").setLevel(logging.ERROR)
            logging.getLogger("bioservices:keggparser").setLevel(logging.ERROR)
        finally:
            logging.disable(logging.NOTSET)

    def _findPathwayIDs(self, label):
        """Query KEGG for pathways matching a label and return their identifiers with the "hsa" organism prefix.

        :param label: label, e.g. a disease name, for which to find pathways.
        :type label: str
        :return: list of pathway identifiers; empty if nothing could be retrieved.
        :rtype: list of str
        """
        with suppress_stdout():
            pathwayListString = self.webservice.find("pathway", "\"" + label + "\"")
        # nothing usable came back: None, an int (status code) or a lone line break
        if pathwayListString is None or isinstance(pathwayListString, int) or pathwayListString == "\n":
            util.logWarning(WARNING_NOTHING_RETRIEVED_MESSAGE.format(kb="KEGG", lbl=label))
            return []

        pathwayIDs = []
        # pathways are returned in the form path:pathwayid\tdescription\n
        for pathway in pathwayListString.split("\n")[:-1]:
            # strip off the "path:" prefix (5 characters)
            pathwayName = pathway.split("\t")[0][5:]
            # replace "map" by "hsa" (because we want the reference organisms for homo sapiens)
            pathwayIDs.append(pathwayName.replace("map", "hsa"))
        return pathwayIDs

    def getPathwayNames(self, labels):
        """Retrieve all pathway names related to the given labels, e.g. disease names.
        At most ``maxNumPathways`` (config) pathway names are returned per label.

        :param labels: list labels, e.g. disease names, for which to find pathways.
        :type labels: list of str
        :return: list of pathway names.
        :rtype: list of str
        """
        maxNumPathways = int(self.config["maxNumPathways"])
        pathways = []
        for label in labels:
            pathways.extend(self._findPathwayIDs(label)[:maxNumPathways])
        return pathways

    def getRelevantGenes(self, labels):
        """Get all genes that are related to a set of labels, e.g. disease names.
        For KEGG, this means we retrieve all genes that are contained in pathways associated to these labels.

        :param labels: list of identifiers, e.g. disease names, for which to find associated genes.
        :type labels: list of str
        :return: set of associated genes.
        :rtype: set of str
        """
        pathways = self.getRelevantPathways(labels)
        genes = set()
        for pathway in pathways.values():
            for node in pathway.nodes.values():
                genes.add(node.label)

        # write genes to file
        outputDir = util.getConfigValue("KEGG", "outputDir") + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + "/"
        util.createDirectory(outputDir)
        filename = "queryResults.txt"
        with open(outputDir + filename, 'w') as f:
            f.write("attributeName,score\n")
            # every gene gets a complete "gene,0.0" line, including the last one
            f.writelines(gene + ",0.0\n" for gene in genes)
        return genes

    def getGeneScores(self, labels):
        """Get association scores for all genes that are related to the provided labels, e.g. disease names.
        For KEGG, the association score for a gene is the sum of its degree percentile rank for every pathway, normalized by the number of pathways that contain the gene.
        This favors hub genes/genes having many interactions with other genes.

        :param labels: list of identifiers, e.g. disease names, for which to find associated genes.
        :type labels: list of str
        :return: DataFrame of associated genes and their association scores, in descending order.
        :rtype: :class:`pandas.DataFrame`
        """
        pathways = self.getRelevantPathways(labels)
        genePathwayCounts = {}  # number of pathways a gene participates in
        percentileSums = {}  # sum of a gene's degree percentile ranks over all pathways
        for pathway in pathways.values():
            interactions = pathway.interactions_by_nodes
            # reduce the interactions to the interaction count (degree) per gene
            degrees = {}
            for node in interactions:
                genePathwayCounts[node.label] = genePathwayCounts.get(node.label, 0) + 1
                degrees[node.label] = len(interactions[node])
            # a pathway without interactions contributes nothing to the scores
            if not degrees:
                continue
            percentiles = pd.Series(degrees, name="degree").rank(method='min', pct=True)
            # Series.items() replaces iteritems(), which no longer exists in pandas 2.x
            for feature, perc_score in percentiles.items():
                percentileSums[feature] = percentileSums.get(feature, 0) + perc_score

        genes = list(percentileSums)
        scores = [percentileSums[geneID] / genePathwayCounts[geneID] for geneID in genes]
        geneScores = pd.DataFrame({"gene_symbol": genes, "score": scores})
        geneScores.sort_values("score", inplace=True, ascending=False)

        # write genes to file
        outputDir = util.getConfigValue("KEGG", "outputDir") + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + "/"
        util.createDirectory(outputDir)
        filename = "queryResults.txt"
        geneScores.to_csv(outputDir + filename, sep=",", index=False)
        return geneScores

    def getRelevantPathways(self, labels):
        """Get all pathways related to a set of labels, e.g. disease names.
        Uses the :class:`KEGGPathwayParser` to map KEGG's pathways from KGML to :class:`pypath.Network`.
        At most ``maxNumPathways`` (config) non-empty pathways are loaded per label.

        :param labels: list of labels, e.g. disease names, for which to find pathways.
        :type labels: list of str
        :return: dict of pathway names and their internal representation as :class:`pypath.Network`.
        :rtype: dict
        """
        maxNumPathways = int(self.config["maxNumPathways"])
        pathway_graphs = {}
        for label in labels:
            count = 0  # number of non-empty pathways loaded for this label
            for pathwayName in self._findPathwayIDs(label):
                # stop before querying the web service again once the limit is reached
                if count >= maxNumPathways:
                    break
                pathway_resource = self.webservice.get(pathwayName, "kgml")
                # only read the pathway if it contains meaningful information
                if len(str(pathway_resource)) <= 10:
                    continue
                xmlPathway = self.webservice.parse_kgml_pathway(pathwayName, pathway_resource)
                parsed_pathway = self.pathwayparser.parsePathway(xmlPathway, pathwayName)
                # only add if the pathway is not empty
                if parsed_pathway.vcount > 0:
                    pathway_graphs[pathwayName] = parsed_pathway
                    count += 1
        return pathway_graphs
class Disgenet(KnowledgeBase):
    """Knowledge base implementation for DisGeNET.

    :param name: name of the knowledge base
    :type name: str
    :param config: configuration parameter of the knowledge base as specified in the config file.
    :type config: dict
    :param webservice: web service querying implementation.
    :type webservice: :class:`DISGENET`
    :param hasGeneInformation: true if the knowledge base provides gene association information, false otherwise
    :type hasGeneInformation: bool
    :param hasPathwayInformation: true if the knowledge base also provides pathway information, false otherwise
    :type hasPathwayInformation: bool
    """

    def __init__(self):
        super().__init__("DisGeNET", util.getConfig("DisGeNET"), DISGENET(), True, False)

    def getRelevantGenes(self, labels):
        """Get all genes that are related to a set of labels, e.g. disease names.

        :param labels: list of identifiers, e.g. disease names, for which to find associated genes.
        :type labels: list of str
        :return: list of associated genes; empty if the web service returned an error code.
        :rtype: list of str
        """
        assocs = self.webservice.query(labels)
        # an int return value is treated as an HTTP status code instead of a result
        if isinstance(assocs, int):
            if assocs >= 500:
                util.logWarning(
                    "WARNING: Error code " + str(assocs) + " returned from DisGeNET for " + ", ".join(labels))
            else:
                util.logDebug(
                    "DEBUG: Code " + str(assocs) + " returned from DisGeNET for " + ", ".join(labels))
            util.logWarning(WARNING_NOTHING_RETRIEVED_MESSAGE.format(kb="DisGeNET", lbl=", ".join(labels)))
            # keep the return type consistent with the success case (a list)
            return []

        # extract only the genes from the results
        # for now, just merge all gene sets from the query into one (no matter how many genes
        # were associated to a particular disease (=no interleaving)
        return list(set(assocs.loc[:, "gene_symbol"]))

    def getGeneScores(self, labels):
        """Get association scores for all genes that are related to the provided labels, e.g. disease names.
        DisGeNET provides a couple of association scores to its genes (https://www.disgenet.org/dbinfo).
        Which score to use can be defined by the user in the config file.

        :param labels: list of identifiers, e.g. disease names, for which to find associated genes.
        :type labels: list of str
        :return: DataFrame of associated genes and their association scores, in descending order.
        :rtype: :class:`pandas.DataFrame`
        """
        assocs = self.webservice.query(labels)
        # check if an error code was returned
        if isinstance(assocs, int):
            util.logWarning("WARNING: " + str(assocs) + " RETURNED FOR LABELS: " + ", ".join(labels))
            return pd.DataFrame(columns=["gene_symbol", "score"])
        if assocs.empty:
            return pd.DataFrame(columns=["gene_symbol", "score"])

        assocs.columns = ["gene_symbol", "score"]
        # sort by assoc score
        assocs = assocs.sort_values("score", ascending=False)
        # for duplicate gene entries, select entries with highest association score
        # (drop_duplicates returns a new frame, so its result has to be kept)
        assocs = assocs.drop_duplicates("gene_symbol", keep="first")
        return assocs

    def getRelevantPathways(self, labels):
        """As DisGeNET currently does not provide pathway information, this feature is not implemented.

        :param labels: list of labels for which to find related pathways.
        :type labels: list of str
        :raises NotImplementedError: always, as this knowledge base is not intended to be used for such analyses.
        """
        raise NotImplementedError("DisGeNET is not a pathway database. Querying for pathways is not possible.")
class Pathwaycommons(KnowledgeBase):
    """Knowledge base implementation for PathwayCommons.

    :param name: name of the knowledge base
    :type name: str
    :param config: configuration parameter of the knowledge base as specified in the config file.
    :type config: dict
    :param webservice: web service querying implementation.
    :type webservice: :class:`PATHWAYCOMMONSWS`
    :param hasGeneInformation: true if the knowledge base provides gene association information, false otherwise
    :type hasGeneInformation: bool
    :param hasPathwayInformation: true if the knowledge base also provides pathway information, false otherwise
    :type hasPathwayInformation: bool
    """

    def __init__(self):
        super().__init__("PathwayCommons", util.getConfig("PathwayCommons"), PATHWAYCOMMONSWS(), True, True)

    def getGeneScores(self, labels):
        """Get association scores for all genes that are related to the provided labels, e.g. disease names.
        For PathwayCommons, the association score for a gene is the sum of its degree percentile rank for every pathway, normalized by the number of pathways that contain the gene.
        This favors hub genes/genes having many interactions with other genes.

        :param labels: list of identifiers, e.g. disease names, for which to find associated genes.
        :type labels: list of str
        :return: DataFrame of associated genes and their association scores, in descending order.
        :rtype: :class:`pandas.DataFrame`
        """
        pathways = self.getRelevantPathways(labels)
        genePathwayCounts = {}  # stores the number of pathways a gene participates in
        percentileSums = {}  # sum of a gene's degree percentile ranks over all pathways
        for pathway in pathways.values():
            interactions = pathway.interactions_by_nodes
            # reduce the interactions to the interaction count (degree) per gene
            degrees = {}
            for node in interactions:
                genePathwayCounts[node.label] = genePathwayCounts.get(node.label, 0) + 1
                degrees[node.label] = len(interactions[node])
            # a pathway without interactions contributes nothing to the scores
            if not degrees:
                continue
            percentiles = pd.Series(degrees, name="degree").rank(method='min', pct=True)
            # Series.items() replaces iteritems(), which no longer exists in pandas 2.x
            for feature, perc_score in percentiles.items():
                percentileSums[feature] = percentileSums.get(feature, 0) + perc_score

        genes = list(percentileSums)
        # normalize the sum of percentile scores by the number of pathways containing that gene
        scores = [percentileSums[geneID] / genePathwayCounts[geneID] for geneID in genes]
        geneScores = pd.DataFrame({"gene_symbol": genes, "score": scores})
        geneScores.sort_values("score", inplace=True, ascending=False)
        return geneScores

    def getRelevantGenes(self, labels):
        """Get all genes that are related to a set of labels, e.g. disease names.
        For PathwayCommons, this means we retrieve all genes that are contained in pathways associated to these labels.

        :param labels: list of identifiers, e.g. disease names, for which to find associated genes.
        :type labels: list of str
        :return: set of associated genes.
        :rtype: set of str
        """
        pathways = self.getRelevantPathways(labels)
        genes = set()
        for pathway in pathways.values():
            for node in pathway.nodes.values():
                genes.add(node.label)
        return genes

    def readPathway(self, pathway):
        """Reads a pathway in SIF format and yields its interactions, one per line, split at whitespace.

        :param pathway: web service response whose ``text`` attribute contains the SIF lines.
        :return: generator of interactions, each as a list of fields.
        """
        for interaction in pathway.text.split("\n"):
            yield interaction.split()

    def _isErrorResponse(self, response, term):
        """Log and report whether a web service response is unusable (None or an int status code).

        :param response: return value of a web service call.
        :param term: label that was queried (only used for logging).
        :type term: str
        :return: True if the response does not carry any results, False otherwise.
        :rtype: bool
        """
        if response is None:
            util.logWarning(WARNING_NOTHING_RETRIEVED_MESSAGE.format(kb="PathwayCommons", lbl=term))
            return True
        if isinstance(response, int):
            if response >= 500:
                util.logWarning("WARNING: Error code " + str(response) + " returned from PathwayCommons for " + term)
            else:
                util.logDebug("DEBUG: Code " + str(response) + " returned from PathwayCommons for " + term)
            util.logWarning(WARNING_NOTHING_RETRIEVED_MESSAGE.format(kb="PathwayCommons", lbl=term))
            return True
        return False

    def _collectPathwayIDs(self, labels):
        """Search PathwayCommons for all labels and collect the URIs of every pathway hit on every result page.

        :param labels: list of labels, e.g. disease names.
        :type labels: list of str
        :return: set of pathway URIs.
        :rtype: set of str
        """
        p_ids = set()
        for term in labels:
            firstResponse = self.webservice.search(term, organism="homo sapiens", type="pathway")
            if self._isErrorResponse(firstResponse, term):
                continue
            numHits = firstResponse["numHits"]
            maxHits = firstResponse["maxHitsPerPage"]
            # guard against a page size of 0 to avoid a division by zero
            pages = math.ceil(numHits / maxHits) if maxHits else 0
            if pages > 1:
                # results are spread over several pages: request every page explicitly
                responses = (self.webservice.search(term, page=i, organism="homo sapiens", type="pathway")
                             for i in range(pages))
            else:
                # everything fits on the page we already have - do not lose its hits
                responses = [firstResponse]
            for response in responses:
                if self._isErrorResponse(response, term):
                    continue
                for item in response.get("searchHit", []):
                    p_ids.add(item["uri"])
        return p_ids

    def getRelevantPathways(self, labels):
        """Get all pathways related to a set of labels, e.g. disease names as :class:`pypath.Network`.
        At most ``maxNumPathways`` (config) non-empty pathways are loaded.

        :param labels: list of labels, e.g. disease names, for which to find pathways.
        :type labels: list of str
        :return: dict of pathway names and their internal representation as :class:`pypath.Network`.
        :rtype: dict
        """
        maxNumPathways = int(self.config["maxNumPathways"])
        pathways = {}
        # sorted, so that the same pathways are selected on every run when the limit applies
        for pathwayID in sorted(self._collectPathwayIDs(labels)):
            # only load the pathway if the maxNumPathways count is not reached yet
            if len(pathways) >= maxNumPathways:
                break
            pathway_sif = self.webservice.get(pathwayID, frmt="SIF")
            if pathway_sif is None:
                util.logInfo(
                    "No SIF could be returned from PathwayCommons for pathway " + pathwayID + ". This is likely if that pathway does not contain genes as nodes. It would have been filtered either way.")
                continue
            # skip error codes and pathways without a SIF version
            if isinstance(pathway_sif, int) or pathway_sif.text == "":
                continue

            # silence logging while the network is built; always restore it, also on errors
            logging.disable(logging.CRITICAL)
            try:
                params = {"self": self, "pathway": pathway_sif}
                # create pypath pathway from SIF
                networkInput = input_formats.NetworkInput(
                    name=pathwayID,
                    input=Pathwaycommons.readPathway,
                    input_args=params,
                    separator='\t',
                    id_col_a=0,
                    id_col_b=2,
                    id_type_a='genesymbol',
                    id_type_b='genesymbol',
                    sign=(1, "+", "-"),
                )
                pathway = network.Network()
                pathway.load(networkInput)
            finally:
                logging.disable(logging.NOTSET)

            # only add pathway if it has nodes
            if pathway.vcount > 0:
                pathways[pathwayID] = pathway
        return pathways
############################### PATHWAY MAPPERS ###############################
class PathwayParser:
    """Super class that maps a pathway from its original format (provided by a knowledge base) to the internally used :class:`pypath.Network`.
    When having to map pathways from a knowledge base, implement a new class that inherits from this one and implements :meth:`PathwayParser.parsePathway()`.
    """

    @abstractmethod
    def parsePathway(self, pathway, pathwayID):
        """Abstract method.
        Parse a pathway to the internally used format of :class:`pypath.Network`.
        This base implementation does nothing and returns None; inheriting classes provide the actual parsing.

        :param pathway: pathway string to parse
        :type pathway: str
        :param pathwayID: name of the pathway
        :type pathwayID: str
        :return: pathway in the internally used format.
        :rtype: :class:`pypath.Network`
        """
        return None
class KEGGPathwayParser(PathwayParser):
    """Parse KEGG pathways, which are returned in KGML format.
    """

    def readInteractions(self, interactions, geneIds):
        """Parses interactions for a set of genes.
        Only protein-protein (PPrel) and gene expression (GErel) relations are considered;
        relations that reference entries without a known gene are skipped.

        :param interactions: interactions to parse
        :type interactions: list
        :param geneIds: mapping of KGML entry ids to the gene entities whose interactions to add
        :type geneIds: dict
        :return: generator of interactions, each as a list of fields (source, target, relation name).
        """
        for interaction in interactions:
            # filter pathway relations for protein-protein (PPrel) and gene expression (GErel) interactions
            if interaction["link"] not in ("PPrel", "GErel"):
                continue
            try:
                source = geneIds[interaction["entry1"]].label
                target = geneIds[interaction["entry2"]].label
                relation_name = interaction["name"]
                interactionString = "\t".join([source, target, relation_name])
            except (KeyError, AttributeError, TypeError):
                # incomplete relation or an entry that is not a gene: skip it
                continue
            # yield outside of the try block, so that closing the generator is never swallowed
            yield interactionString.split()

    def parsePathway(self, kgml_pathway, pathwayID):
        """Parse KEGG pathway to the internally used format of :class:`pypath.Network`.

        :param kgml_pathway: parsed KGML pathway providing "entries" and "relations".
        :type kgml_pathway: dict
        :param pathwayID: name of the pathway
        :type pathwayID: str
        :return: pathway in the internally used format.
        :rtype: :class:`pypath.Network`
        """
        # silence logging while the network is built; always restore it, also on errors
        logging.disable(logging.CRITICAL)
        try:
            # create gene IDs first
            geneIds = {}
            for entry in kgml_pathway["entries"]:
                if entry["type"] == "gene":
                    # select first of the gene names in the list to be the alias
                    # (leading/trailing dots are stripped, presumably left over from truncated name lists)
                    alias = entry["gene_names"].split(", ")[0].strip("...")
                    geneIds[entry["id"]] = entity.Entity(alias)

            params = {"self": self, "interactions": kgml_pathway["relations"], "geneIds": geneIds}
            # create pypath pathway from the interactions generated by readInteractions()
            networkInput = input_formats.NetworkInput(
                name=pathwayID,
                input=KEGGPathwayParser.readInteractions,
                input_args=params,
                separator='\t',
                id_col_a=0,
                id_col_b=1,
                id_type_a='genesymbol',
                id_type_b='genesymbol',
                sign=(2, "+", "-"),
            )
            pathway = network.Network()
            pathway.load(networkInput)
        finally:
            logging.disable(logging.NOTSET)
        return pathway