Module prmf.ensembl
Ensembl.org recommends the use of biomart to map HGNC, ENSG, ENST, ENSP Currently these are implemented in the R script dl_ensembl_map TODO add biomart python API functions here
View Source
"""
Ensembl.org recommends the use of biomart to map HGNC, ENSG, ENST, ENSP
Currently these are implemented in the R script dl_ensembl_map
TODO add biomart python API functions here
"""
import sys
import re
import os, os.path
import subprocess as sp
import glob
def download_mapping_tsv(outdir, release='latest'):
"""
Download an Ensembl identifier mapping file (HGNC <-> ENSP)
Parameters
----------
outdir: str
Directory to place downloaded file in; file will be named as it is on EBI's FTP server
release: int or str
either 'latest' or a specific release number such as '97'
Returns
-------
rv : str or None
the downloaded filepath or None if it cannot be determined (due to glob wildcards matching multiple files)
TODO
----
"""
rv = None
url = None
if release == 'latest':
url = 'ftp://ftp.ensembl.org/pub/current_tsv/homo_sapiens/Homo_sapiens.GRCh38.*.refseq.tsv.gz'
else:
url = 'ftp://ftp.ensembl.org/pub/release-{}/tsv/homo_sapiens/Homo_sapiens.GRCh38.{}.refseq.tsv.gz'.format(release, release)
bn = os.path.basename(url)
fpath = os.path.join(outdir, bn)
glob_rv = glob.glob(fpath)
if len(glob_rv) == 1:
rv = glob_rv[0]
args = ['wget', url]
sp.check_call(args, cwd=outdir)
return rv
def parse_mapping_tsv(fh, key_index=0, value_index=1, delim='\t'):
"""
Parse a TSV file like the file at
ftp://ftp.ensembl.org/pub/release-94/tsv/homo_sapiens/Homo_sapiens.GRCh38.94.refseq.tsv.gz
into a mapping
"""
rv = {}
if type(fh) is str:
fh = open(fh, 'r')
for line in fh:
line = line.rstrip()
words = line.split(delim)
# some values for key_index may be missing, skip those entries in the file
if key_index >= len(words):
continue
key = words[key_index]
value = None
if(len(words) > value_index):
value = words[value_index]
else:
# TODO warn?
pass
if value is not None:
if key not in rv:
rv[key] = set()
rv[key].add(value)
return rv
def map_hgnc_to_ensps(fh):
"""
Return map from HGNC symbol to many ENSPs
Parameters
----------
fh : file-like
database file from dl_ensembl_map: 4 column TSV of HGNC, ENSG, ENST, ENSP
Returns
-------
rv : dict
mapping of HGNC symbol to a set of ENSPs
"""
return parse_mapping_tsv(fh, key_index=0, value_index=3)
def filter_one_to_many(hgnc_to_ensps_map, G):
"""
Return a map from HGNC symbol to one ENSP. The one is chosen to be the first ENSP in G.
ENSPs are examined in sorted order.
Parameters
----------
hgnc_to_ensps_map : dict
return value of map_hgnc_to_ensps
copy_map : dict
map hgnc to the number of ENSP in the map that also appear in G
G : nx.Graph
protein-protein interaction network such as STRING
"""
rv = {}
copy_map = {}
for hgnc, ensps in hgnc_to_ensps_map.items():
ensps = sorted(ensps)
any_ensp_in_G = False
for ensp in ensps:
if ensp in G:
any_ensp_in_G = True
if hgnc in rv:
# then this is the second ENSP in G
if hgnc in copy_map:
copy_map[hgnc] += 1
else:
copy_map[hgnc] = 2
else:
rv[hgnc] = ensp
if not any_ensp_in_G:
# then use the first ensp in sorted order
if len(ensps) > 0:
rv[hgnc] = ensps[0]
return rv, copy_map
def map_ensp_to_hgnc(fh):
rv = {}
for line in fh:
line = line.rstrip()
words = line.split('\t')
hgnc_symbol = words[0]
ensp_id = None
if(len(words) >= 4):
ensp_id = words[3]
if ensp_id is not None:
if ensp_id in rv:
sys.stderr.write("Encountered ENSP id {} more than once\n".format(ensp_id))
rv[ensp_id] = hgnc_symbol
return rv
def apply_map(word_map, gene_list_fh):
"""
Read words (one per line) from gene_list_fh and apply word_map.
Return mapped words as a set.
Parameters
----------
word_map : dict
mapping of HGNC symbol to set of ENSPs
gene_list_fh : file-like
newline delimited file of HGNC symbols
Returns
-------
rv : list
sorted list of mapped ENSP identifiers
"""
rv = set()
for line in gene_list_fh:
hgnc_symbol = line.rstrip()
ensp_ids = word_map.get(hgnc_symbol)
if ensp_ids is None:
sys.stderr.write("Unrecognized HGNC symbol: {}\n".format(hgnc_symbol))
else:
rv = rv.union(ensp_ids)
rv = sorted(rv)
return rv
def apply_mapping(word_map, io_pairs):
"""
For infile, outfile pairs in <io_pairs>, read words separated by PCRE non-word-character
regexp '\W', apply <word_map>, and write mapped words to outfile. Note many words in infile
do not need to appear in the <word_map> and will be left unmapped.
"""
sep_re = re.compile('\W')
word = ""
for infile, outfile in io_pairs:
with open(infile, "r") as ifh:
with open(outfile, "w") as ofh:
for line in ifh:
for char in line:
match_data = sep_re.match(char)
if(match_data is not None):
# then process word
ensp = word_map.get(word)
if ensp is None:
ofh.write(word + match_data.group(0))
else:
ofh.write(ensp + match_data.group(0))
word = ""
else:
word += char
Functions
apply_map
def apply_map(
word_map,
gene_list_fh
)
Read words (one per line) from gene_list_fh and apply word_map. Return mapped words as a set.
Parameters
word_map : dict mapping of HGNC symbol to set of ENSPs
gene_list_fh : file-like newline delimited file of HGNC symbols
Returns
rv : list sorted list of mapped ENSP identifiers
View Source
def apply_map(word_map, gene_list_fh):
"""
Read words (one per line) from gene_list_fh and apply word_map.
Return mapped words as a set.
Parameters
----------
word_map : dict
mapping of HGNC symbol to set of ENSPs
gene_list_fh : file-like
newline delimited file of HGNC symbols
Returns
-------
rv : list
sorted list of mapped ENSP identifiers
"""
rv = set()
for line in gene_list_fh:
hgnc_symbol = line.rstrip()
ensp_ids = word_map.get(hgnc_symbol)
if ensp_ids is None:
sys.stderr.write("Unrecognized HGNC symbol: {}\n".format(hgnc_symbol))
else:
rv = rv.union(ensp_ids)
rv = sorted(rv)
return rv
apply_mapping
def apply_mapping(
word_map,
io_pairs
)
For infile, outfile pairs in
View Source
def apply_mapping(word_map, io_pairs):
"""
For infile, outfile pairs in <io_pairs>, read words separated by PCRE non-word-character
regexp '\W', apply <word_map>, and write mapped words to outfile. Note many words in infile
do not need to appear in the <word_map> and will be left unmapped.
"""
sep_re = re.compile('\W')
word = ""
for infile, outfile in io_pairs:
with open(infile, "r") as ifh:
with open(outfile, "w") as ofh:
for line in ifh:
for char in line:
match_data = sep_re.match(char)
if(match_data is not None):
# then process word
ensp = word_map.get(word)
if ensp is None:
ofh.write(word + match_data.group(0))
else:
ofh.write(ensp + match_data.group(0))
word = ""
else:
word += char
download_mapping_tsv
def download_mapping_tsv(
outdir,
release='latest'
)
Download an Ensembl identifier mapping file (HGNC <-> ENSP)
Parameters
outdir: str Directory to place downloaded file in; file will be named as it is on EBI's FTP server
release: int or str either 'latest' or a specific release number such as '97'
Returns
rv : str or None the downloaded filepath or None if it cannot be determined (due to glob wildcards matching multiple files)
TODO
View Source
def download_mapping_tsv(outdir, release='latest'):
"""
Download an Ensembl identifier mapping file (HGNC <-> ENSP)
Parameters
----------
outdir: str
Directory to place downloaded file in; file will be named as it is on EBI's FTP server
release: int or str
either 'latest' or a specific release number such as '97'
Returns
-------
rv : str or None
the downloaded filepath or None if it cannot be determined (due to glob wildcards matching multiple files)
TODO
----
"""
rv = None
url = None
if release == 'latest':
url = 'ftp://ftp.ensembl.org/pub/current_tsv/homo_sapiens/Homo_sapiens.GRCh38.*.refseq.tsv.gz'
else:
url = 'ftp://ftp.ensembl.org/pub/release-{}/tsv/homo_sapiens/Homo_sapiens.GRCh38.{}.refseq.tsv.gz'.format(release, release)
bn = os.path.basename(url)
fpath = os.path.join(outdir, bn)
glob_rv = glob.glob(fpath)
if len(glob_rv) == 1:
rv = glob_rv[0]
args = ['wget', url]
sp.check_call(args, cwd=outdir)
return rv
filter_one_to_many
def filter_one_to_many(
hgnc_to_ensps_map,
G
)
Return a map from HGNC symbol to one ENSP. The one is chosen to be the first ENSP in G. ENSPs are examined in sorted order.
Parameters
hgnc_to_ensps_map : dict return value of map_hgnc_to_ensps
copy_map : dict map hgnc to the number of ENSP in the map that also appear in G
G : nx.Graph protein-protein interaction network such as STRING
View Source
def filter_one_to_many(hgnc_to_ensps_map, G):
"""
Return a map from HGNC symbol to one ENSP. The one is chosen to be the first ENSP in G.
ENSPs are examined in sorted order.
Parameters
----------
hgnc_to_ensps_map : dict
return value of map_hgnc_to_ensps
copy_map : dict
map hgnc to the number of ENSP in the map that also appear in G
G : nx.Graph
protein-protein interaction network such as STRING
"""
rv = {}
copy_map = {}
for hgnc, ensps in hgnc_to_ensps_map.items():
ensps = sorted(ensps)
any_ensp_in_G = False
for ensp in ensps:
if ensp in G:
any_ensp_in_G = True
if hgnc in rv:
# then this is the second ENSP in G
if hgnc in copy_map:
copy_map[hgnc] += 1
else:
copy_map[hgnc] = 2
else:
rv[hgnc] = ensp
if not any_ensp_in_G:
# then use the first ensp in sorted order
if len(ensps) > 0:
rv[hgnc] = ensps[0]
return rv, copy_map
map_ensp_to_hgnc
def map_ensp_to_hgnc(
fh
)
View Source
def map_ensp_to_hgnc(fh):
rv = {}
for line in fh:
line = line.rstrip()
words = line.split('\t')
hgnc_symbol = words[0]
ensp_id = None
if(len(words) >= 4):
ensp_id = words[3]
if ensp_id is not None:
if ensp_id in rv:
sys.stderr.write("Encountered ENSP id {} more than once\n".format(ensp_id))
rv[ensp_id] = hgnc_symbol
return rv
map_hgnc_to_ensps
def map_hgnc_to_ensps(
fh
)
Return map from HGNC symbol to many ENSPs
Parameters
fh : file-like database file from dl_ensembl_map: 4 column TSV of HGNC, ENSG, ENST, ENSP
Returns
rv : dict mapping of HGNC symbol to a set of ENSPs
View Source
def map_hgnc_to_ensps(fh):
"""
Return map from HGNC symbol to many ENSPs
Parameters
----------
fh : file-like
database file from dl_ensembl_map: 4 column TSV of HGNC, ENSG, ENST, ENSP
Returns
-------
rv : dict
mapping of HGNC symbol to a set of ENSPs
"""
return parse_mapping_tsv(fh, key_index=0, value_index=3)
parse_mapping_tsv
def parse_mapping_tsv(
fh,
key_index=0,
value_index=1,
delim='\t'
)
Parse a TSV file like the file at ftp://ftp.ensembl.org/pub/release-94/tsv/homo_sapiens/Homo_sapiens.GRCh38.94.refseq.tsv.gz into a mapping
View Source
def parse_mapping_tsv(fh, key_index=0, value_index=1, delim='\t'):
"""
Parse a TSV file like the file at
ftp://ftp.ensembl.org/pub/release-94/tsv/homo_sapiens/Homo_sapiens.GRCh38.94.refseq.tsv.gz
into a mapping
"""
rv = {}
if type(fh) is str:
fh = open(fh, 'r')
for line in fh:
line = line.rstrip()
words = line.split(delim)
# some values for key_index may be missing, skip those entries in the file
if key_index >= len(words):
continue
key = words[key_index]
value = None
if(len(words) > value_index):
value = words[value_index]
else:
# TODO warn?
pass
if value is not None:
if key not in rv:
rv[key] = set()
rv[key].add(value)
return rv