Skip to content

Commit f6b69f9

Browse files
authored
Merge branch 'main' into 82-migrate-to-new-biocommons-python-package-template
2 parents 5bebfb3 + a41da91 commit f6b69f9

File tree

7 files changed

+431
-34
lines changed

7 files changed

+431
-34
lines changed

Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ publish: build ## publish package to PyPI
3737
$(call INFO_MESSAGE, "Publishing package")
3838
uv publish # Requires UV_PUBLISH_TOKEN or Trusted Publishing setup
3939

40-
4140
############################################################################
4241
#= FORMATTING, TESTING, AND CODE QUALITY
4342

@@ -55,6 +54,10 @@ test: ## Test the code with pytest
5554
@echo "🚀 Testing code: Running pytest"
5655
uv run pytest --cov=. --cov-report=xml
5756

57+
# TODO: incorporate the test-learn target below (runs pytest with VCR_RECORD_MODE=new_episodes)
58+
# test-learn:
59+
# VCR_RECORD_MODE=new_episodes pytest -x
60+
5861
############################################################################
5962
#= DOCUMENTATION
6063

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ readme = "README.md"
5050
requires-python = ">=3.11"
5151

5252
[project.urls]
53-
Documentation = "https://biocommons.github.io/python-package/"
53+
Documentation = "https://biocommons.github.io/bioutils/"
5454
Homepage = "https://github.com/biocommons/bioutils"
5555
Issues = "https://github.com/biocommons/bioutils/issues"
5656
Repository = "https://github.com/biocommons/bioutils"

src/bioutils/seqfetcher.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,10 @@
1515
ncbi_tool = "bioutils"
1616
ncbi_email = "[email protected]"
1717
retry_limit = 3
18+
enst_default_seq_type = "cdna"
1819

1920

20-
def fetch_seq(ac, start_i=None, end_i=None):
21+
def fetch_seq(ac, start_i=None, end_i=None, **rest):
2122
"""Fetches sequences and subsequences from NCBI eutils and Ensembl REST interfaces.
2223
2324
Args:
@@ -82,13 +83,13 @@ def fetch_seq(ac, start_i=None, end_i=None):
8283
raise RuntimeError("No sequence fetcher for {ac}".format(ac=ac))
8384

8485
if len(eligible_fetchers) >= 1: # pragma: nocover (no way to test)
85-
_logger.debug("Multiple sequence fetchers found for " "{ac}; using first".format(ac=ac))
86+
_logger.debug("Multiple sequence fetchers found for {ac}; using first".format(ac=ac))
8687

8788
fetcher = eligible_fetchers[0]
8889
_logger.debug("fetching {ac} with {f}".format(ac=ac, f=fetcher))
8990

9091
try:
91-
return fetcher(ac, start_i, end_i)
92+
return fetcher(ac, start_i, end_i, **rest)
9293
except requests.RequestException as ex:
9394
raise RuntimeError("Failed to fetch {ac} ({ex})".format(ac=ac, ex=ex))
9495

@@ -97,7 +98,7 @@ def fetch_seq(ac, start_i=None, end_i=None):
9798
# Internal functions
9899

99100

100-
def _fetch_seq_ensembl(ac, start_i=None, end_i=None):
101+
def _fetch_seq_ensembl(ac, start_i=None, end_i=None, seq_type=None):
101102
"""Fetch sequence slice from Ensembl public REST interface.
102103
103104
Args:
@@ -106,6 +107,7 @@ def _fetch_seq_ensembl(ac, start_i=None, end_i=None):
106107
Defaults to None.
107108
end_i (int, optional): The end index (interbase coordinates) of the subsequence to fetch.
108109
Defaults to None.
110+
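seq_type (str, optional): The type of Ensembl sequence to fetch (e.g., "genomic", "cdna", or "cds"). If None, ENST accessions use the ENST_DEFAULT_SEQ_TYPE environment variable if set, otherwise "cdna".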
109111
110112
Returns:
111113
str: The requested (sub)sequence
@@ -141,7 +143,16 @@ def _fetch_seq_ensembl(ac, start_i=None, end_i=None):
141143
ac, version = m.groups()
142144
version = int(version)
143145

146+
if ac.startswith("ENST") and seq_type is None:
147+
try:
148+
seq_type = os.environ["ENST_DEFAULT_SEQ_TYPE"]
149+
except KeyError:
150+
seq_type = enst_default_seq_type
151+
_logger.warning(f"{ac}: Transcript type not specified or set in ENST_DEFAULT_SEQ_TYPE; assuming {seq_type}")
152+
144153
url = f"http://rest.ensembl.org/sequence/id/{ac}"
154+
if seq_type:
155+
url += f"?type={seq_type}"
145156
r = requests.get(url, headers={"Content-Type": "application/json"})
146157
r.raise_for_status()
147158
data = r.json()
@@ -204,7 +215,7 @@ def _fetch_seq_ncbi(ac, start_i=None, end_i=None):
204215
"""
205216

206217
db = "protein" if ac[1] == "P" else "nucleotide"
207-
url_fmt = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" "db={db}&id={ac}&rettype=fasta"
218+
url_fmt = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={db}&id={ac}&rettype=fasta"
208219

209220
if start_i is None or end_i is None:
210221
url = url_fmt.format(db=db, ac=ac)

tests/data/cassettes/test_fetch_ENST00000617537_env

Lines changed: 166 additions & 0 deletions
Large diffs are not rendered by default.

tests/data/cassettes/test_fetch_ENST00000617537_noenv

Lines changed: 166 additions & 0 deletions
Large diffs are not rendered by default.

tests/data/cassettes/test_fetch_seq

Lines changed: 41 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,6 @@ interactions:
9696
Oct 2025 01:38:10 GMT
9797
Strict-Transport-Security:
9898
- max-age=31536000; includeSubDomains; preload
99-
Transfer-Encoding:
100-
- chunked
10199
X-RateLimit-Limit:
102100
- '3'
103101
X-RateLimit-Remaining:
@@ -164,8 +162,6 @@ interactions:
164162
Oct 2025 01:38:10 GMT
165163
Strict-Transport-Security:
166164
- max-age=31536000; includeSubDomains; preload
167-
Transfer-Encoding:
168-
- chunked
169165
X-RateLimit-Limit:
170166
- '3'
171167
X-RateLimit-Remaining:
@@ -276,8 +272,6 @@ interactions:
276272
Oct 2025 01:38:11 GMT
277273
Strict-Transport-Security:
278274
- max-age=31536000; includeSubDomains; preload
279-
Transfer-Encoding:
280-
- chunked
281275
X-RateLimit-Limit:
282276
- '3'
283277
X-RateLimit-Remaining:
@@ -345,8 +339,6 @@ interactions:
345339
Oct 2025 01:38:12 GMT
346340
Strict-Transport-Security:
347341
- max-age=31536000; includeSubDomains; preload
348-
Transfer-Encoding:
349-
- chunked
350342
X-RateLimit-Limit:
351343
- '3'
352344
X-RateLimit-Remaining:
@@ -414,8 +406,6 @@ interactions:
414406
Oct 2025 01:38:12 GMT
415407
Strict-Transport-Security:
416408
- max-age=31536000; includeSubDomains; preload
417-
Transfer-Encoding:
418-
- chunked
419409
X-RateLimit-Limit:
420410
- '3'
421411
X-RateLimit-Remaining:
@@ -483,8 +473,6 @@ interactions:
483473
Oct 2025 01:38:13 GMT
484474
Strict-Transport-Security:
485475
- max-age=31536000; includeSubDomains; preload
486-
Transfer-Encoding:
487-
- chunked
488476
X-RateLimit-Limit:
489477
- '3'
490478
X-RateLimit-Remaining:
@@ -552,8 +540,6 @@ interactions:
552540
Oct 2025 01:38:13 GMT
553541
Strict-Transport-Security:
554542
- max-age=31536000; includeSubDomains; preload
555-
Transfer-Encoding:
556-
- chunked
557543
X-RateLimit-Limit:
558544
- '3'
559545
X-RateLimit-Remaining:
@@ -620,8 +606,6 @@ interactions:
620606
Oct 2025 01:38:14 GMT
621607
Strict-Transport-Security:
622608
- max-age=31536000; includeSubDomains; preload
623-
Transfer-Encoding:
624-
- chunked
625609
X-RateLimit-Limit:
626610
- '3'
627611
X-RateLimit-Remaining:
@@ -688,8 +672,6 @@ interactions:
688672
Oct 2025 01:38:14 GMT
689673
Strict-Transport-Security:
690674
- max-age=31536000; includeSubDomains; preload
691-
Transfer-Encoding:
692-
- chunked
693675
X-RateLimit-Limit:
694676
- '3'
695677
X-RateLimit-Remaining:
@@ -757,8 +739,6 @@ interactions:
757739
Oct 2025 01:38:15 GMT
758740
Strict-Transport-Security:
759741
- max-age=31536000; includeSubDomains; preload
760-
Transfer-Encoding:
761-
- chunked
762742
X-RateLimit-Limit:
763743
- '3'
764744
X-RateLimit-Remaining:
@@ -826,8 +806,6 @@ interactions:
826806
Oct 2025 01:38:15 GMT
827807
Strict-Transport-Security:
828808
- max-age=31536000; includeSubDomains; preload
829-
Transfer-Encoding:
830-
- chunked
831809
X-RateLimit-Limit:
832810
- '3'
833811
X-RateLimit-Remaining:
@@ -923,4 +901,45 @@ interactions:
923901
status:
924902
code: 200
925903
message: OK
904+
- request:
905+
body: null
906+
headers:
907+
Accept:
908+
- '*/*'
909+
Accept-Encoding:
910+
- gzip, deflate
911+
Connection:
912+
- keep-alive
913+
Content-Type:
914+
- application/json
915+
User-Agent:
916+
- python-requests/2.32.3
917+
method: GET
918+
uri: http://rest.ensembl.org/sequence/id/ENST00000288602?type=cdna
919+
response:
920+
body:
921+
string: '{"query":"ENST00000288602","molecule":"dna","desc":null,"seq":"CCGCTCGGGCCCCGGCTCTCGGTTATAAGATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAACGGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGACCCTGCCATTCCGGAGGAGGTGTGGAATATCAAACAAATGATTAAGTTGACACAGGAACATATAGAGGCCCTATTGGACAAATTTGGTGGGGAGCATAATCCACCATCAATATATCTGGAGGCCTATGAAGAATACACCAGCAAGCTAGATGCACTCCAACAAAGAGAACAACAGTTATTGGAATCTCTGGGGAACGGAACTGATTTTTCTGTTTCTAGCTCTGCATCAATGGATACCGTTACATCTTCTTCCTCTTCTAGCCTTTCAGTGCTACCTTCATCTCTTTCAGTTTTTCAAAATCCCACAGATGTGGCACGGAGCAACCCCAAGTCACCACAAAAACCTATCGTTAGAGTCTTCCTGCCCAACAAACAGAGGACAGTGGTACCTGCAAGGTGTGGAGTTACAGTCCGAGACAGTCTAAAGAAAGCACTGATGATGAGAGGTCTAATCCCAGAGTGCTGTGCTGTTTACAGAATTCAGGATGGAGAGAAGAAACCAATTGGTTGGGACACTGATATTTCCTGGCTTACTGGAGAAGAATTGCATGTGGAAGTGTTGGAGAATGTTCCACTTACAACACACAACTTTGTACGAAAAACGTTTTTCACCTTAGCATTTTGTGACTTTTGTCGAAAGCTGCTTTTCCAGGGTTTCCGCTGTCAAACATGTGGTTATAAATTTCACCAGCGTTGTAGTACAGAAGTTCCACTGATGTGTGTTAATTATGACCAACTTGATTTGCTGTTTGTCTCCAAGTTCTTTGAACACCACCCAATACCACAGGAAGAGGCGTCCTTAGCAGAGACTGCCCTAACATCTGGATCATCCCCTTCCGCACCCGCCTCGGACTCTATTGGGCCCCAAATTCTCACCAGTCCGTCTCCTTCAAAATCCATTCCAATTCCACAGCCCTTCCGACCAGCAGATGAAGATCATCGAAATCAATTTGGGCAACGAGACCGATCCTCATCAGCTCCCAATGTGCATATAAACACAATAGAACCTGTCAATATTGATGACTTGATTAGAGACCAAGGATTTCGTGGTGATGGAGCCCCTTTGAACCAGCTGATGCGCTGTCTTCGGAAATACCAATCCCGGACTCCCAGTCCCCTCCTACATTCTGTCCCCAGTGAAATAGTGTTTGATTTTGAGCCTGGCCCAGTGTTCAGAGGATCAACCACAGGTTTGTCTGCTACCCCCCCTGCCTCATTACCTGGCTCACTAACTAACGTGAAAGCCTTACAGAAATCTCCAGGACCTCAGCGAGAAAGGAAGTCATCTTCATCCTCAGAAGACAGGAATCGAATGAAAACACTTGGTAGACGGGACTCGAGTGATGATTGGGAGATTCCTGATGGGCAGATTACAGTGGGACAAAGAATTGGATCTGGATCATTTGGAACAGTCTACAAGGGAAAGTGGCATGGTGATGTGGCAGTGAAAATGTTGAATGTGACAGCACCTACACCTCAGCAGTTACAAGCCTTCAAAAATGAAGTAGGAGTACTCAGGAAAACACGACATGTGAATATCCTACTCTTCATGGGCTATTCCACAAAGCCACAACTGGCTATTGTTACCCAGTGGTGTGAGGGCTCCAGCTTGTATCACCATCTCCATATCATTGAGACCAAATTTGAGATGATCAAACTTATAGATATTGCACGACAGACTGCACAGGGCATGGATTACTTACACGCCAAGTCAATCATCCACAGAGACCTCAAGAGTAATAATATATTTCTTCATGAAGACCTCACAGTAAAAATAGGT
GATTTTGGTCTAGCTACAGTGAAATCTCGATGGAGTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATGGCACCAGAAGTCATCAGAATGCAAGATAAAAATCCATACAGCTTTCAGTCAGATGTATATGCATTTGGAATTGTTCTGTATGAATTGATGACTGGACAGTTACCTTATTCAAACATCAACAACAGGGACCAGATAATTTTTATGGTGGGACGAGGATACCTGTCTCCAGATCTCAGTAAGGTACGGAGTAACTGTCCAAAAGCCATGAAGAGATTAATGGCAGAGTGCCTCAAAAAGAAAAGAGATGAGAGACCACTCTTTCCCCAAATTCTCGCCTCTATTGAGCTGCTGGCCCGCTCATTGCCAAAAATTCACCGCAGTGCATCAGAACCCTCCTTGAATCGGGCTGGTTTCCAAACAGAGGATTTTAGTCTATATGCTTGTGCTTCTCCAAAAACACCCATCCAGGCAGGGGGATATGGTGCGTTTCCTGTCCACTGAAACAAATGAGTGAGAGAGTTCAGGAGAGTAGCAACAAAAGGAAAATAAATGAACATATGTTTGCTTATATGTTAAATTGAATAAAATACTCTCTTTTTTTTTAAGGTGAAC","id":"ENST00000288602","version":11}'
922+
headers:
923+
Content-Length:
924+
- '2662'
925+
Content-Type:
926+
- application/json
927+
Date:
928+
- Wed, 05 Mar 2025 04:21:08 GMT
929+
Vary:
930+
- Content-Type
931+
- Origin
932+
X-RateLimit-Limit:
933+
- '55000'
934+
X-RateLimit-Period:
935+
- '3600'
936+
X-RateLimit-Remaining:
937+
- '54999'
938+
X-RateLimit-Reset:
939+
- '2332'
940+
X-Runtime:
941+
- '0.206213'
942+
status:
943+
code: 200
944+
message: OK
926945
version: 1

tests/test_seqfetcher.py

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,7 @@
44
import pytest
55
import vcr
66

7-
from bioutils.seqfetcher import (
8-
_add_eutils_api_key,
9-
_fetch_seq_ncbi,
10-
fetch_seq,
11-
)
7+
from bioutils.seqfetcher import _add_eutils_api_key, _fetch_seq_ncbi, fetch_seq, enst_default_seq_type
128

139

1410
@pytest.fixture(autouse=True)
@@ -40,6 +36,40 @@ def test_fetch_seq():
4036
assert "MAALSGGGGGGAEPGQALFNGDMEP" == fetch_seq("ENSP00000288602", 0, 25)
4137

4238

39+
ENST00000617537_470_480 = {
40+
# In [16]: s_gen[470:480], s_cdna[470:480], s_cds[470:480]
41+
# Out[16]: ("TAGGTATGCA", "TAGGGTGTGT", "TGACATTTGT")
42+
"genomic": "TAGGTATGCA",
43+
"cdna": "TAGGGTGTGT",
44+
"cds": "TGACATTTGT",
45+
}
46+
47+
48+
@vcr.use_cassette
49+
def test_fetch_ENST00000617537_noenv(caplog, monkeypatch):
50+
"""ensure expected subsequences for ENST00000617537 with ENST_DEFAULT_SEQ_TYPE unset"""
51+
monkeypatch.delenv("ENST_DEFAULT_SEQ_TYPE", raising=False)
52+
ac = "ENST00000617537"
53+
assert ENST00000617537_470_480[enst_default_seq_type] == fetch_seq(ac, start_i=470, end_i=480)
54+
assert "Transcript type not specified or set in ENST_DEFAULT_SEQ_TYPE" in caplog.text
55+
assert ENST00000617537_470_480["genomic"] == fetch_seq(ac, start_i=470, end_i=480, seq_type="genomic")
56+
assert ENST00000617537_470_480["cdna"] == fetch_seq(ac, start_i=470, end_i=480, seq_type="cdna")
57+
assert ENST00000617537_470_480["cds"] == fetch_seq(ac, start_i=470, end_i=480, seq_type="cds")
58+
59+
60+
@vcr.use_cassette
61+
def test_fetch_ENST00000617537_env(caplog, monkeypatch):
62+
"""ensure expected subsequences for ENST00000617537 with ENST_DEFAULT_SEQ_TYPE set"""
63+
user_enst_default_type = "cds" # intentionally != enst_default_seq_type to ensure use
64+
monkeypatch.setenv("ENST_DEFAULT_SEQ_TYPE", user_enst_default_type)
65+
ac = "ENST00000617537"
66+
assert ENST00000617537_470_480[user_enst_default_type] == fetch_seq(ac, start_i=470, end_i=480)
67+
assert "Transcript type not specified or set in ENST_DEFAULT_SEQ_TYPE" not in caplog.text
68+
assert ENST00000617537_470_480["genomic"] == fetch_seq(ac, start_i=470, end_i=480, seq_type="genomic")
69+
assert ENST00000617537_470_480["cdna"] == fetch_seq(ac, start_i=470, end_i=480, seq_type="cdna")
70+
assert ENST00000617537_470_480["cds"] == fetch_seq(ac, start_i=470, end_i=480, seq_type="cds")
71+
72+
4373
@vcr.use_cassette
4474
def test_fetch_seq_ncbi_invalid_positions():
4575
with pytest.raises(RuntimeError) as excinfo:
@@ -88,3 +118,5 @@ def test_rate_limit():
88118
num_requests = num_threads = 5
89119
p = multiprocessing.Pool(num_threads)
90120
p.map(_check1, range(num_requests))
121+
p.close()
122+
p.join()

0 commit comments

Comments
 (0)