Skip to content

Commit 7d98713

Browse files
authored
feature: gtf2bed12 script. Add installation instructions via pip (set… (#1)
- gtf2bed12 script. - installation instructions - pytest
1 parent d63729e commit 7d98713

File tree

11 files changed

+309
-4
lines changed

11 files changed

+309
-4
lines changed

.github/workflows/python-test.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ jobs:
2323
- name: Install dependencies
2424
run: |
2525
python -m pip install --upgrade pip
26-
pip install flake8 pytest
27-
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
26+
pip install flake8 pytest pytest-console-scripts
27+
pip install .
2828
- name: Lint with flake8
2929
run: |
3030
# stop the build if there are Python syntax errors or undefined names
@@ -33,4 +33,4 @@ jobs:
3333
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
3434
- name: Test with pytest
3535
run: |
36-
pytest
36+
pytest tests/test.py

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
tests/__pycache__/
2+
venv/

README.md

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,32 @@
1-
# z-gtf
1+
# zgtf
22
gtf conversion utility
3+
4+
# Installation
5+
6+
```bash
7+
# clone the repo
8+
git clone https://github.com/zavolanlab/zgtf.git
9+
# create a virtual environment
10+
python3 -m venv venv
11+
# activate the virtual environment
12+
source venv/bin/activate
13+
# install zgtf scripts
14+
pip install .
15+
```
16+
17+
# Run
18+
19+
Convert gtf file to bed12
20+
```bash
21+
gtf2bed12 --gtf <INPUT.gtf> --bed12 <OUTPUT.bed> --verbose
22+
```
23+
24+
```
25+
arguments:
26+
-h, --help show this help message and exit
27+
--gtf FILE Annotation file (gtf format)
28+
--bed12 FILE Output file (bed12 format)
29+
--transcript_type TRANSCRIPT_TYPE
30+
Transcript type [Default: protein_coding]
31+
-v, --verbose Verbose
32+
```

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
htseq>=0.11

requirements_dev.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
htseq>=0.11
2+
pytest
3+
pytest-console-scripts

scripts/gtf2bed12

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
#!/usr/bin/env python
2+
3+
import sys
4+
import os
5+
from argparse import ArgumentParser, RawTextHelpFormatter
6+
from zgtf.zgtf import gtf_to_transcript_exons, transcript_exons_to_bed12
7+
8+
def main(argv=None):
    """Convert a gtf annotation file to bed12.

    Parses the command line, collects the exons of every transcript of the
    requested type with ``gtf_to_transcript_exons`` and writes one line per
    transcript, as returned by ``transcript_exons_to_bed12``, to the output
    file.

    Args:
        argv: Command-line arguments without the program name. When ``None``
            (the default) ``sys.argv[1:]`` is used.

    Raises:
        SystemExit: With code 1 after printing the help text when no
            arguments are given; raised by argparse itself for ``--help``
            and for invalid or missing options.
    """
    parser = ArgumentParser(
        description="Convert gtf file to bed12",
        formatter_class=RawTextHelpFormatter
    )

    parser.add_argument(
        "--gtf",
        dest="gtf",
        help="Annotation file (gtf format)",
        required=True,
        metavar="FILE"
    )

    parser.add_argument(
        "--bed12",
        dest="bed12",
        help="Output file (bed12 format)",
        required=True,
        metavar="FILE"
    )

    parser.add_argument(
        "--transcript_type",
        dest="transcript_type",
        help="Transcript type [Default: protein_coding]",
        required=False,
        default="protein_coding"
    )

    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        dest="verbose",
        default=False,
        required=False,
        help="Verbose"
    )

    args = sys.argv[1:] if argv is None else list(argv)

    # Must run before parse_args(): with required options missing, argparse
    # would otherwise exit with a usage error and the full help would never
    # be shown.
    if not args:
        parser.print_help()
        sys.exit(1)

    # argparse reports problems by raising SystemExit, so there is nothing
    # useful to catch here.
    options = parser.parse_args(args)

    if options.verbose:
        # Text-mode streams translate "\n" to the platform line ending.
        sys.stdout.write(f"Parsing gtf file: {options.gtf}\n")
    transcripts = gtf_to_transcript_exons(options.gtf, options.transcript_type)

    # The context manager closes the output file even if a write fails.
    with open(options.bed12, "w") as bed12_file:
        for transcript_id, exons in transcripts.items():
            bed12_file.write(
                transcript_exons_to_bed12(exons, transcript_id) + "\n"
            )
71+
72+
if __name__ == '__main__':
    # Script entry point: run the converter; a Ctrl-C is reported on stderr
    # and the process then exits with status 0.
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt!" + os.linesep)
        raise SystemExit(0)

setup.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import sys
2+
from setuptools import setup
3+
4+
# Abort early on interpreters older than 3.6 (the script uses f-strings).
# ``python_requires`` below lets pip apply the same limit before building.
if sys.version_info < (3, 6):
    sys.exit('Sorry, zgtf requires Python >= 3.6')

# Runtime dependencies installed together with the package.
requirements = [
    "htseq>=0.11",
]

setup(
    name='zgtf',
    version='0.1',
    description="gtf conversion utility.",
    author="Foivos Gypas",
    author_email='[email protected]',
    url='https://github.com/zavolanlab/zgtf',
    packages=['zgtf'],
    package_dir={'zgtf': 'zgtf'},
    include_package_data=True,
    # Installed as-is onto the PATH as the ``gtf2bed12`` command.
    scripts=['scripts/gtf2bed12'],
    python_requires='>=3.6',
    install_requires=requirements,
    keywords='zgtf',
    classifiers=[
        'Programming Language :: Python :: 3.6',
    ]
)

test_data/test.gtf

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#!genome-build GRCh38.p13
2+
#!genome-version GRCh38
3+
#!genome-date 2013-12
4+
#!genome-build-accession NCBI:GCA_000001405.28
5+
#!genebuild-last-updated 2019-08
6+
1-10000-20000 havana gene 1870 4410 . + . gene_id "ENSG00000223972"; gene_version "5"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene";
7+
1-10000-20000 havana transcript 1870 4410 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; tag "basic"; transcript_support_level "1";
8+
1-10000-20000 havana exon 1870 2228 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; exon_number "1"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00002234944"; exon_version "1"; tag "basic"; transcript_support_level "1";
9+
1-10000-20000 havana exon 2614 2722 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; exon_number "2"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003582793"; exon_version "1"; tag "basic"; transcript_support_level "1";
10+
1-10000-20000 havana exon 3222 4410 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; exon_number "3"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00002312635"; exon_version "1"; tag "basic"; transcript_support_level "1";
11+
1-10000-20000 havana transcript 2011 3671 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000450305"; transcript_version "2"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; transcript_source "havana"; transcript_biotype "transcribed_unprocessed_pseudogene"; tag "basic"; transcript_support_level "NA";
12+
1-10000-20000 havana exon 2011 2058 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000450305"; transcript_version "2"; exon_number "1"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; transcript_source "havana"; transcript_biotype "transcribed_unprocessed_pseudogene"; exon_id "ENSE00001948541"; exon_version "1"; tag "basic"; transcript_support_level "NA";
13+
1-10000-20000 havana exon 2180 2228 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000450305"; transcript_version "2"; exon_number "2"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; transcript_source "havana"; transcript_biotype "transcribed_unprocessed_pseudogene"; exon_id "ENSE00001671638"; exon_version "2"; tag "basic"; transcript_support_level "NA";
14+
1-10000-20000 havana exon 2614 2698 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000450305"; transcript_version "2"; exon_number "3"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; transcript_source "havana"; transcript_biotype "transcribed_unprocessed_pseudogene"; exon_id "ENSE00001758273"; exon_version "2"; tag "basic"; transcript_support_level "NA";
15+
1-10000-20000 havana exon 2976 3053 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000450305"; transcript_version "2"; exon_number "4"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; transcript_source "havana"; transcript_biotype "transcribed_unprocessed_pseudogene"; exon_id "ENSE00001799933"; exon_version "2"; tag "basic"; transcript_support_level "NA";
16+
1-10000-20000 havana exon 3222 3375 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000450305"; transcript_version "2"; exon_number "5"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; transcript_source "havana"; transcript_biotype "transcribed_unprocessed_pseudogene"; exon_id "ENSE00001746346"; exon_version "2"; tag "basic"; transcript_support_level "NA";
17+
1-10000-20000 havana exon 3454 3671 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000450305"; transcript_version "2"; exon_number "6"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; transcript_source "havana"; transcript_biotype "transcribed_unprocessed_pseudogene"; exon_id "ENSE00001863096"; exon_version "1"; tag "basic"; transcript_support_level "NA";
18+
1-10000-20000 havana gene 4405 8367 . - . gene_id "ENSG00000227232"; gene_version "5"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene";
19+
1-10000-20000 havana transcript 4405 8367 . - . gene_id "ENSG00000227232"; gene_version "5"; transcript_id "ENST00000488147"; transcript_version "1"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "WASH7P-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; tag "basic"; transcript_support_level "NA";
20+
1-10000-20000 havana exon 8269 8367 . - . gene_id "ENSG00000227232"; gene_version "5"; transcript_id "ENST00000488147"; transcript_version "1"; exon_number "3"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "WASH7P-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003477500"; exon_version "1"; tag "basic"; transcript_support_level "NA";
21+
1-10000-20000 havana exon 7916 8062 . - . gene_id "ENSG00000227232"; gene_version "5"; transcript_id "ENST00000488147"; transcript_version "1"; exon_number "4"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "WASH7P-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003565697"; exon_version "1"; tag "basic"; transcript_support_level "NA";
22+
1-10000-20000 havana exon 7607 7743 . - . gene_id "ENSG00000227232"; gene_version "5"; transcript_id "ENST00000488147"; transcript_version "1"; exon_number "5"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "WASH7P-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003475637"; exon_version "1"; tag "basic"; transcript_support_level "NA";
23+
1-10000-20000 havana exon 7234 7369 . - . gene_id "ENSG00000227232"; gene_version "5"; transcript_id "ENST00000488147"; transcript_version "1"; exon_number "6"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "WASH7P-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003502542"; exon_version "1"; tag "basic"; transcript_support_level "NA";
24+
1-10000-20000 havana exon 6859 7056 . - . gene_id "ENSG00000227232"; gene_version "5"; transcript_id "ENST00000488147"; transcript_version "1"; exon_number "7"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "WASH7P-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003553898"; exon_version "1"; tag "basic"; transcript_support_level "NA";
25+
1-10000-20000 havana exon 6608 6766 . - . gene_id "ENSG00000227232"; gene_version "5"; transcript_id "ENST00000488147"; transcript_version "1"; exon_number "8"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "WASH7P-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00003621279"; exon_version "1"; tag "basic"; transcript_support_level "NA";
26+
1-10000-20000 havana exon 5797 5948 . - . gene_id "ENSG00000227232"; gene_version "5"; transcript_id "ENST00000488147"; transcript_version "1"; exon_number "9"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "WASH7P-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00002030414"; exon_version "1"; tag "basic"; transcript_support_level "NA";
27+
1-10000-20000 havana exon 5006 5039 . - . gene_id "ENSG00000227232"; gene_version "5"; transcript_id "ENST00000488147"; transcript_version "1"; exon_number "10"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "WASH7P-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00001935574"; exon_version "1"; tag "basic"; transcript_support_level "NA";
28+
1-10000-20000 havana exon 4405 4502 . - . gene_id "ENSG00000227232"; gene_version "5"; transcript_id "ENST00000488147"; transcript_version "1"; exon_number "11"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene"; transcript_name "WASH7P-201"; transcript_source "havana"; transcript_biotype "unprocessed_pseudogene"; exon_id "ENSE00001843071"; exon_version "1"; tag "basic"; transcript_support_level "NA";

tests/test.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#!/usr/bin/env python
2+
3+
import os
4+
5+
def test_gtf2bed12_path(script_runner):
    """Check that ``gtf2bed12 --help`` can be launched and succeeds."""
    result = script_runner.run('gtf2bed12', '--help')
    assert result.success
8+
9+
def test_gtf2bed12_plus_strand(script_runner):
    """Convert the plus-strand ``processed_transcript`` of the test gtf.

    Runs ``gtf2bed12`` on ``test_data/test.gtf`` and compares the first line
    of the output with the expected bed12 record for ENST00000456328. Both
    paths are resolved against the current working directory.
    """
    gtf = "test_data/test.gtf"
    out = "test_data/out.bed"

    # Text-mode reads always return "\n" as line terminator.
    expected = "\t".join(["1-10000-20000",
                          "1869",
                          "4410",
                          "ENST00000456328",
                          "1",
                          "+",
                          "1869",
                          "4410",
                          "0",
                          "3",
                          "359,109,1189,",
                          "0,744,1352,"]) + "\n"

    try:
        ret = script_runner.run('gtf2bed12',
                                '--gtf', gtf,
                                '--bed12', out,
                                '--transcript_type', 'processed_transcript')
        # Fail on the script's own error instead of on a missing output file.
        assert ret.success

        with open(out, "r") as bed12:
            first_line = bed12.readline()

        assert first_line == expected
    finally:
        # Remove the output even when an assertion fails, so a stale file
        # cannot leak into another test.
        if os.path.exists(out):
            os.remove(out)
38+
39+
def test_gtf2bed12_minus_strand(script_runner):
    """Convert the minus-strand ``unprocessed_pseudogene`` of the test gtf.

    Runs ``gtf2bed12`` on ``test_data/test.gtf`` and compares the first line
    of the output with the expected bed12 record for ENST00000488147. Both
    paths are resolved against the current working directory.
    """
    gtf = "test_data/test.gtf"
    out = "test_data/out.bed"

    # Text-mode reads always return "\n" as line terminator.
    expected = "\t".join(["1-10000-20000",
                          "4404",
                          "8367",
                          "ENST00000488147",
                          "1",
                          "-",
                          "4404",
                          "8367",
                          "0",
                          "9",
                          "98,34,152,159,198,136,137,147,99,",
                          "0,601,1392,2203,2454,2829,3202,3511,3864,"]) + "\n"

    try:
        ret = script_runner.run('gtf2bed12',
                                '--gtf', gtf,
                                '--bed12', out,
                                '--transcript_type', 'unprocessed_pseudogene')
        # Fail on the script's own error instead of on a missing output file.
        assert ret.success

        with open(out, "r") as bed12:
            first_line = bed12.readline()

        assert first_line == expected
    finally:
        # Remove the output even when an assertion fails, so a stale file
        # cannot leak into another test.
        if os.path.exists(out):
            os.remove(out)

zgtf/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)