Skip to content

Commit cc9038d

Browse files
author
mdiberna
committed
general cleanup with new bed file
1 parent a3d8052 commit cc9038d

15 files changed

+965
-1907
lines changed

.gitignore

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,21 @@ src/swissisoform.egg-info
55
src/swissisoform/__pycache__
66

77
# data
8-
data/genome_data/hg38.fa
9-
data/genome_data/hg38.ncbiRefSeq.gtf
8+
data/genome_data/*.fa
9+
data/genome_data/*.gtf
10+
data/genome_data/*.txt
1011
data/ribosome_profiling
1112
data/mutation_data
1213

1314
# output folders
1415
notebooks/**/
16+
notebooks/*_test.ipynb
1517

1618
# script outputs
1719
scripts/out
1820
scripts/results
19-
scripts/results_reduced
21+
scripts/results_reduced
22+
23+
# All DS store files
24+
.DS_Store
25+
scripts/.DS_Store

data/genome_data/download_genome.sh

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
#!/bin/bash
2-
UCSC_BASE="https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips"
32

4-
wget ${UCSC_BASE}/hg38.fa.gz
5-
wget ${UCSC_BASE}/genes/hg38.ncbiRefSeq.gtf.gz
3+
# GENCODE release 25: GRCh38.p7 genome FASTA and v25 annotation GTF
4+
GENCODE_BASE="https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_25"
5+
wget ${GENCODE_BASE}/GRCh38.p7.genome.fa.gz
6+
wget ${GENCODE_BASE}/gencode.v25.annotation.gtf.gz
7+
8+
# Pinned to release_47 rather than "latest_release": that directory moves with each GENCODE release, so the v47 annotation requested from it would stop resolving.
GENCODE_BASE_LATEST="https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47"
9+
wget ${GENCODE_BASE_LATEST}/gencode.v47.annotation.gtf.gz
610

711
gunzip *.gz
12+
# gunzip deletes each archive it expands, so usually nothing is left to match; -f keeps this cleanup from erroring (and the script from exiting non-zero) in that case.
rm -f *.gz

notebooks/bed_cleanup.ipynb

Lines changed: 84 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,68 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 3,
5+
"execution_count": 1,
66
"metadata": {},
77
"outputs": [],
88
"source": [
99
"from swissisoform.alternative_isoforms import AlternativeIsoform\n",
10-
"from swissisoform.utils import (\n",
11-
" cleanup_bed,\n",
10+
"from swissisoform.utils import cleanup_bed, update_gencode_gene_names"
11+
]
12+
},
13+
{
14+
"cell_type": "markdown",
15+
"metadata": {},
16+
"source": [
17+
"### GTF cleanup"
18+
]
19+
},
20+
{
21+
"cell_type": "code",
22+
"execution_count": 2,
23+
"metadata": {},
24+
"outputs": [
25+
{
26+
"name": "stdout",
27+
"output_type": "stream",
28+
"text": [
29+
"Creating gene ID to name mappings from reference GTF: ../data/genome_data/gencode.v47.annotation.gtf\n",
30+
"Extracted 57992 gene names from GENCODE GTF\n",
31+
"Extracted 78724 gene names from reference GTF\n",
32+
"Created 21316 gene name updates\n",
33+
"\n",
34+
"GTF Update Summary:\n",
35+
" Total lines processed: 2579822\n",
36+
" Genes processed: 58037\n",
37+
" Genes with updated names: 21324\n",
38+
" Total lines updated: 221358\n",
39+
" Output saved to: ../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf\n"
40+
]
41+
},
42+
{
43+
"data": {
44+
"text/plain": [
45+
"{'total_lines': 2579822,\n",
46+
" 'updated_lines': 221358,\n",
47+
" 'genes_processed': 58037,\n",
48+
" 'genes_updated': 21324}"
49+
]
50+
},
51+
"execution_count": 2,
52+
"metadata": {},
53+
"output_type": "execute_result"
54+
}
55+
],
56+
"source": [
57+
"# Update gene names in the GENCODE v25 GTF, using the GENCODE v47 annotation as the reference\n",
58+
"input_gtf = \"../data/genome_data/gencode.v25.annotation.gtf\"\n",
59+
"output_gtf = \"../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf\"\n",
60+
"reference_gtf = \"../data/genome_data/gencode.v47.annotation.gtf\"\n",
61+
"\n",
62+
"update_gencode_gene_names(\n",
63+
" input_gtf_path=input_gtf,\n",
64+
" output_gtf_path=output_gtf,\n",
65+
" reference_gtf_path=reference_gtf,\n",
66+
" verbose=True,\n",
1267
")"
1368
]
1469
},
@@ -21,7 +76,7 @@
2176
},
2277
{
2378
"cell_type": "code",
24-
"execution_count": 6,
79+
"execution_count": 3,
2580
"metadata": {},
2681
"outputs": [],
2782
"source": [
@@ -31,54 +86,53 @@
3186
},
3287
{
3388
"cell_type": "code",
34-
"execution_count": 7,
89+
"execution_count": 4,
3590
"metadata": {},
3691
"outputs": [
3792
{
3893
"name": "stdout",
3994
"output_type": "stream",
4095
"text": [
41-
"Fetching Ensembl reference data...\n",
42-
"Retrieved 48379 mappings\n",
96+
"Extracting gene mapping from GTF: ../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf\n",
97+
"Extracted 58037 unique gene ID to name mappings from GTF\n",
98+
"Retrieved 116029 gene name mappings\n",
4399
"\n",
44100
"Cleanup Summary:\n",
45101
" Total entries: 4926\n",
46-
" Invalid entries removed: 4\n",
102+
" Invalid entries removed: 0\n",
47103
" Duplicates removed: 38\n",
48104
" Gene names updated: 154\n",
49-
" Valid entries in final file: 4884\n"
105+
" Valid entries in final file: 4888\n"
50106
]
51107
},
52108
{
53109
"data": {
54110
"text/plain": [
55111
"{'total': 4926,\n",
56112
" 'invalid_format': 0,\n",
57-
" 'invalid_ensembl': 4,\n",
113+
" 'invalid_ensembl': 0,\n",
58114
" 'duplicates': 38,\n",
59115
" 'updated': 154,\n",
60-
" 'valid': 4884}"
116+
" 'valid': 4888}"
61117
]
62118
},
63-
"execution_count": 7,
119+
"execution_count": 4,
64120
"metadata": {},
65121
"output_type": "execute_result"
66122
}
67123
],
68124
"source": [
69-
"cleanup_bed(input_bed, output_bed, verbose=True)"
125+
"cleanup_bed(input_bed, output_bed, gtf_path=output_gtf, verbose=True)"
70126
]
71127
},
72128
{
73129
"cell_type": "code",
74-
"execution_count": 9,
130+
"execution_count": 5,
75131
"metadata": {},
76132
"outputs": [],
77133
"source": [
78134
"alt_isoforms = AlternativeIsoform()\n",
79-
"alt_isoforms.load_bed(\n",
80-
" \"../data/ribosome_profiling/full_truncations_JL_cleaned.bed\"\n",
81-
")\n",
135+
"alt_isoforms.load_bed(\"../data/ribosome_profiling/full_truncations_JL_cleaned.bed\")\n",
82136
"gene_list = alt_isoforms.get_gene_list()\n",
83137
"\n",
84138
"with open(\"../data/ribosome_profiling/gene_list.txt\", \"w\") as f:\n",
@@ -95,7 +149,7 @@
95149
},
96150
{
97151
"cell_type": "code",
98-
"execution_count": 4,
152+
"execution_count": 6,
99153
"metadata": {},
100154
"outputs": [],
101155
"source": [
@@ -105,54 +159,53 @@
105159
},
106160
{
107161
"cell_type": "code",
108-
"execution_count": 5,
162+
"execution_count": 7,
109163
"metadata": {},
110164
"outputs": [
111165
{
112166
"name": "stdout",
113167
"output_type": "stream",
114168
"text": [
115-
"Fetching Ensembl reference data...\n",
116-
"Retrieved 48379 mappings\n",
169+
"Extracting gene mapping from GTF: ../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf\n",
170+
"Extracted 58037 unique gene ID to name mappings from GTF\n",
171+
"Retrieved 116029 gene name mappings\n",
117172
"\n",
118173
"Cleanup Summary:\n",
119-
" Total entries: 28\n",
174+
" Total entries: 26\n",
120175
" Invalid entries removed: 0\n",
121176
" Duplicates removed: 0\n",
122177
" Gene names updated: 2\n",
123-
" Valid entries in final file: 28\n"
178+
" Valid entries in final file: 26\n"
124179
]
125180
},
126181
{
127182
"data": {
128183
"text/plain": [
129-
"{'total': 28,\n",
184+
"{'total': 26,\n",
130185
" 'invalid_format': 0,\n",
131186
" 'invalid_ensembl': 0,\n",
132187
" 'duplicates': 0,\n",
133188
" 'updated': 2,\n",
134-
" 'valid': 28}"
189+
" 'valid': 26}"
135190
]
136191
},
137-
"execution_count": 5,
192+
"execution_count": 7,
138193
"metadata": {},
139194
"output_type": "execute_result"
140195
}
141196
],
142197
"source": [
143-
"cleanup_bed(input_bed, output_bed, verbose=True)"
198+
"cleanup_bed(input_bed, output_bed, gtf_path=output_gtf, verbose=True)"
144199
]
145200
},
146201
{
147202
"cell_type": "code",
148-
"execution_count": 6,
203+
"execution_count": 8,
149204
"metadata": {},
150205
"outputs": [],
151206
"source": [
152207
"alt_isoforms = AlternativeIsoform()\n",
153-
"alt_isoforms.load_bed(\n",
154-
" \"../data/ribosome_profiling/selected_truncations_JL_cleaned.bed\"\n",
155-
")\n",
208+
"alt_isoforms.load_bed(\"../data/ribosome_profiling/selected_truncations_JL_cleaned.bed\")\n",
156209
"gene_list = alt_isoforms.get_gene_list()\n",
157210
"\n",
158211
"with open(\"../data/ribosome_profiling/gene_list_reduced.txt\", \"w\") as f:\n",

0 commit comments

Comments
 (0)