|
2 | 2 | "cells": [
|
3 | 3 | {
|
4 | 4 | "cell_type": "code",
|
5 |
| - "execution_count": 3, |
| 5 | + "execution_count": 1, |
6 | 6 | "metadata": {},
|
7 | 7 | "outputs": [],
|
8 | 8 | "source": [
|
9 | 9 | "from swissisoform.alternative_isoforms import AlternativeIsoform\n",
|
10 |
| - "from swissisoform.utils import (\n", |
11 |
| - " cleanup_bed,\n", |
| 10 | + "from swissisoform.utils import cleanup_bed, update_gencode_gene_names" |
| 11 | + ] |
| 12 | + }, |
| 13 | + { |
| 14 | + "cell_type": "markdown", |
| 15 | + "metadata": {}, |
| 16 | + "source": [ |
| 17 | + "### GTF cleanup" |
| 18 | + ] |
| 19 | + }, |
| 20 | + { |
| 21 | + "cell_type": "code", |
| 22 | + "execution_count": 2, |
| 23 | + "metadata": {}, |
| 24 | + "outputs": [ |
| 25 | + { |
| 26 | + "name": "stdout", |
| 27 | + "output_type": "stream", |
| 28 | + "text": [ |
| 29 | + "Creating gene ID to name mappings from reference GTF: ../data/genome_data/gencode.v47.annotation.gtf\n", |
| 30 | + "Extracted 57992 gene names from GENCODE GTF\n", |
| 31 | + "Extracted 78724 gene names from reference GTF\n", |
| 32 | + "Created 21316 gene name updates\n", |
| 33 | + "\n", |
| 34 | + "GTF Update Summary:\n", |
| 35 | + " Total lines processed: 2579822\n", |
| 36 | + " Genes processed: 58037\n", |
| 37 | + " Genes with updated names: 21324\n", |
| 38 | + " Total lines updated: 221358\n", |
| 39 | + " Output saved to: ../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf\n" |
| 40 | + ] |
| 41 | + }, |
| 42 | + { |
| 43 | + "data": { |
| 44 | + "text/plain": [ |
| 45 | + "{'total_lines': 2579822,\n", |
| 46 | + " 'updated_lines': 221358,\n", |
| 47 | + " 'genes_processed': 58037,\n", |
| 48 | + " 'genes_updated': 21324}" |
| 49 | + ] |
| 50 | + }, |
| 51 | + "execution_count": 2, |
| 52 | + "metadata": {}, |
| 53 | + "output_type": "execute_result" |
| 54 | + } |
| 55 | + ], |
| 56 | + "source": [ |
| 57 | + "# Clean the GTF file using the current Ensembl gene names\n", |
| 58 | + "input_gtf = \"../data/genome_data/gencode.v25.annotation.gtf\"\n", |
| 59 | + "output_gtf = \"../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf\"\n", |
| 60 | + "reference_gtf = \"../data/genome_data/gencode.v47.annotation.gtf\"\n", |
| 61 | + "\n", |
| 62 | + "update_gencode_gene_names(\n", |
| 63 | + " input_gtf_path=input_gtf,\n", |
| 64 | + " output_gtf_path=output_gtf,\n", |
| 65 | + " reference_gtf_path=reference_gtf,\n", |
| 66 | + " verbose=True,\n", |
12 | 67 | ")"
|
13 | 68 | ]
|
14 | 69 | },
|
|
21 | 76 | },
|
22 | 77 | {
|
23 | 78 | "cell_type": "code",
|
24 |
| - "execution_count": 6, |
| 79 | + "execution_count": 3, |
25 | 80 | "metadata": {},
|
26 | 81 | "outputs": [],
|
27 | 82 | "source": [
|
|
31 | 86 | },
|
32 | 87 | {
|
33 | 88 | "cell_type": "code",
|
34 |
| - "execution_count": 7, |
| 89 | + "execution_count": 4, |
35 | 90 | "metadata": {},
|
36 | 91 | "outputs": [
|
37 | 92 | {
|
38 | 93 | "name": "stdout",
|
39 | 94 | "output_type": "stream",
|
40 | 95 | "text": [
|
41 |
| - "Fetching Ensembl reference data...\n", |
42 |
| - "Retrieved 48379 mappings\n", |
| 96 | + "Extracting gene mapping from GTF: ../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf\n", |
| 97 | + "Extracted 58037 unique gene ID to name mappings from GTF\n", |
| 98 | + "Retrieved 116029 gene name mappings\n", |
43 | 99 | "\n",
|
44 | 100 | "Cleanup Summary:\n",
|
45 | 101 | " Total entries: 4926\n",
|
46 |
| - " Invalid entries removed: 4\n", |
| 102 | + " Invalid entries removed: 0\n", |
47 | 103 | " Duplicates removed: 38\n",
|
48 | 104 | " Gene names updated: 154\n",
|
49 |
| - " Valid entries in final file: 4884\n" |
| 105 | + " Valid entries in final file: 4888\n" |
50 | 106 | ]
|
51 | 107 | },
|
52 | 108 | {
|
53 | 109 | "data": {
|
54 | 110 | "text/plain": [
|
55 | 111 | "{'total': 4926,\n",
|
56 | 112 | " 'invalid_format': 0,\n",
|
57 |
| - " 'invalid_ensembl': 4,\n", |
| 113 | + " 'invalid_ensembl': 0,\n", |
58 | 114 | " 'duplicates': 38,\n",
|
59 | 115 | " 'updated': 154,\n",
|
60 |
| - " 'valid': 4884}" |
| 116 | + " 'valid': 4888}" |
61 | 117 | ]
|
62 | 118 | },
|
63 |
| - "execution_count": 7, |
| 119 | + "execution_count": 4, |
64 | 120 | "metadata": {},
|
65 | 121 | "output_type": "execute_result"
|
66 | 122 | }
|
67 | 123 | ],
|
68 | 124 | "source": [
|
69 |
| - "cleanup_bed(input_bed, output_bed, verbose=True)" |
| 125 | + "cleanup_bed(input_bed, output_bed, gtf_path=output_gtf, verbose=True)" |
70 | 126 | ]
|
71 | 127 | },
|
72 | 128 | {
|
73 | 129 | "cell_type": "code",
|
74 |
| - "execution_count": 9, |
| 130 | + "execution_count": 5, |
75 | 131 | "metadata": {},
|
76 | 132 | "outputs": [],
|
77 | 133 | "source": [
|
78 | 134 | "alt_isoforms = AlternativeIsoform()\n",
|
79 |
| - "alt_isoforms.load_bed(\n", |
80 |
| - " \"../data/ribosome_profiling/full_truncations_JL_cleaned.bed\"\n", |
81 |
| - ")\n", |
| 135 | + "alt_isoforms.load_bed(\"../data/ribosome_profiling/full_truncations_JL_cleaned.bed\")\n", |
82 | 136 | "gene_list = alt_isoforms.get_gene_list()\n",
|
83 | 137 | "\n",
|
84 | 138 | "with open(\"../data/ribosome_profiling/gene_list.txt\", \"w\") as f:\n",
|
|
95 | 149 | },
|
96 | 150 | {
|
97 | 151 | "cell_type": "code",
|
98 |
| - "execution_count": 4, |
| 152 | + "execution_count": 6, |
99 | 153 | "metadata": {},
|
100 | 154 | "outputs": [],
|
101 | 155 | "source": [
|
|
105 | 159 | },
|
106 | 160 | {
|
107 | 161 | "cell_type": "code",
|
108 |
| - "execution_count": 5, |
| 162 | + "execution_count": 7, |
109 | 163 | "metadata": {},
|
110 | 164 | "outputs": [
|
111 | 165 | {
|
112 | 166 | "name": "stdout",
|
113 | 167 | "output_type": "stream",
|
114 | 168 | "text": [
|
115 |
| - "Fetching Ensembl reference data...\n", |
116 |
| - "Retrieved 48379 mappings\n", |
| 169 | + "Extracting gene mapping from GTF: ../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf\n", |
| 170 | + "Extracted 58037 unique gene ID to name mappings from GTF\n", |
| 171 | + "Retrieved 116029 gene name mappings\n", |
117 | 172 | "\n",
|
118 | 173 | "Cleanup Summary:\n",
|
119 |
| - " Total entries: 28\n", |
| 174 | + " Total entries: 26\n", |
120 | 175 | " Invalid entries removed: 0\n",
|
121 | 176 | " Duplicates removed: 0\n",
|
122 | 177 | " Gene names updated: 2\n",
|
123 |
| - " Valid entries in final file: 28\n" |
| 178 | + " Valid entries in final file: 26\n" |
124 | 179 | ]
|
125 | 180 | },
|
126 | 181 | {
|
127 | 182 | "data": {
|
128 | 183 | "text/plain": [
|
129 |
| - "{'total': 28,\n", |
| 184 | + "{'total': 26,\n", |
130 | 185 | " 'invalid_format': 0,\n",
|
131 | 186 | " 'invalid_ensembl': 0,\n",
|
132 | 187 | " 'duplicates': 0,\n",
|
133 | 188 | " 'updated': 2,\n",
|
134 |
| - " 'valid': 28}" |
| 189 | + " 'valid': 26}" |
135 | 190 | ]
|
136 | 191 | },
|
137 |
| - "execution_count": 5, |
| 192 | + "execution_count": 7, |
138 | 193 | "metadata": {},
|
139 | 194 | "output_type": "execute_result"
|
140 | 195 | }
|
141 | 196 | ],
|
142 | 197 | "source": [
|
143 |
| - "cleanup_bed(input_bed, output_bed, verbose=True)" |
| 198 | + "cleanup_bed(input_bed, output_bed, gtf_path=output_gtf, verbose=True)" |
144 | 199 | ]
|
145 | 200 | },
|
146 | 201 | {
|
147 | 202 | "cell_type": "code",
|
148 |
| - "execution_count": 6, |
| 203 | + "execution_count": 8, |
149 | 204 | "metadata": {},
|
150 | 205 | "outputs": [],
|
151 | 206 | "source": [
|
152 | 207 | "alt_isoforms = AlternativeIsoform()\n",
|
153 |
| - "alt_isoforms.load_bed(\n", |
154 |
| - " \"../data/ribosome_profiling/selected_truncations_JL_cleaned.bed\"\n", |
155 |
| - ")\n", |
| 208 | + "alt_isoforms.load_bed(\"../data/ribosome_profiling/selected_truncations_JL_cleaned.bed\")\n", |
156 | 209 | "gene_list = alt_isoforms.get_gene_list()\n",
|
157 | 210 | "\n",
|
158 | 211 | "with open(\"../data/ribosome_profiling/gene_list_reduced.txt\", \"w\") as f:\n",
|
|
0 commit comments