cheeseman-lab
diff --git a/‎.gitignore
Lines changed: 9 additions & 3 deletions b/‎.gitignore
Lines changed: 9 additions & 3 deletions
diff --git a/‎data/genome_data/download_genome.sh
Lines changed: 8 additions & 3 deletions b/‎data/genome_data/download_genome.sh
Lines changed: 8 additions & 3 deletions
diff --git a/‎notebooks/bed_cleanup.ipynb
Lines changed: 84 additions & 31 deletions b/‎notebooks/bed_cleanup.ipynb
Lines changed: 84 additions & 31 deletions
@@ -5,15 +5,21 @@ src/swissisoform.egg-info
 src/swissisoform/__pycache__
 
 # data
-data/genome_data/hg38.fa
-data/genome_data/hg38.ncbiRefSeq.gtf
+data/genome_data/*.fa
+data/genome_data/*.gtf
+data/genome_data/*.txt
 data/ribosome_profiling
 data/mutation_data
 
 # output folders
 notebooks/**/
+notebooks/*_test.ipynb
 
 # script outputs
 scripts/out
 scripts/results
-scripts/results_reduced
+scripts/results_reduced
+
+# macOS Finder metadata (.DS_Store), ignored in every directory
+.DS_Store
+scripts/.DS_Store
@@ -1,7 +1,12 @@
 #!/bin/bash
-UCSC_BASE="https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips"
 
-wget ${UCSC_BASE}/hg38.fa.gz
-wget ${UCSC_BASE}/genes/hg38.ncbiRefSeq.gtf.gz
+# GENCODE v25 genome + annotation; the v47 annotation supplies current gene names.
+GENCODE_BASE="https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_25"
+wget "${GENCODE_BASE}/GRCh38.p7.genome.fa.gz"
+wget "${GENCODE_BASE}/gencode.v25.annotation.gtf.gz"
+
+GENCODE_BASE_LATEST="https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47"
+wget "${GENCODE_BASE_LATEST}/gencode.v47.annotation.gtf.gz"
 
 gunzip *.gz
+rm -f -- *.gz
@@ -2,13 +2,68 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "from swissisoform.alternative_isoforms import AlternativeIsoform\n",
-    "from swissisoform.utils import (\n",
-    "    cleanup_bed,\n",
+    "from swissisoform.utils import cleanup_bed, update_gencode_gene_names"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### GTF cleanup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Creating gene ID to name mappings from reference GTF: ../data/genome_data/gencode.v47.annotation.gtf\n",
+      "Extracted 57992 gene names from GENCODE GTF\n",
+      "Extracted 78724 gene names from reference GTF\n",
+      "Created 21316 gene name updates\n",
+      "\n",
+      "GTF Update Summary:\n",
+      "  Total lines processed: 2579822\n",
+      "  Genes processed: 58037\n",
+      "  Genes with updated names: 21324\n",
+      "  Total lines updated: 221358\n",
+      "  Output saved to: ../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'total_lines': 2579822,\n",
+       " 'updated_lines': 221358,\n",
+       " 'genes_processed': 58037,\n",
+       " 'genes_updated': 21324}"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Clean the GTF file using the current Ensembl gene names\n",
+    "input_gtf = \"../data/genome_data/gencode.v25.annotation.gtf\"\n",
+    "output_gtf = \"../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf\"\n",
+    "reference_gtf = \"../data/genome_data/gencode.v47.annotation.gtf\"\n",
+    "\n",
+    "update_gencode_gene_names(\n",
+    "    input_gtf_path=input_gtf,\n",
+    "    output_gtf_path=output_gtf,\n",
+    "    reference_gtf_path=reference_gtf,\n",
+    "    verbose=True,\n",
     ")"
    ]
   },
@@ -21,7 +76,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -31,54 +86,53 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Fetching Ensembl reference data...\n",
-      "Retrieved 48379 mappings\n",
+      "Extracting gene mapping from GTF: ../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf\n",
+      "Extracted 58037 unique gene ID to name mappings from GTF\n",
+      "Retrieved 116029 gene name mappings\n",
       "\n",
       "Cleanup Summary:\n",
       "  Total entries: 4926\n",
-      "  Invalid entries removed: 4\n",
+      "  Invalid entries removed: 0\n",
       "  Duplicates removed: 38\n",
       "  Gene names updated: 154\n",
-      "  Valid entries in final file: 4884\n"
+      "  Valid entries in final file: 4888\n"
      ]
     },
     {
      "data": {
       "text/plain": [
        "{'total': 4926,\n",
        " 'invalid_format': 0,\n",
-       " 'invalid_ensembl': 4,\n",
+       " 'invalid_ensembl': 0,\n",
        " 'duplicates': 38,\n",
        " 'updated': 154,\n",
-       " 'valid': 4884}"
+       " 'valid': 4888}"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "cleanup_bed(input_bed, output_bed, verbose=True)"
+    "cleanup_bed(input_bed, output_bed, gtf_path=output_gtf, verbose=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
     "alt_isoforms = AlternativeIsoform()\n",
-    "alt_isoforms.load_bed(\n",
-    "    \"../data/ribosome_profiling/full_truncations_JL_cleaned.bed\"\n",
-    ")\n",
+    "alt_isoforms.load_bed(\"../data/ribosome_profiling/full_truncations_JL_cleaned.bed\")\n",
     "gene_list = alt_isoforms.get_gene_list()\n",
     "\n",
     "with open(\"../data/ribosome_profiling/gene_list.txt\", \"w\") as f:\n",
@@ -95,7 +149,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -105,54 +159,53 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Fetching Ensembl reference data...\n",
-      "Retrieved 48379 mappings\n",
+      "Extracting gene mapping from GTF: ../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf\n",
+      "Extracted 58037 unique gene ID to name mappings from GTF\n",
+      "Retrieved 116029 gene name mappings\n",
       "\n",
       "Cleanup Summary:\n",
-      "  Total entries: 28\n",
+      "  Total entries: 26\n",
       "  Invalid entries removed: 0\n",
       "  Duplicates removed: 0\n",
       "  Gene names updated: 2\n",
-      "  Valid entries in final file: 28\n"
+      "  Valid entries in final file: 26\n"
      ]
     },
     {
      "data": {
       "text/plain": [
-       "{'total': 28,\n",
+       "{'total': 26,\n",
        " 'invalid_format': 0,\n",
        " 'invalid_ensembl': 0,\n",
        " 'duplicates': 0,\n",
        " 'updated': 2,\n",
-       " 'valid': 28}"
+       " 'valid': 26}"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "cleanup_bed(input_bed, output_bed, verbose=True)"
+    "cleanup_bed(input_bed, output_bed, gtf_path=output_gtf, verbose=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
     "alt_isoforms = AlternativeIsoform()\n",
-    "alt_isoforms.load_bed(\n",
-    "    \"../data/ribosome_profiling/selected_truncations_JL_cleaned.bed\"\n",
-    ")\n",
+    "alt_isoforms.load_bed(\"../data/ribosome_profiling/selected_truncations_JL_cleaned.bed\")\n",
     "gene_list = alt_isoforms.get_gene_list()\n",
     "\n",
     "with open(\"../data/ribosome_profiling/gene_list_reduced.txt\", \"w\") as f:\n",