Skip to content

Commit 07cad8e

Browse files
authored
Add download of reference proteomes
1 parent bed7469 commit 07cad8e

File tree

3 files changed

+15
-2
lines changed

3 files changed

+15
-2
lines changed

.devcontainer/devcontainer.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
// "forwardPorts": [],
1414

1515
// Use 'postCreateCommand' to run commands after the container is created.
16-
"postCreateCommand": "apt update && apt -y install curl pv pigz uuid-runtime parallel lz4 gawk",
16+
"postCreateCommand": "apt update && apt -y install curl pv pigz uuid-runtime parallel lz4 gawk libxml2-utils",
1717

1818
// Configure tool-specific properties.
1919
// "customizations": {},

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,6 @@ scripts/helper_scripts/parser/output
1919
scripts/helper_scripts/parser/src/META-INF
2020
.idea/
2121
*.iml
22+
index/
23+
output/
24+
temp/

scripts/build_database.sh

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ EC_CLASS_URL="https://ftp.expasy.org/databases/enzyme/enzclass.txt"
278278
EC_NUMBER_URL="https://ftp.expasy.org/databases/enzyme/enzyme.dat"
279279
GO_TERM_URL="http://geneontology.org/ontology/go-basic.obo"
280280
INTERPRO_URL="http://ftp.ebi.ac.uk/pub/databases/interpro/current_release/entry.list"
281+
REFERENCE_PROTEOME_URL="https://rest.uniprot.org/proteomes/stream?fields=upid,organism_id,protein_count&format=tsv&query=(*)+AND+(proteome_type:1)"
281282

282283
### Utility functions required for the database construction process.
283284

@@ -374,7 +375,8 @@ extract_uniprot_version() {
374375
download_taxdmp() {
375376
# Check if our self-hosted version is available or not using the GitHub API
376377
LATEST_RELEASE_URL="https://api.github.com/repos/unipept/unipept-database/releases/latest"
377-
TAXDMP_RELEASE_ASSET_RE="unipept/unipept-database/releases/download/[^/]+/ncbi-taxdmp.zip"
378+
TAXDMP_RELEASE_ASSET_RE="unipept/unipept-database/releases/download/[^/]+/taxdmp.zip"
379+
378380
# Temporarily disable the errexit and pipefail checks (because egrep exits with code 1 if nothing is found).
379381
set +eo pipefail
380382
SELF_HOSTED_URL=$(curl -s "$LATEST_RELEASE_URL" | egrep -o "$TAXDMP_RELEASE_ASSET_RE")
@@ -783,6 +785,13 @@ fetch_interpro_entries() {
783785
log "Finished creating InterPro Entries."
784786
}
785787

788+
# Download the UniProt reference proteome listing and store it, row-numbered
# and compressed, at "$OUTPUT_DIR/reference_proteomes.tsv.lz4".
#
# Reads:  REFERENCE_PROTEOME_URL, OUTPUT_DIR, CMD_LZ4.
# Output: every line of the response except the first (presumably the TSV
#         header -- confirm against the API), each prefixed with its 1-based
#         line number and a tab.
fetch_reference_proteomes() {
    log "Started creating UniProt Reference Proteomes."
    mkdir -p "$OUTPUT_DIR"
    # --fail makes curl exit non-zero on an HTTP error instead of passing the
    # server's error page down the pipeline, where it would be numbered and
    # compressed as if it were proteome data. -S keeps the error message
    # visible even though -s silences the progress meter.
    # $CMD_LZ4 is left unquoted exactly as before (it may carry options --
    # TODO confirm where it is defined).
    curl -sS --fail "$REFERENCE_PROTEOME_URL" \
        | tail -n +2 \
        | cat -n \
        | sed 's/^ *//' \
        | $CMD_LZ4 - > "$OUTPUT_DIR/reference_proteomes.tsv.lz4"
    log "Finished creating UniProt Reference Proteomes."
}
794+
786795
#dot: uniprot_entries -> create_kmer_index
787796
#dot: taxons -> create_kmer_index
788797
#dot: create_kmer_index [shape=box,color="#4e79a7"]
@@ -916,6 +925,7 @@ suffix-array)
916925
fetch_ec_numbers
917926
fetch_go_terms
918927
fetch_interpro_entries
928+
fetch_reference_proteomes
919929
extract_uniprot_version
920930
;;
921931
esac

0 commit comments

Comments
 (0)