@@ -278,6 +278,7 @@ EC_CLASS_URL="https://ftp.expasy.org/databases/enzyme/enzclass.txt"
278
278
# Remote locations of the reference data sets. Values must not contain
# leading or trailing whitespace: they are handed to the downloader verbatim.
EC_NUMBER_URL="https://ftp.expasy.org/databases/enzyme/enzyme.dat"
GO_TERM_URL="http://geneontology.org/ontology/go-basic.obo"
INTERPRO_URL="http://ftp.ebi.ac.uk/pub/databases/interpro/current_release/entry.list"
# UniProt proteomes stream, requested as TSV with the columns upid,
# organism_id and protein_count, filtered on proteome_type:1
# (presumably "reference proteome", going by the variable name -- confirm
# against the UniProt API documentation).
REFERENCE_PROTEOME_URL="https://rest.uniprot.org/proteomes/stream?fields=upid,organism_id,protein_count&format=tsv&query=(*)+AND+(proteome_type:1)"
281
282
282
283
# ## Utility functions required for the database construction process.
283
284
@@ -374,7 +375,8 @@ extract_uniprot_version() {
374
375
download_taxdmp () {
375
376
# Check if our self-hosted version is available or not using the GitHub API
376
377
LATEST_RELEASE_URL=" https://api.github.com/repos/unipept/unipept-database/releases/latest"
377
- TAXDMP_RELEASE_ASSET_RE=" unipept/unipept-database/releases/download/[^/]+/ncbi-taxdmp.zip"
378
+ TAXDMP_RELEASE_ASSET_RE=" unipept/unipept-database/releases/download/[^/]+/taxdmp.zip"
379
+
378
380
# Temporary disable the pipefail check (cause egrep can exit with code 1 if nothing is found).
379
381
set +eo pipefail
380
382
SELF_HOSTED_URL=$( curl -s " $LATEST_RELEASE_URL " | egrep -o " $TAXDMP_RELEASE_ASSET_RE " )
@@ -783,6 +785,13 @@ fetch_interpro_entries() {
783
785
log " Finished creating InterPro Entries."
784
786
}
785
787
788
# Download the UniProt reference proteome table and store it as a compressed
# TSV file.
#
# Globals (read):
#   REFERENCE_PROTEOME_URL  URL that is fetched with curl.
#   OUTPUT_DIR              Directory receiving the result (created if missing).
#   CMD_LZ4                 Compression command; it is invoked with "-" as its
#                           argument and receives the rows on stdin.
# Output:
#   $OUTPUT_DIR/reference_proteomes.tsv.lz4 -- the downloaded rows without the
#   first (header) line, each prefixed with a 1-based row number and a tab.
fetch_reference_proteomes() {
    log "Started creating UniProt Reference Proteomes."
    mkdir -p "$OUTPUT_DIR"
    # --fail makes curl exit non-zero on an HTTP error instead of passing the
    # server's error page down the pipeline as if it were data; --show-error
    # keeps the reason visible even though --silent hides the progress meter.
    # `cat -n` numbers the rows and `sed` strips the padding it adds in front
    # of the number.
    # CMD_LZ4 is deliberately left unquoted so that it may carry options
    # (assumed to be "command plus flags" -- verify where it is defined).
    # shellcheck disable=SC2086
    curl --fail --silent --show-error "$REFERENCE_PROTEOME_URL" \
        | tail -n +2 \
        | cat -n \
        | sed 's/^ *//' \
        | $CMD_LZ4 - > "$OUTPUT_DIR/reference_proteomes.tsv.lz4"
    log "Finished creating UniProt Reference Proteomes."
}
794
+
786
795
# dot: uniprot_entries -> create_kmer_index
787
796
# dot: taxons -> create_kmer_index
788
797
# dot: create_kmer_index [shape=box,color="#4e79a7"]
@@ -916,6 +925,7 @@ suffix-array)
916
925
fetch_ec_numbers
917
926
fetch_go_terms
918
927
fetch_interpro_entries
928
+ fetch_reference_proteomes
919
929
extract_uniprot_version
920
930
;;
921
931
esac
0 commit comments