diff --git a/DESCRIPTION b/DESCRIPTION index 34057d9..5011bd5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: CuratedAtlasQueryR Title: Queries the Human Cell Atlas -Version: 1.3.7 +Version: 1.4.7 Authors@R: c( person( "Stefano", @@ -126,7 +126,7 @@ biocViews: Transcription, Transcriptomics Encoding: UTF-8 -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 LazyDataCompression: xz URL: https://github.com/stemangiola/CuratedAtlasQueryR BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues diff --git a/R/metadata.R b/R/metadata.R index 7d5de48..5bb6482 100644 --- a/R/metadata.R +++ b/R/metadata.R @@ -10,7 +10,8 @@ cache <- rlang::env( ) #' Returns the URLs for all metadata files -#' @param databases A character vector specifying the names of the metadata files. Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet") +#' @param databases A character vector specifying the names of the metadata files. +#' Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet", "prostate.0.1.0.parquet") #' @export #' @return A character vector of URLs to parquet files to download #' @examples @@ -20,7 +21,8 @@ cache <- rlang::env( #' immune system across age, sex and ethnicity." bioRxiv (2023): 2023-06. #' doi:10.1101/2023.06.08.542671. #' @source [Mangiola et al.,2023](https://www.biorxiv.org/content/10.1101/2023.06.08.542671v3) -get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")) { +get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet", + "prostate.0.1.0.parquet")) { glue::glue( "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/{databases}") } diff --git a/dev/census_accepted_assays.csv b/dev/census_accepted_assays.csv new file mode 100644 index 0000000..3488442 --- /dev/null +++ b/dev/census_accepted_assays.csv @@ -0,0 +1,99 @@ +id,assay +EFO:0003755,FL-cDNA +EFO:0008640,3'T-fill +EFO:0008641,3’-end-seq +EFO:0008643,3′-Seq +EFO:0008661,Bru-Seq +EFO:0008669,CAGEscan +EFO:0008673,CapSeq +EFO:0008675,CaptureSeq +EFO:0008679,CEL-seq +EFO:0008694,ClickSeq +EFO:0008697,cP-RNA-Seq +EFO:0008708,DeepCAGE +EFO:0008710,Digital RNA +EFO:0008718,DP-Seq +EFO:0008720,DroNc-seq +EFO:0008722,Drop-seq +EFO:0008735,FACS-seq +EFO:0008747,FRISCR +EFO:0008748,FRT-Seq +EFO:0008752,GMUCT 1.0 +EFO:0008753,GMUCT 2.0 +EFO:0008756,GRO-CAP +EFO:0008763,Hi-SCL +EFO:0008780,inDrop +EFO:0008796,MARS-seq +EFO:0008797,MATQ-seq +EFO:0008824,NanoCAGE +EFO:0008825,Nanogrid RNA-Seq +EFO:0008826,NET-Seq +EFO:0008850,PAS-Seq +EFO:0008859,PEAT +EFO:0008863,PLATE-Seq +EFO:0008868,PRO-cap +EFO:0008869,PRO-seq +EFO:0008877,Quartz-seq +EFO:0008896,RNA-Seq +EFO:0008897,RNAtag-Seq +EFO:0008898,RNET-seq +EFO:0008903,SC3-seq +EFO:0008908,SCI-seq +EFO:0008919,Seq-Well +EFO:0008929,SMA +EFO:0008930,Smart-seq +EFO:0008931,Smart-seq2 +EFO:0008937,snDrop-seq +EFO:0008941,sNuc-Seq +EFO:0008945,SPET-seq +EFO:0008953,STRT-seq +EFO:0008954,STRT-seq-2i +EFO:0008956,SUPeR-seq +EFO:0008962,TARDIS +EFO:0008966,TCR Chain Paring +EFO:0008967,TCR-LA-MC PCR +EFO:0008972,TL-seq +EFO:0008974,Tomo-Seq +EFO:0008975,TRAP-Seq +EFO:0008978,TSS Sequencing +EFO:0008980,UMI Method +EFO:0009309,Div-Seq +EFO:0009899,10x 3' v2 +EFO:0009900,10x 5' v2 +EFO:0009901,10x 3' v1 +EFO:0009919,SPLiT-seq +EFO:0009922,10x 3' v3 +EFO:0009991,Nuc-Seq +EFO:0010003,RASL-seq +EFO:0010004,SCRB-seq +EFO:0010010,CEL-seq2 +EFO:0010022,Smart-3Seq +EFO:0010034,Cappable-Seq +EFO:0010041,Nascent-Seq +EFO:0010058,Fluidigm C1-based 
library preparation
+EFO:0010184,Smart-like
+EFO:0010550,sci-RNA-seq
+EFO:0010713,10x immune profiling
+EFO:0010714,10x TCR enrichment
+EFO:0010715,10x Ig enrichment
+EFO:0010964,barcoded plate-based single cell RNA-seq
+EFO:0011025,10x 5' v1
+EFO:0022396,TruSeq
+EFO:0022488,Smart-seq3
+EFO:0022490,ScaleBio single cell RNA sequencing
+EFO:0030002,microwell-seq
+EFO:0030003,10x 3' transcription profiling
+EFO:0030004,10x 5' transcription profiling
+EFO:0030019,Seq-Well S3
+EFO:0030021,Nx1-seq
+EFO:0030028,sci-RNA-seq3
+EFO:0030030,Quant-seq
+EFO:0030031,SCOPE-chip
+EFO:0030061,mcSCRB-seq
+EFO:0030074,SORT-seq
+EFO:0030078,droplet-based single-cell RNA library preparation
+EFO:0700003,BD Rhapsody Whole Transcriptome Analysis
+EFO:0700004,BD Rhapsody Targeted mRNA
+EFO:0700010,TruDrop
+EFO:0700011,GEXSCOPE technology
+EFO:0700016,Smart-seq v4
\ No newline at end of file
diff --git a/dev/execute_hpcell_on_census_and_defining_data_tranformation.R b/dev/execute_hpcell_on_census_and_defining_data_tranformation.R
new file mode 100644
index 0000000..2e552ca
--- /dev/null
+++ b/dev/execute_hpcell_on_census_and_defining_data_tranformation.R
@@ -0,0 +1,206 @@
+library(glue)
+library(dplyr)
+library(arrow)
+library(SingleCellExperiment)
+library(tibble)
+library(SummarizedExperiment)
+library(purrr)
+library(zellkonverter)
+library(tidyr)
+library(ggplot2)
+library(plotly)
+library(targets)
+library(stringr)
+library(CuratedAtlasQueryR)
+library(fs)
+library(HPCell)
+library(crew.cluster)
+
+directory = "/home/users/allstaff/shen.m/scratch/Census_rerun/split_h5ad_based_on_sample_id/"
+sample_anndata <- dir(glue("{directory}"), full.names = TRUE)
+downloaded_samples_tbl <- read_parquet("/home/users/allstaff/shen.m/scratch/Census_rerun/census_samples_to_download_groups.parquet")
+downloaded_samples_tbl <- downloaded_samples_tbl |>
+  rename(cell_number = list_length) |>
+  mutate(cell_number = cell_number |> as.integer(),
+         file_name = glue("{directory}{sample_2}.h5ad") |> as.character(),
+         tier = case_when(
+           cell_number < 500 ~ "tier_1",
+           cell_number >= 500 & cell_number < 1000 ~ "tier_2",
+           cell_number >= 1000 & cell_number < 10000 ~ "tier_3",
+           cell_number >= 10000 ~ "tier_4"
+         ))
+
+result_directory = "/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024"
+sample_meta <- tar_read(metadata_dataset_id_common_sample_columns, store = glue("{result_directory}/_targets"))
+sample_tbl = downloaded_samples_tbl |>
+  left_join(get_metadata() |>
+              select(dataset_id, contains("norm")) |>
+              distinct() |>
+              filter(!is.na(x_normalization)) |>
+              as_tibble(),
+            by = "dataset_id")
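+
+# Sanity check (illustrative only, not part of the pipeline): the mapping below
+# picks a back-transformation for each reported normalisation, so natural-log
+# data is inverted with exp() and log1p data with expm1():
+# all.equal(exp(log(c(1, 3, 10))), c(1, 3, 10))      # TRUE
+# all.equal(expm1(log1p(c(0, 3, 10))), c(0, 3, 10))  # TRUE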
"exp", + transform_method %in% c("log1p", "log1p, base e", "Scanpy", + "scanpy.api.pp.normalize_per_cell method, scaling factor 10000") ~ "expm1", + transform_method == "log1p, base 2" ~ "expm1", + transform_method == "NORMAL" ~ "exp", + transform_method == "COUNT" ~ "identity", + is.na(transform_method) ~ "identity" + ) ) |> + mutate(comment = case_when(str_like(x_normalization, "Counts%") ~ "a checkpoint for max value of Assay must <= 50", + is.na(x_normalization) & is.na(x_approximate_distribution) ~ "round negative value to 0", + x_normalization == "normalized" ~ "round negative value to 0" + )) + +sample_tbl <- sample_tbl |> mutate(transformation_function = map( + method_to_apply, + ~ ( function(data) { + assay_name <- data@assays |> names() |> magrittr::extract2(1) + counts <- assay(data, assay_name) + density_est <- counts |> as.matrix() |> density() + mode_value <- density_est$x[which.max(density_est$y)] + if (mode_value < 0 ) counts <- counts + abs(mode_value) + + # Scale max counts to 20 to avoid any downstream failure + if (.x != "identity" && (max(counts) > 20)) { + scale_factor = 20 / max(counts) + counts <- counts * scale_factor} + + counts <- transform_method(counts) + # round counts to avoid potential substraction error due to different digits print out + counts <- counts |> round(5) + majority_gene_counts = names(which.max(table(as.vector(counts)))) |> as.numeric() + if (majority_gene_counts != 0) { + counts <- counts - majority_gene_counts + } + + # Avoid downstream failures negative counts + if((counts[,1:min(10000, ncol(counts))] |> min()) < 0) + counts[counts < 0] <- 0 + + # Assign counts back to data + assay(data, assay_name) <- counts + + col_sums <- colSums(counts) + # Drop all zero cells + data <- data[, col_sums > 0] + + # Avoid downstream binding error + rowData(data) = NULL + + data + + }) |> + # Meta programming, replacing the transformation programmatically + substitute( env = list(transform_method = as.name(.x))) |> + # Evaluate back to a working function + eval() + )) + +#sample_tbl |> saveRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds") +sample_tbl <- readRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds") + +# Set the parent directory where the subdirectories will be created +# parent_dir <- "~/scratch/Census_rerun/" +# +# # Directory names to create +# dir_names <- paste0("run", 1:25) +# +# # Full paths of the directories +# full_dir_paths <- file.path(parent_dir, dir_names) +# +# # Create each directory if it does not exist +# for (dir_path in full_dir_paths) { +# if (!dir_exists(dir_path)) { +# dir_create(dir_path) +# } +# } + +# Run 1000 samples per run. 
+
+# sample_tbl |> saveRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds")
+sample_tbl <- readRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds")
+
+# Set the parent directory where the subdirectories will be created
+# parent_dir <- "~/scratch/Census_rerun/"
+#
+# # Directory names to create
+# dir_names <- paste0("run", 1:25)
+#
+# # Full paths of the directories
+# full_dir_paths <- file.path(parent_dir, dir_names)
+#
+# # Create each directory if it does not exist
+# for (dir_path in full_dir_paths) {
+#   if (!dir_exists(dir_path)) {
+#     dir_create(dir_path)
+#   }
+# }
+
+# Run 1000 samples per run. Save the logs and results in the corresponding store.
+store = "~/scratch/Census_rerun/run3/"
+setwd(glue("{store}"))
+sliced_sample_tbl = sample_tbl |>
+  slice(2001:3000) |>
+  select(file_name, tier, cell_number, dataset_id, sample_2, transformation_function)
+
+# Named vector of input file paths, named by sample identifier
+sample_names <- sliced_sample_tbl |> pull(file_name) |> set_names(sliced_sample_tbl |> pull(sample_2))
+
+sample_names |>
+  initialise_hpc(
+    gene_nomenclature = "ensembl",
+    data_container_type = "anndata",
+    store = store,
+    tier = sliced_sample_tbl |> pull(tier),
+    computing_resources = list(
+      crew_controller_slurm(
+        name = "tier_1",
+        script_lines = "#SBATCH --mem 35G",
+        slurm_cpus_per_task = 1,
+        workers = 200,
+        tasks_max = 1,
+        verbose = TRUE
+      ),
+      crew_controller_slurm(
+        name = "tier_2",
+        script_lines = "#SBATCH --mem 60G",
+        slurm_cpus_per_task = 1,
+        workers = 50,
+        tasks_max = 1,
+        verbose = TRUE
+      ),
+      crew_controller_slurm(
+        name = "tier_3",
+        script_lines = "#SBATCH --mem 90G",
+        slurm_cpus_per_task = 1,
+        workers = 25,
+        tasks_max = 1,
+        verbose = TRUE
+      ),
+      crew_controller_slurm(
+        name = "tier_4",
+        script_lines = "#SBATCH --mem 100G",
+        slurm_cpus_per_task = 1,
+        workers = 14,
+        tasks_max = 1,
+        verbose = TRUE
+      )
+    )
+  ) |>
+  tranform_assay(fx = sliced_sample_tbl |> pull(transformation_function),
+                 target_output = "sce_transformed") |>
+
+  # Remove empty outliers based on the RNA count threshold per cell
+  remove_empty_threshold(target_input = "sce_transformed", RNA_feature_threshold = 200) |>
+
+  # Remove dead cells
+  remove_dead_scuttle(target_input = "sce_transformed") |>
+
+  # Score cell cycle
+  score_cell_cycle_seurat(target_input = "sce_transformed") |>
+
+  # Remove doublets
+  remove_doublets_scDblFinder(target_input = "sce_transformed") |>
+
+  # Annotate cell types
+  annotate_cell_type(target_input = "sce_transformed", azimuth_reference = "pbmcref") |>
+
+  normalise_abundance_seurat_SCT(
+    factors_to_regress = c("subsets_Mito_percent", "subsets_Ribo_percent", "G2M.Score"),
+    target_input = "sce_transformed"
+  )
diff --git a/dev/split_census_anndata_base_on_sample_id.R b/dev/split_census_anndata_base_on_sample_id.R
new file mode 100644
index 0000000..502e1bf
--- /dev/null
+++ b/dev/split_census_anndata_base_on_sample_id.R
@@ -0,0 +1,134 @@
+library(targets)
+library(zellkonverter)
+library(tibble)
+library(dplyr)
+library(SummarizedExperiment)
+library(tidybulk)
+library(tidySingleCellExperiment)
+library(stringr)
+library(arrow)
+# Needed by rename_features() below
+library(ensembldb)
+library(EnsDb.Hsapiens.v86)
+
+anndata_path_based_on_dataset_id_to_read <- "~/scratch/Census_rerun/h5ad/"
+anndata_path_based_on_sample_id_to_save <- "~/scratch/Census_rerun/split_h5ad_based_on_sample_id/"
+
+files <- list.files(anndata_path_based_on_dataset_id_to_read, pattern = "\\.h5ad$",
+                    full.names = TRUE)
+
+save_data <- function(data, file_name) {
+  filename <- paste0(file.path(anndata_path_based_on_sample_id_to_save), file_name, ".h5ad")
+  zellkonverter::writeH5AD(data, file = filename, compression = "gzip")
+  # Return the path so that targets declared with format = "file" can track it
+  filename
+}
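+
+# Usage sketch (illustrative only; the sample name is hypothetical): save_data()
+# is called from the `sliced_sce` target below, which declares format = "file",
+# so it must return the path of the .h5ad it wrote for {targets} to hash:
+# sce <- zellkonverter::readH5AD(files[[1]], use_hdf5 = TRUE)
+# save_data(sce, "example_sample")
+# #> "~/scratch/Census_rerun/split_h5ad_based_on_sample_id/example_sample.h5ad"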
+
+#' Subset samples from a dataset based on specified observation join IDs.
+#' Reads a single-cell expression dataset stored in the HDF5 AnnData format,
+#' removes unneeded metadata to reduce memory use, and extracts the subset of
+#' cells matching the provided observation join IDs.
+subset_samples <- function(dataset_id, observation_joinid, sample_id) {
+
+  # Construct the file path to the dataset
+  file_path <- paste0(anndata_path_based_on_dataset_id_to_read, dataset_id, ".h5ad")
+
+  # Read the dataset from the HDF5 file using the zellkonverter package
+  sce <- zellkonverter::readH5AD(file_path, use_hdf5 = TRUE, reader = "R")
+
+  # Extract the base name of the file and remove the '.h5ad' extension to get dataset_id
+  sce$dataset_id = file_path |> basename() |> str_remove("\\.h5ad$")
+
+  # Add the original cell identifiers from the column names to the dataset
+  sce$observation_originalid = colnames(sce)
+
+  # Create a new identifier by combining the original cell id with the dataset id
+  cell_modified = paste(sce$observation_originalid, sce$dataset_id, sep = "___")
+
+  # Update the column names with the new combined identifier
+  colnames(sce) <- cell_modified
+
+  # Clear the metadata to reduce memory overhead
+  S4Vectors::metadata(sce) <- list()
+
+  # Add the sample identifier to the dataset
+  sce <- sce |> mutate(sample_id = !!sample_id)
+
+  # Identify cells that match the observation_joinid
+  cells_to_subset <- which(colData(sce)$observation_joinid %in% unlist(observation_joinid))
+
+  # Subset and return the SingleCellExperiment object with only the selected cells
+  sce[, cells_to_subset]
+}
+
+rename_features <- function(data) {
+  edb <- EnsDb.Hsapiens.v86
+  edb_df <- genes(edb,
+                  columns = c("gene_name", "entrezid", "gene_biotype"),
+                  filter = GeneIdFilter("ENSG", condition = "startsWith"),
+                  return.type = "data.frame")
+
+  gene_names_map <- rowData(data) |>
+    as.data.frame() |>
+    tibble::rownames_to_column(var = "gene_id") |>
+    dplyr::select(gene_id, feature_name) |>
+    mutate(current_gene_names = gsub("_", "-", feature_name)) |>
+    dplyr::left_join(edb_df, by = "gene_id") |>
+    mutate(modified_gene_names = ifelse(stringr::str_like(current_gene_names, "%ENSG0%"), gene_name, current_gene_names))
+
+  gene_names_map_tidy <- gene_names_map |>
+    dplyr::filter(!is.na(modified_gene_names))
+
+  # Drop Ensembl IDs that cannot be converted to gene symbols
+  rownames_to_delete <- gene_names_map |>
+    dplyr::filter(is.na(modified_gene_names)) |> pull(gene_id)
+  data <- data[!rownames(data) %in% rownames_to_delete, ]
+
+  rownames(data) <- gene_names_map_tidy$modified_gene_names
+  # Access the first assay by name and rename its features to match
+  assay_name <- data@assays |> names() |> magrittr::extract2(1)
+  counts_slot <- assay(data, assay_name)
+  rownames(counts_slot) <- gene_names_map_tidy$modified_gene_names
+
+  SummarizedExperiment::rowData(data)$feature_name <- NULL
+  SummarizedExperiment::rowData(data)$gene_name <- gene_names_map_tidy$modified_gene_names
+
+  data
+}
+
+computing_resources = crew.cluster::crew_controller_slurm(
+  slurm_memory_gigabytes_per_cpu = 30,
+  slurm_cpus_per_task = 1,
+  workers = 75,
+  verbose = TRUE
+)
+tar_option_set(
+  memory = "transient",
+  garbage_collection = TRUE,
+  storage = "worker",
+  retrieval = "worker",
+  format = "qs",
+  controller = computing_resources
+)
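+
+# Dynamic branching sketch (illustrative only): `pattern = map(...)` in the
+# `sliced_sce` target below slices the parquet table row-wise, so each branch
+# processes one sample and writes one .h5ad file. A minimal analogue:
+# tar_target(rows, data.frame(n = 1:3))
+# tar_target(doubled, rows$n * 2, pattern = map(rows))  # three branches: 2, 4, 6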
+
+list(
+  tar_target(
+    file_paths,
+    list.files(anndata_path_based_on_dataset_id_to_read, pattern = "\\.h5ad$",
+               full.names = TRUE),
+    deployment = "main"
+  ),
+  tar_target(
+    grouped_observation_joinid_per_sample,
+    read_parquet("~/scratch/Census_rerun/census_samples_to_download_groups.parquet")
+  ),
+  tar_target(
+    sliced_sce,
+    subset_samples(grouped_observation_joinid_per_sample$dataset_id,
+                   grouped_observation_joinid_per_sample$observation_joinid,
+                   grouped_observation_joinid_per_sample$sample_2) |>
+      save_data(file_name = grouped_observation_joinid_per_sample$sample_2),
+    pattern = map(grouped_observation_joinid_per_sample),
+    format = "file"
+  )
+)
+
+# tar_make(store = "~/scratch/Census_rerun/split_h5ad_based_on_sample_id_target_store/",
+#          script = "~/git_control/CuratedAtlasQueryR/dev/split_census_anndata_base_on_sample_id.R",
+#          reporter = "summary")
diff --git a/man/get_database_url.Rd b/man/get_database_url.Rd
index 2962d1b..5c43517 100644
--- a/man/get_database_url.Rd
+++ b/man/get_database_url.Rd
@@ -8,11 +8,13 @@
 }
 \usage{
 get_database_url(
-  databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")
+  databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet",
+    "prostate.0.1.0.parquet")
 )
 }
 \arguments{
-\item{databases}{A character vector specifying the names of the metadata files. Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")}
+\item{databases}{A character vector specifying the names of the metadata files.
+Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet", "prostate.0.1.0.parquet")}
 }
 \value{
 A character vector of URLs to parquet files to download
diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R
index 4a2cb1b..a6e4355 100755
--- a/tests/testthat/test-query.R
+++ b/tests/testthat/test-query.R
@@ -253,4 +253,16 @@ test_that("get_pseudobulk() syncs appropriate fixed file", {
    expect_gt(1)
 })
+test_that("get_single_cell_experiment() syncs prostate atlas", {
+  temp <- tempfile()
+  # A sample from the prostate atlas
+  sample <- "GSM4089151"
+  meta <- get_metadata(cache_directory = temp) |> filter(sample_ == sample)
+  sce <- meta |> get_single_cell_experiment(cache_directory = temp)
+  sce |>
+    row.names() |>
+    length() |>
+    expect_gt(1)
+})
+