Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEGA-137-Custom-segment-arg-for-Landscape-view #64

Draft
wants to merge 17 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 182 additions & 21 deletions src/celldega/pre/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@
import os
import hashlib
import base64
import tifffile
from shapely.affinity import affine_transform
from shapely.geometry import MultiPolygon

import matplotlib.pyplot as plt
from matplotlib.colors import to_hex

import json
import shutil

from .landscape import *
from .trx_tile import *
Expand Down Expand Up @@ -209,7 +211,7 @@ def make_meta_cell_image_coord(
path_transformation_matrix,
path_meta_cell_micron,
path_meta_cell_image,
image_scale
image_scale=1
):
"""
Apply an affine transformation to the cell coordinates in microns and save
Expand All @@ -234,14 +236,14 @@ def make_meta_cell_image_coord(
--------
>>> make_meta_cell_image_coord(
... technology='Xenium',
... path_transformation_matrix='data/transformation_matrix.txt',
... path_transformation_matrix='data/xenium_transform.txt',
... path_meta_cell_micron='data/meta_cell_micron.csv',
... path_meta_cell_image='data/meta_cell_image.parquet'
... )

"""

transformation_matrix = pd.read_csv(
xenium_transform = pd.read_csv(
path_transformation_matrix, header=None, sep=" "
).values

Expand All @@ -250,20 +252,28 @@ def make_meta_cell_image_coord(
meta_cell = convert_long_id_to_short(meta_cell)
meta_cell["name"] = meta_cell["cell_id"]
meta_cell = meta_cell.set_index('cell_id')

elif technology == "Xenium":
usecols = ["cell_id", "x_centroid", "y_centroid"]
meta_cell = pd.read_csv(path_meta_cell_micron, index_col=0, usecols=usecols)
meta_cell.columns = ["center_x", "center_y"]
meta_cell["name"] = pd.Series(meta_cell.index, index=meta_cell.index)

elif technology == "custom":
meta_cell = gpd.read_parquet(path_meta_cell_micron)
meta_cell['center_x'] = meta_cell.centroid.x
meta_cell['center_y'] = meta_cell.centroid.y
meta_cell["name"] = pd.Series(meta_cell.index, index=meta_cell.index)
meta_cell.drop(['area', 'centroid'], axis=1, inplace=True)

# Adding a ones column to accommodate for affine transformation
meta_cell["ones"] = 1

# Preparing the data for matrix multiplication
points = meta_cell[["center_x", "center_y", "ones"]].values

# Applying the transformation matrix
transformed_points = np.dot(transformation_matrix, points.T).T
transformed_points = np.dot(xenium_transform, points.T).T

# Updating the DataFrame with transformed coordinates
meta_cell["center_x"] = transformed_points[:, 0]
Expand Down Expand Up @@ -322,7 +332,10 @@ def make_meta_gene(technology, path_cbg, path_output):
# genes = pd.read_csv(path_cbg + 'features.tsv.gz', sep='\t', header=None)[1].values.tolist()
cbg = read_cbg_mtx(path_cbg)
genes = cbg.columns.tolist()

elif technology == "custom":
cbg = pd.read_parquet(path_cbg)
genes = cbg.columns.tolist()

# Get all categorical color palettes from Matplotlib and flatten them into a single list of colors
palettes = [plt.get_cmap(name).colors for name in plt.colormaps() if "tab" in name]
flat_colors = [color for palette in palettes for color in palette]
Expand Down Expand Up @@ -377,31 +390,179 @@ def get_max_zoom_level(path_image_pyramid):
return max_pyramid_zoom


def clustering(path_landscape_files, segmentation_parameters, cbg):
    """
    Build per-cell cluster assignments, cluster metadata (color + count), and
    per-cluster mean-expression gene signatures, saved under the
    LandscapeFiles directory with a per-approach suffix.

    Parameters
    ----------
    path_landscape_files : str
        Root LandscapeFiles directory. Must contain
        ``analysis/clustering/gene_expression_graphclust/clusters.csv``,
        ``cell_metadata.parquet``, and a ``cell_clusters/`` directory.
    segmentation_parameters : dict
        Must contain the key ``'segmentation_approach'``, used to suffix all
        output file names so multiple segmentations can coexist.
    cbg : pandas.DataFrame
        Cell-by-gene matrix (cells as rows, genes as columns); may have
        sparse or dense columns.

    Returns
    -------
    None
        Writes ``cell_clusters/cluster_<approach>.parquet``,
        ``cell_clusters/meta_cluster_<approach>.parquet`` and
        ``df_sig_<approach>.parquet``.
    """
    approach = segmentation_parameters['segmentation_approach']

    # Read the graph-based clustering once (previously this file was read twice).
    path_clusters_csv = os.path.join(
        path_landscape_files,
        'analysis/clustering/gene_expression_graphclust/clusters.csv',
    )
    raw_clustering = pd.read_csv(path_clusters_csv, index_col=0)

    default_clustering_ini = pd.DataFrame(
        raw_clustering.values,
        index=raw_clustering.index.tolist(),
        columns=['cluster'],
    )
    default_clustering_ini['cluster'] = default_clustering_ini['cluster'].astype('string')

    # Align cluster labels to the full cell-metadata index; cells without an
    # assignment are left as NaN.
    meta_cell = pd.read_parquet(os.path.join(path_landscape_files, 'cell_metadata.parquet'))
    default_clustering = pd.DataFrame(index=meta_cell.index.tolist())
    default_clustering.loc[default_clustering_ini.index.tolist(), 'cluster'] = (
        default_clustering_ini['cluster']
    )
    default_clustering.to_parquet(
        os.path.join(path_landscape_files, f"cell_clusters/cluster_{approach}.parquet")
    )

    # Assign a categorical color to each cluster; "Blank" clusters get white.
    ser_counts = default_clustering['cluster'].value_counts()
    clusters = ser_counts.index.tolist()
    palettes = [plt.get_cmap(name).colors for name in plt.colormaps() if "tab" in name]
    flat_colors = [color for palette in palettes for color in palette]
    flat_colors_hex = [to_hex(color) for color in flat_colors]

    colors = [
        flat_colors_hex[i % len(flat_colors_hex)] if "Blank" not in cluster else "#FFFFFF"
        for i, cluster in enumerate(clusters)
    ]

    meta_cluster = pd.DataFrame(pd.Series(colors, index=clusters, name='color'))
    meta_cluster['count'] = ser_counts
    meta_cluster.to_parquet(
        os.path.join(path_landscape_files, f"cell_clusters/meta_cluster_{approach}.parquet")
    )

    # Mean expression per cluster (reuses the clustering read above; index
    # alignment matches the original two-read behavior).
    meta_cell['cluster'] = default_clustering_ini['cluster']

    list_ser = []
    for inst_cat in meta_cell['cluster'].unique().tolist():
        if inst_cat is not None:
            inst_cells = meta_cell[meta_cell['cluster'] == inst_cat].index.tolist()
            inst_ser = cbg.loc[inst_cells].sum() / len(inst_cells)
            inst_ser.name = inst_cat
            list_ser.append(inst_ser)

    # Single concat (this line was accidentally duplicated before).
    df_sig = pd.concat(list_ser, axis=1)
    df_sig.columns = df_sig.columns.tolist()
    df_sig.index = df_sig.index.tolist()

    # Drop control / unassigned probes from the signature matrix.
    keep_genes = [
        x for x in df_sig.index.tolist()
        if 'Unassigned' not in x
        and 'NegControl' not in x
        and 'DeprecatedCodeword' not in x
    ]
    df_sig = df_sig.loc[keep_genes, clusters]

    # Densify only sparse columns: calling ``.sparse`` on an already-dense
    # frame (e.g. a custom cbg loaded from Parquet) raises AttributeError.
    df_sig = df_sig.apply(
        lambda col: col.sparse.to_dense() if isinstance(col.dtype, pd.SparseDtype) else col
    )
    df_sig.to_parquet(os.path.join(path_landscape_files, f"df_sig_{approach}.parquet"))

def save_landscape_parameters(
    technology, path_landscape_files, image_name="dapi_files", tile_size=1000,
    image_info=None, image_format='.webp', segmentation_approach="default"
):
    """
    Save (or update) the landscape parameters JSON file, keyed by
    segmentation approach so multiple segmentations can be registered.

    Parameters
    ----------
    technology : str
        Technology identifier stored in the parameters entry.
    path_landscape_files : str
        Root LandscapeFiles directory; the JSON is written to
        ``<path>/landscape_parameters.json``.
    image_name : str, optional
        Subdirectory of ``pyramid_images`` used to determine the max zoom.
    tile_size : int, optional
        Image tile size recorded in the entry.
    image_info : dict, optional
        Extra image metadata; defaults to an empty dict (``None`` avoids a
        shared mutable default).
    image_format : str, optional
        Image file extension, e.g. ``'.webp'``.
    segmentation_approach : str, optional
        Key under which this entry is stored in the JSON.

    Returns
    -------
    None
    """
    if image_info is None:
        image_info = {}

    # Compute the pyramid zoom up front: the previous version only computed
    # it in the "file does not exist" branch, so updating an existing
    # landscape_parameters.json raised NameError on max_pyramid_zoom.
    path_image_pyramid = f"{path_landscape_files}/pyramid_images/{image_name}"
    print(path_image_pyramid)
    max_pyramid_zoom = get_max_zoom_level(path_image_pyramid)

    path_landscape_parameters = f"{path_landscape_files}/landscape_parameters.json"

    # Start from the existing file when present so other approaches' entries
    # are preserved; otherwise start fresh.
    if os.path.isdir(path_landscape_files) and os.path.exists(path_landscape_parameters):
        with open(path_landscape_parameters, "r") as file:
            landscape_parameters = json.load(file)
    else:
        landscape_parameters = {}

    landscape_parameters[segmentation_approach] = {
        "technology": technology,
        "max_pyramid_zoom": max_pyramid_zoom,
        "tile_size": tile_size,
        "image_info": image_info,
        "image_format": image_format,
    }

    with open(path_landscape_parameters, "w") as file:
        json.dump(landscape_parameters, file, indent=4)

def add_custom_segmentation(path_landscape_files, path_segmentation_files, image_scale=1, tile_size=250):
    """
    Add a custom segmentation result to an existing LandscapeFiles directory.

    Reads ``segmentation_parameters.json`` from ``path_segmentation_files``
    and suffixes every derived output with its ``segmentation_approach``
    value, so the custom segmentation sits alongside the default one.

    Parameters
    ----------
    path_landscape_files : str
        Existing LandscapeFiles directory; must already contain
        ``xenium_transform.csv`` and ``cell_metadata.parquet``.
    path_segmentation_files : str
        Directory with the custom segmentation outputs:
        ``segmentation_parameters.json``, ``cell_by_gene_matrix.parquet``,
        ``cell_metadata_micron_space.parquet``, ``transcripts.parquet``,
        ``cell_polygons.parquet``.
    image_scale : float, optional
        Scale factor applied when mapping micron coordinates to image space.
    tile_size : int, optional
        Tile size used for transcript and cell-boundary tiles.

    Returns
    -------
    None
        All results are written beneath ``path_landscape_files``.
    """

    # Segmentation metadata (technology, segmentation_approach, ...) drives
    # the naming of every derived file below.
    with open(f"{path_segmentation_files}/segmentation_parameters.json", "r") as file:
        segmentation_parameters = json.load(file)

    # Gene metadata for this approach (e.g. meta_gene_<approach>.parquet).
    make_meta_gene(technology=segmentation_parameters['technology'],
                   path_cbg=os.path.join(path_segmentation_files, "cell_by_gene_matrix.parquet"),
                   path_output=os.path.join(path_landscape_files, f"meta_gene_{segmentation_parameters['segmentation_approach']}.parquet"))

    cbg_custom = pd.read_parquet(os.path.join(path_segmentation_files, "cell_by_gene_matrix.parquet"))

    # Per-gene Parquet files go into a suffixed directory (cbg_<approach>)
    # so the default ``cbg`` directory is left untouched.
    save_cbg_gene_parquets(path_landscape_files,
                           cbg=cbg_custom,
                           verbose=True,
                           custom_segmentation_approach=f"_{segmentation_parameters['segmentation_approach']}")

    # Map cell centroids from micron to image space.
    # NOTE(review): assumes a Xenium-style transform file is present even for
    # custom technologies — confirm this holds for all inputs.
    make_meta_cell_image_coord(technology = segmentation_parameters['technology'],
                               path_transformation_matrix = os.path.join(path_landscape_files, 'xenium_transform.csv'),
                               path_meta_cell_micron = os.path.join(path_segmentation_files, 'cell_metadata_micron_space.parquet'),
                               path_meta_cell_image = os.path.join(path_landscape_files, 'cell_metadata.parquet'),
                               image_scale=image_scale)

    # Transcript tiles; the returned bounds are reused below so boundary
    # tiles line up with transcript tiles.
    tile_bounds = make_trx_tiles(technology = segmentation_parameters['technology'],
                                 path_trx = os.path.join(path_segmentation_files, 'transcripts.parquet'),
                                 path_transformation_matrix = os.path.join(path_landscape_files, 'xenium_transform.csv'),
                                 path_trx_tiles = os.path.join(path_landscape_files, 'transcript_tiles'),
                                 tile_size=tile_size,
                                 image_scale=image_scale)

    # Cell-boundary tiles, saved under a per-approach directory.
    make_cell_boundary_tiles(technology = segmentation_parameters['technology'],
                             path_cell_boundaries = os.path.join(path_segmentation_files, "cell_polygons.parquet"),
                             path_meta_cell_micron = os.path.join(path_segmentation_files, 'cell_metadata_micron_space.parquet'),
                             path_transformation_matrix = os.path.join(path_landscape_files, 'xenium_transform.csv'),
                             path_output = os.path.join(path_landscape_files, f"cell_segmentation_{segmentation_parameters['segmentation_approach']}"),
                             tile_size=tile_size,
                             tile_bounds=tile_bounds,
                             image_scale=image_scale)

    # Cluster assignments, cluster colors/counts, and gene signatures.
    clustering(path_landscape_files=path_landscape_files,
               segmentation_parameters=segmentation_parameters,
               cbg=cbg_custom)

    # Register this approach in landscape_parameters.json.
    # NOTE(review): tile_size=1000 here is the image tile size, distinct from
    # this function's data tile_size argument — confirm this is intended.
    save_landscape_parameters(technology=segmentation_parameters['technology'],
                              path_landscape_files=path_landscape_files,
                              image_name="dapi_files",
                              tile_size=1000, image_format='.webp',
                              segmentation_approach=segmentation_parameters['segmentation_approach'])

max_pyramid_zoom = get_max_zoom_level(path_image_pyramid)
__all__ = ["landscape", "trx_tile", "boundary_tile"]

landscape_parameters = {
"technology": technology,
"max_pyramid_zoom": max_pyramid_zoom,
"tile_size": tile_size,
"image_info": image_info,
"image_format": image_format
}

path_landscape_parameters = f"{path_landscape_files}/landscape_parameters.json"
"""
This has three checklist items:

with open(path_landscape_parameters, "w") as file:
json.dump(landscape_parameters, file, indent=4)
The files meta_gene.parquet and gene_metadata.parquet are identical; saving the same data twice is a bug.

You'll need a new cell_metadata.parquet, df_sig.parquet, meta_gene.parquet, cbg directory (maybe cbg_my-seg-method)
But not transcript tiles

__all__ = ["landscape", "trx_tile", "boundary_tile"]
the cell_clusters directory and files too -
I would do something like cell_clusters_my-seg-method and leave the files within the same.

1 make a dega.pre method for adding custom segmentation results into the LandscapeFiles
(you can decide where to save them - there will be a new cbg file for instance called cbg_some-custom-segmentation-name),

2 decide on the organization of the LandscapeFiles (just create new adjacent files so we don't break any backwards compatibility -
we can reorganize this when we do a 1.0 release), and

3 make an argument in the Landscape method that lets the user select
which segmentation approach to visualize (the landscape_parameters.json can also have a default segmentation result set up to
establish a default behavior when no argument is given)

"""
5 changes: 4 additions & 1 deletion src/celldega/pre/boundary_tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,10 @@ def make_cell_boundary_tiles(
if not os.path.exists(path_output):
os.makedirs(path_output)

cells_orig = get_cell_polygons(technology, path_cell_boundaries, path_output, path_meta_cell_micron)
if technology == 'custom':
cells_orig = gpd.read_parquet(path_cell_boundaries)
else:
cells_orig = get_cell_polygons(technology, path_cell_boundaries, path_output, path_meta_cell_micron)

# Transform geometries
cells_orig["GEOMETRY"] = batch_transform_geometries(cells_orig["geometry"], transformation_matrix, image_scale)
Expand Down
27 changes: 20 additions & 7 deletions src/celldega/pre/landscape.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,18 @@
# read_cbg_mtx : Read the cell-by-gene matrix from the mtx files.
# save_cbg_gene_parquets : Save the cell-by-gene matrix as gene-specific Parquet files.
# =============================================================================
def to_dense(series):
    """Return a dense copy of ``series`` when it is sparse; otherwise return it unchanged."""
    if isinstance(series.dtype, pd.SparseDtype):
        return series.sparse.to_dense()
    return series

def to_dense_if_sparse(df):
    """Densify any sparse columns of ``df`` in place and return the same frame."""
    sparse_cols = [c for c in df.columns if isinstance(df[c].dtype, pd.SparseDtype)]
    for c in sparse_cols:
        df[c] = df[c].sparse.to_dense()
    return df

def calc_meta_gene_data(cbg):
"""
Expand Down Expand Up @@ -75,14 +87,14 @@ def convert_to_dense(series):
proportion_nonzero = (cbg != 0).sum(axis=0) / len(cbg)

# Create a DataFrame to hold all these metrics

meta_gene = pd.DataFrame(
{
"mean": mean_expression.sparse.to_dense(),
"mean": to_dense(mean_expression),
"std": std_deviation,
"max": max_expression.sparse.to_dense(),
"non-zero": proportion_nonzero.sparse.to_dense(),
"max": to_dense(max_expression),
"non-zero": to_dense(proportion_nonzero),
}

)

meta_gene_clean = pd.DataFrame(meta_gene.values, index=meta_gene.index.tolist(), columns=meta_gene.columns)
Expand Down Expand Up @@ -127,7 +139,7 @@ def read_cbg_mtx(base_path):

return cbg

def save_cbg_gene_parquets(base_path, cbg, verbose=False):
def save_cbg_gene_parquets(base_path, cbg, verbose=False, custom_segmentation_approach=""):
"""
Save the cell-by-gene matrix as gene-specific Parquet files.

Expand All @@ -144,7 +156,7 @@ def save_cbg_gene_parquets(base_path, cbg, verbose=False):
-------
None
"""
output_dir = os.path.join(base_path, "cbg")
output_dir = os.path.join(base_path, f"cbg{custom_segmentation_approach}")
os.makedirs(output_dir, exist_ok=True)

for index, gene in enumerate(cbg.columns):
Expand All @@ -155,7 +167,8 @@ def save_cbg_gene_parquets(base_path, cbg, verbose=False):
col_df = cbg[[gene]].copy()

# Convert to dense and integer type
col_df = col_df.sparse.to_dense().astype(int)

col_df = to_dense_if_sparse(col_df).astype(int)

# Create a DataFrame necessary to prevent error in to_parquet
inst_df = pd.DataFrame(
Expand Down
13 changes: 12 additions & 1 deletion src/celldega/pre/trx_tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,21 @@ def transform_transcript_coordinates(technology, path_trx, chunk_size, transform
pl.col("y_location").alias("y")
])

elif technology == "custom":
trx_ini = pl.read_parquet(path_trx).select([
pl.col("cell_index"),
pl.col("transcript_index"),
pl.col("gene").alias("name"),
pl.col("x"),
pl.col("y")
])


# Process the data in chunks and apply transformations
all_chunks = []

for start_row in tqdm(range(0, trx_ini.height, chunk_size), desc="Processing chunks"):

chunk = trx_ini.slice(start_row, chunk_size)

# Apply transformation matrix to the coordinates
Expand Down Expand Up @@ -163,7 +174,7 @@ def make_trx_tiles(
os.makedirs(path_trx_tiles)

transformation_matrix = np.loadtxt(path_transformation_matrix)

trx = transform_transcript_coordinates(technology, path_trx, chunk_size, transformation_matrix, image_scale)

# Get min and max x, y values
Expand Down
Loading