diff --git a/js/deck-gl/cell_layer.js b/js/deck-gl/cell_layer.js
index 36b99819..6b890353 100644
--- a/js/deck-gl/cell_layer.js
+++ b/js/deck-gl/cell_layer.js
@@ -90,7 +90,13 @@ const cell_layer_onclick = async (info, d, deck_ist, layers_obj, viz_state) => {
 
 export const ini_cell_layer = async (base_url, viz_state) => {
 
-    const cell_url = base_url + `/cell_metadata.parquet`;
+    let cell_url
+    if (viz_state.seg.version === 'default'){
+        cell_url = base_url + `/cell_metadata.parquet`;
+    } else {
+        cell_url = base_url + '/cell_metadata_' + viz_state.seg.version + '.parquet';
+    }
+
     var cell_arrow_table = await get_arrow_table(cell_url, options.fetch)
 
     set_cell_names_array(viz_state.cats, cell_arrow_table)
@@ -105,7 +111,9 @@ export const ini_cell_layer = async (base_url, viz_state) => {
         viz_state.cats.cell_cats = viz_state.cats.cell_names_array.map(name => viz_state.cats.meta_cell[name])
     } else {
         // default clustering
-        var cluster_arrow_table = await get_arrow_table(base_url + `/cell_clusters/cluster.parquet`, options.fetch)
+
+        var cluster_arrow_table = await get_arrow_table(base_url + `/cell_clusters${viz_state.seg.version && viz_state.seg.version !== 'default' ? '_' + viz_state.seg.version : ''}/cluster.parquet`,
+            options.fetch)
         set_cell_cats(viz_state.cats, cluster_arrow_table, 'cluster')
     }
 
diff --git a/js/viz/landscape_ist.js b/js/viz/landscape_ist.js
index 00db4592..13e57842 100644
--- a/js/viz/landscape_ist.js
+++ b/js/viz/landscape_ist.js
@@ -49,9 +49,10 @@ export const landscape_ist = async (
     meta_cluster={},
     umap={},
     landscape_state='spatial',
+    segmentation='default',
     view_change_custom_callback=null
 ) => {
 
     if (width === 0){
         width = '100%'
     }
@@ -240,6 +241,10 @@ export const landscape_ist = async (
     viz_state.edit.visible = false
     viz_state.edit.modify_index = null
 
+    // starting to set up custom segmentation support
+    viz_state.seg = {}
+    viz_state.seg.version = segmentation
+
     if (Object.keys(viz_state.model).length !== 0){
 
         if (Object.keys(viz_state.model.get('region')).length === 0) {
diff --git a/js/widget.js b/js/widget.js
index 2e36de2c..29226641 100644
--- a/js/widget.js
+++ b/js/widget.js
@@ -34,6 +34,7 @@ export const render_landscape_ist = async ({ model, el }) => {
     const meta_cluster = model.get('meta_cluster')
     const umap = model.get('umap')
     const landscape_state = model.get('landscape_state')
+    const segmentation = model.get('segmentation')
 
     return landscape_ist(
         el,
@@ -51,7 +52,8 @@ export const render_landscape_ist = async ({ model, el }) => {
         meta_cell,
         meta_cluster,
         umap,
-        landscape_state
+        landscape_state,
+        segmentation
     )
 }
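Note: the suffix convention above ('_<segmentation>' appended to cell_metadata, cell_clusters, etc., with 'default' meaning no suffix) is mirrored on the Python side of this PR. A minimal sketch of the convention for reference — the helper name `seg_suffix` and the URL are illustrative, not part of the codebase:

```python
# Hypothetical helper illustrating the shared suffix convention.
def seg_suffix(segmentation: str) -> str:
    """'default' maps to no suffix; anything else becomes '_<name>'."""
    return "" if segmentation == "default" else f"_{segmentation}"

base_url = "https://example.com/landscape_files"  # placeholder

print(f"{base_url}/cell_metadata{seg_suffix('default')}.parquet")
# -> .../cell_metadata.parquet
print(f"{base_url}/cell_metadata{seg_suffix('custom_seg')}.parquet")
# -> .../cell_metadata_custom_seg.parquet
```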
"packages": { "": { "name": "celldega", -<<<<<<< HEAD - "version": "0.5.0", -======= - "version": "0.5.4", ->>>>>>> main + "version": "0.8.2", "dependencies": { "@deck.gl-community/editable-layers": "^9.0.3", "@loaders.gl/core": "^4.2.0", diff --git a/src/celldega/clust/__init__.py b/src/celldega/clust/__init__.py index a9b5b2f8..1bc5e766 100644 --- a/src/celldega/clust/__init__.py +++ b/src/celldega/clust/__init__.py @@ -31,6 +31,7 @@ import numpy as np import pandas as pd +import os from copy import deepcopy from . import initialize_net @@ -47,6 +48,7 @@ from . import categories from scipy.stats import ttest_ind, mannwhitneyu +from matplotlib.colors import to_hex from sklearn.metrics import pairwise_distances, roc_curve, auc from scipy.spatial.distance import pdist from sklearn.metrics import confusion_matrix @@ -57,6 +59,11 @@ import ipywidgets as widgets import statsmodels.stats.multitest as smm +from spatialdata_io import xenium +import scanpy as sc +import squidpy as sq +import spatialdata as sd + def hc(df, filter_N_top=None, norm_col='total', norm_row='zscore'): """ @@ -98,6 +105,197 @@ def hc(df, filter_N_top=None, norm_col='total', norm_row='zscore'): return network +def calc_cluster_signatures(path_landscape_files, segmentation_parameters, cbg, use_default_clustering=False, use_custom_clustering=False): + + os.makedirs(os.path.join(path_landscape_files, + f"cell_clusters{'_' + segmentation_parameters['segmentation_approach'] if segmentation_parameters['segmentation_approach'] else ''}"), + exist_ok=True) + + if not use_default_clustering: + if use_custom_clustering: + + if segmentation_parameters['technology'] == 'custom' or segmentation_parameters['technology'] == 'Xenium': + sdata = xenium(os.path.dirname(path_landscape_files)) + # could add merscope functionality later on + + adata = sdata.tables["table"] + + sc.pp.calculate_qc_metrics(adata, percent_top=(10, 20, 50, 150), inplace=True) + cprobes = ( + adata.obs["control_probe_counts"].sum() / adata.obs["total_counts"].sum() * 100) + cwords = ( + adata.obs["control_codeword_counts"].sum() / adata.obs["total_counts"].sum() * 100) + + sc.pp.filter_cells(adata, min_counts=10) + sc.pp.filter_genes(adata, min_cells=5) + + adata.layers["counts"] = adata.X.copy() + sc.pp.normalize_total(adata, inplace=True) + sc.pp.log1p(adata) + sc.pp.pca(adata) + sc.pp.neighbors(adata) + sc.tl.umap(adata) + sc.tl.leiden(adata) + + adata.obs.set_index('cell_index', inplace=True) + + meta_cell = adata.obs['leiden'].to_dict() + + clusters = adata.obs['leiden'].cat.categories.tolist() + colors = adata.uns['leiden_colors'] + ser_counts = adata.obs['leiden'].value_counts() + ser_color = pd.Series(colors, index=clusters, name='color') + meta_cluster_df = pd.DataFrame(ser_color) + meta_cluster_df['count'] = ser_counts + + meta_cluster_df.index = [str(x) for x in meta_cluster_df.index] + meta_cluster = meta_cluster_df.to_dict(orient='index') + + meta_cluster.to_parquet(os.path.join(path_landscape_files, f"cell_clusters_{segmentation_parameters['segmentation_approach']}/meta_cluster.parquet")) + + inst_cells = cbg.index.tolist() + df_cluster = pd.DataFrame(index=inst_cells) + df_cluster['cluster'] = pd.Series('0', index=inst_cells) + df_cluster.to_parquet(os.path.join(path_landscape_files, f"cell_clusters_{segmentation_parameters['segmentation_approach']}/cluster.parquet")) + + adata.obs['leiden'] = adata.obs['leiden'].astype('string') + + list_ser = [] + for inst_cat in adata.obs['leiden'].unique(): + if inst_cat is not None: + inst_cells = 
diff --git a/src/celldega/clust/__init__.py b/src/celldega/clust/__init__.py
index a9b5b2f8..1bc5e766 100644
--- a/src/celldega/clust/__init__.py
+++ b/src/celldega/clust/__init__.py
@@ -31,6 +31,7 @@
 
 import numpy as np
 import pandas as pd
+import os
 from copy import deepcopy
 
 from . import initialize_net
@@ -47,6 +48,8 @@ from . import categories
 
 from scipy.stats import ttest_ind, mannwhitneyu
+import matplotlib.pyplot as plt
+from matplotlib.colors import to_hex
 from sklearn.metrics import pairwise_distances, roc_curve, auc
 from scipy.spatial.distance import pdist
 from sklearn.metrics import confusion_matrix
@@ -57,6 +60,9 @@
 import ipywidgets as widgets
 import statsmodels.stats.multitest as smm
 
+from spatialdata_io import xenium
+import scanpy as sc
+
 
 def hc(df, filter_N_top=None, norm_col='total', norm_row='zscore'):
     """
@@ -98,6 +104,196 @@ def hc(df, filter_N_top=None, norm_col='total', norm_row='zscore'):
 
     return network
 
+def calc_cluster_signatures(path_landscape_files, segmentation_parameters, cbg, use_default_clustering=False, use_custom_clustering=False):
+    """Calculate per-cluster gene expression signatures and save the cluster
+    assignments, cluster metadata, and signature matrix as Parquet files."""
+
+    os.makedirs(os.path.join(path_landscape_files,
+        f"cell_clusters{'_' + segmentation_parameters['segmentation_approach'] if segmentation_parameters['segmentation_approach'] else ''}"),
+        exist_ok=True)
+
+    if not use_default_clustering:
+        if use_custom_clustering:
+
+            if segmentation_parameters['technology'] == 'custom' or segmentation_parameters['technology'] == 'Xenium':
+                sdata = xenium(os.path.dirname(path_landscape_files))
+                # could add merscope functionality later on
+
+            adata = sdata.tables["table"]
+
+            sc.pp.calculate_qc_metrics(adata, percent_top=(10, 20, 50, 150), inplace=True)
+            cprobes = (
+                adata.obs["control_probe_counts"].sum() / adata.obs["total_counts"].sum() * 100)
+            cwords = (
+                adata.obs["control_codeword_counts"].sum() / adata.obs["total_counts"].sum() * 100)
+
+            sc.pp.filter_cells(adata, min_counts=10)
+            sc.pp.filter_genes(adata, min_cells=5)
+
+            adata.layers["counts"] = adata.X.copy()
+            sc.pp.normalize_total(adata, inplace=True)
+            sc.pp.log1p(adata)
+            sc.pp.pca(adata)
+            sc.pp.neighbors(adata)
+            sc.tl.umap(adata)
+            sc.tl.leiden(adata)
+
+            adata.obs.set_index('cell_index', inplace=True)
+
+            meta_cell = adata.obs['leiden'].to_dict()
+
+            clusters = adata.obs['leiden'].cat.categories.tolist()
+            colors = adata.uns['leiden_colors']
+            ser_counts = adata.obs['leiden'].value_counts()
+            ser_color = pd.Series(colors, index=clusters, name='color')
+            meta_cluster_df = pd.DataFrame(ser_color)
+            meta_cluster_df['count'] = ser_counts
+
+            meta_cluster_df.index = [str(x) for x in meta_cluster_df.index]
+
+            meta_cluster_df.to_parquet(os.path.join(path_landscape_files, f"cell_clusters_{segmentation_parameters['segmentation_approach']}/meta_cluster.parquet"))
+
+            inst_cells = cbg.index.tolist()
+            df_cluster = pd.DataFrame(index=inst_cells)
+            # save the Leiden assignment for each cell (cells filtered out above are left as NA)
+            df_cluster['cluster'] = pd.Series(meta_cell).reindex(inst_cells).astype('string')
+            df_cluster.to_parquet(os.path.join(path_landscape_files, f"cell_clusters_{segmentation_parameters['segmentation_approach']}/cluster.parquet"))
+
+            adata.obs['leiden'] = adata.obs['leiden'].astype('string')
+
+            list_ser = []
+            for inst_cat in adata.obs['leiden'].unique():
+                if inst_cat is not None:
+                    inst_cells = adata.obs[adata.obs['leiden'] == inst_cat].index.tolist()
+                    inst_ser = pd.Series(adata[inst_cells].X.mean(axis=0).A1, index=adata.var_names)
+                    inst_ser.name = inst_cat
+                    list_ser.append(inst_ser)
+
+            df_sig = pd.concat(list_ser, axis=1)
+            df_sig.columns = df_sig.columns.tolist()
+
+            keep_genes = df_sig.index.tolist()
+            keep_genes = [x for x in keep_genes if 'Unassigned' not in x]
+            keep_genes = [x for x in keep_genes if 'NegControl' not in x]
+            keep_genes = [x for x in keep_genes if 'DeprecatedCodeword' not in x]
+
+            df_sig = df_sig.loc[keep_genes]
+
+            df_sig.to_parquet(
+                os.path.join(
+                    path_landscape_files,
+                    f"df_sig{'_' + segmentation_parameters['segmentation_approach'] if segmentation_parameters['segmentation_approach'] else ''}.parquet"
+                )
+            )
+
+        else:
+            inst_cells = cbg.index.tolist()
+            df_cluster = pd.DataFrame(index=inst_cells)
+            df_cluster['cluster'] = pd.Series('0', index=inst_cells)
+            df_cluster.to_parquet(os.path.join(
+                path_landscape_files,
+                f"cell_clusters{'_' + segmentation_parameters['segmentation_approach'] if segmentation_parameters['segmentation_approach'] else ''}",
+                "cluster.parquet"
+            ))
+
+            meta_clust = pd.DataFrame(index=['0'])
+            meta_clust.loc['0', 'color'] = '#1f77b4'
+            meta_clust.loc['0', 'count'] = 100
+            meta_clust.to_parquet(os.path.join(
+                path_landscape_files,
+                f"cell_clusters{'_' + segmentation_parameters['segmentation_approach'] if segmentation_parameters['segmentation_approach'] else ''}",
+                "meta_cluster.parquet"
+            ))
+
+            df_cluster['cluster'] = df_cluster['cluster'].astype('string')
+
+            list_ser = []
+            for inst_cat in df_cluster['cluster'].unique():
+                if inst_cat is not None:
+                    inst_cells = df_cluster[df_cluster['cluster'] == inst_cat].index.tolist()
+                    inst_ser = cbg.loc[inst_cells].sum()/len(inst_cells)
+                    inst_ser.name = inst_cat
+                    list_ser.append(inst_ser)
+
+            df_sig = pd.concat(list_ser, axis=1)
+            df_sig.columns = df_sig.columns.tolist()
+
+            keep_genes = df_sig.index.tolist()
+            keep_genes = [x for x in keep_genes if 'Unassigned' not in x]
+            keep_genes = [x for x in keep_genes if 'NegControl' not in x]
+            keep_genes = [x for x in keep_genes if 'DeprecatedCodeword' not in x]
+
+            df_sig = df_sig.loc[keep_genes]
+
+            for col in df_sig.columns:
+                if isinstance(df_sig[col].dtype, pd.SparseDtype):
+                    df_sig[col] = df_sig[col].sparse.to_dense()
+
+            df_sig.to_parquet(
+                os.path.join(
+                    path_landscape_files,
+                    f"df_sig{'_' + segmentation_parameters['segmentation_approach'] if segmentation_parameters['segmentation_approach'] else ''}.parquet"
+                )
+            )
+
+    else:
+        default_clusters_path = os.path.join(os.path.dirname(path_landscape_files), 'analysis/clustering/gene_expression_graphclust/clusters.csv')
+        default_clustering = pd.read_csv(default_clusters_path, index_col=0)
+
+        default_clustering_ini = pd.DataFrame(default_clustering.values, index=default_clustering.index.tolist(), columns=['cluster'])
+        default_clustering_ini['cluster'] = default_clustering_ini['cluster'].astype('string')
+
+        meta_cell = pd.read_parquet(os.path.join(path_landscape_files, 'cell_metadata.parquet'))
+
+        default_clustering = pd.DataFrame(index=meta_cell.index.tolist())
+        default_clustering.loc[default_clustering_ini.index.tolist(), 'cluster'] = default_clustering_ini['cluster']
+
+        default_clustering.to_parquet(os.path.join(path_landscape_files, f"cell_clusters_{segmentation_parameters['segmentation_approach']}/cluster.parquet"))
+
+        ser_counts = default_clustering['cluster'].value_counts()
+        clusters = ser_counts.index.tolist()
+
+        palettes = [plt.get_cmap(name).colors for name in plt.colormaps() if "tab" in name]
+        flat_colors = [color for palette in palettes for color in palette]
+        flat_colors_hex = [to_hex(color) for color in flat_colors]
+
+        colors = [
+            flat_colors_hex[i % len(flat_colors_hex)] if "Blank" not in cluster else "#FFFFFF"
+            for i, cluster in enumerate(clusters)
+        ]
+
+        ser_color = pd.Series(colors, index=clusters, name='color')
+        meta_cluster = pd.DataFrame(ser_color)
+        meta_cluster['count'] = ser_counts
+        meta_cluster.to_parquet(os.path.join(path_landscape_files, f"cell_clusters_{segmentation_parameters['segmentation_approach']}/meta_cluster.parquet"))
+
+        df_meta = pd.read_csv(default_clusters_path, index_col=0)
+        df_meta['Cluster'] = df_meta['Cluster'].astype('string')
+        df_meta.columns = ['cluster']
+
+        meta_cell['cluster'] = df_meta['cluster']
+
+        list_ser = []
+        for inst_cat in meta_cell['cluster'].unique().tolist():
+            # skip cells with no cluster assignment (NaN introduced by the reindex above)
+            if not pd.isna(inst_cat):
+                inst_cells = meta_cell[meta_cell['cluster'] == inst_cat].index.tolist()
+                inst_ser = cbg.loc[inst_cells].sum()/len(inst_cells)
+                inst_ser.name = inst_cat
+
+                list_ser.append(inst_ser)
+
+        df_sig = pd.concat(list_ser, axis=1)
+        df_sig.columns = df_sig.columns.tolist()
+        df_sig.index = df_sig.index.tolist()
+
+        keep_genes = df_sig.index.tolist()
+        keep_genes = [x for x in keep_genes if 'Unassigned' not in x]
+        keep_genes = [x for x in keep_genes if 'NegControl' not in x]
+        keep_genes = [x for x in keep_genes if 'DeprecatedCodeword' not in x]
+
+        df_sig = df_sig.loc[keep_genes, clusters]
+
+        df_sig.sparse.to_dense().to_parquet(os.path.join(path_landscape_files, f"df_sig_{segmentation_parameters['segmentation_approach']}.parquet"))
+
 class Network(object):
     '''
     Clustergrammer.py takes a matrix as input (either from a file of a Pandas DataFrame), normalizes/filters, hierarchically clusters, and produces the :ref:`visualization_json` for :ref:`clustergrammer_js`.
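For readers skimming the diff: the core of calc_cluster_signatures is a per-cluster mean-expression table (genes x clusters). A self-contained toy version of that loop, with fabricated data:

```python
# Minimal sketch of the signature computation: mean expression of each
# gene within each cluster, assembled into a genes x clusters DataFrame.
import pandas as pd

cbg = pd.DataFrame(                     # cell-by-gene counts (toy data)
    [[5, 0], [3, 2], [0, 8]],
    index=["cell_0", "cell_1", "cell_2"],
    columns=["GeneA", "GeneB"],
)
clusters = pd.Series(["0", "0", "1"], index=cbg.index, name="cluster")

list_ser = []
for inst_cat in clusters.unique():
    inst_cells = clusters[clusters == inst_cat].index
    inst_ser = cbg.loc[inst_cells].sum() / len(inst_cells)  # mean per gene
    inst_ser.name = inst_cat
    list_ser.append(inst_ser)

df_sig = pd.concat(list_ser, axis=1)    # genes x clusters
print(df_sig)
```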
diff --git a/src/celldega/pre/__init__.py b/src/celldega/pre/__init__.py
index 571dd8f3..77227816 100644
--- a/src/celldega/pre/__init__.py
+++ b/src/celldega/pre/__init__.py
@@ -14,6 +14,7 @@
 import hashlib
 import base64
 from shapely.geometry import Point, Polygon
+from scipy.sparse import csr_matrix
 
 import matplotlib.pyplot as plt
 from matplotlib.colors import to_hex
@@ -23,6 +24,7 @@
 from .landscape import *
 from .trx_tile import *
 from .boundary_tile import *
+from ..clust import *
 
 def convert_long_id_to_short(df):
     """
@@ -208,7 +210,7 @@ def make_meta_cell_image_coord(
     path_transformation_matrix,
     path_meta_cell_micron,
     path_meta_cell_image,
-    image_scale
+    image_scale=1
 ):
     """
     Apply an affine transformation to the cell coordinates in microns and save
@@ -233,7 +235,7 @@ def make_meta_cell_image_coord(
     --------
     >>> make_meta_cell_image_coord(
     ...     technology='Xenium',
-    ...     path_transformation_matrix='data/transformation_matrix.txt',
+    ...     path_transformation_matrix='data/transformation_matrix.csv',
     ...     path_meta_cell_micron='data/meta_cell_micron.csv',
     ...     path_meta_cell_image='data/meta_cell_image.parquet'
     ...     )
@@ -244,17 +246,27 @@
         path_transformation_matrix, header=None, sep=" "
     ).values
 
+    sparse_matrix = csr_matrix(transformation_matrix)
+
     if technology == "MERSCOPE":
         meta_cell = pd.read_csv(path_meta_cell_micron, usecols=["EntityID", "center_x", "center_y"])
         meta_cell = convert_long_id_to_short(meta_cell)
         meta_cell["name"] = meta_cell["cell_id"]
         meta_cell = meta_cell.set_index('cell_id')
+
     elif technology == "Xenium":
         usecols = ["cell_id", "x_centroid", "y_centroid"]
         meta_cell = pd.read_csv(path_meta_cell_micron, index_col=0, usecols=usecols)
         meta_cell.columns = ["center_x", "center_y"]
         meta_cell["name"] = pd.Series(meta_cell.index, index=meta_cell.index)
 
+    elif technology == "custom":
+        meta_cell = gpd.read_parquet(path_meta_cell_micron)
+        meta_cell['center_x'] = meta_cell.centroid.x
+        meta_cell['center_y'] = meta_cell.centroid.y
+        meta_cell["name"] = pd.Series(meta_cell.index, index=meta_cell.index).astype('str')
+        meta_cell.drop(['area', 'centroid'], axis=1, inplace=True)
+
     # Adding a ones column to accommodate for affine transformation
     meta_cell["ones"] = 1
 
@@ -262,7 +274,7 @@
     points = meta_cell[["center_x", "center_y", "ones"]].values
 
     # Applying the transformation matrix
-    transformed_points = np.dot(transformation_matrix, points.T).T
+    transformed_points = sparse_matrix.dot(points.T).T[:, :2]
 
     # Updating the DataFrame with transformed coordinates
     meta_cell["center_x"] = transformed_points[:, 0]
@@ -321,6 +333,9 @@ def make_meta_gene(technology, path_cbg, path_output):
         # genes = pd.read_csv(path_cbg + 'features.tsv.gz', sep='\t', header=None)[1].values.tolist()
         cbg = read_cbg_mtx(path_cbg)
         genes = cbg.columns.tolist()
+    elif technology == "custom":
+        cbg = pd.read_parquet(path_cbg)
+        genes = cbg.columns.tolist()
 
     # Get all categorical color palettes from Matplotlib and flatten them into a single list of colors
     palettes = [plt.get_cmap(name).colors for name in plt.colormaps() if "tab" in name]
@@ -375,33 +390,87 @@ def get_max_zoom_level(path_image_pyramid):
 
     return max_pyramid_zoom
 
-
 def save_landscape_parameters(
-    technology, path_landscape_files, image_name="dapi_files", tile_size=1000, image_info={}, image_format='.webp'
-):
+    technology, path_landscape_files, image_name="dapi_files", tile_size=1000, image_info={}, image_format='.webp', segmentation_approach="default"):
     """
     Save the landscape parameters to a JSON file.
     """
 
-    path_image_pyramid = f"{path_landscape_files}/pyramid_images/{image_name}"
+    if os.path.isdir(path_landscape_files) and os.path.exists(f"{path_landscape_files}/landscape_parameters.json"):
 
-    print(path_image_pyramid)
+        with open(f"{path_landscape_files}/landscape_parameters.json", "r") as file:
+            landscape_parameters = json.load(file)
 
-    max_pyramid_zoom = get_max_zoom_level(path_image_pyramid)
+        landscape_parameters["segmentation_approach"].append(segmentation_approach)
 
-    landscape_parameters = {
-        "technology": technology,
-        "max_pyramid_zoom": max_pyramid_zoom,
-        "tile_size": tile_size,
-        "image_info": image_info,
-        "image_format": image_format
-    }
+        path_landscape_parameters = f"{path_landscape_files}/landscape_parameters.json"
 
-    path_landscape_parameters = f"{path_landscape_files}/landscape_parameters.json"
+        with open(path_landscape_parameters, "w") as file:
+            json.dump(landscape_parameters, file, indent=4)
 
-    with open(path_landscape_parameters, "w") as file:
-        json.dump(landscape_parameters, file, indent=4)
+    else:
+        path_image_pyramid = f"{path_landscape_files}/pyramid_images/{image_name}"
+
+        max_pyramid_zoom = get_max_zoom_level(path_image_pyramid)
+
+        landscape_parameters = {
+            "technology": technology,
+            "segmentation_approach": [segmentation_approach],
+            "max_pyramid_zoom": max_pyramid_zoom,
+            "tile_size": tile_size,
+            "image_info": image_info,
+            "image_format": image_format
+        }
+
+        path_landscape_parameters = f"{path_landscape_files}/landscape_parameters.json"
+
+        with open(path_landscape_parameters, "w") as file:
+            json.dump(landscape_parameters, file, indent=4)
+
+def add_custom_segmentation(path_landscape_files, path_segmentation_files, image_scale=1, tile_size=250, use_default_clustering=False, use_custom_clustering=False):
+    """Add a custom segmentation result to an existing landscape."""
+
+    with open(f"{path_segmentation_files}/segmentation_parameters.json", "r") as file:
+        segmentation_parameters = json.load(file)
+
+    make_meta_gene(technology=segmentation_parameters['technology'],
+                   path_cbg=os.path.join(path_segmentation_files, "cell_by_gene_matrix.parquet"),
+                   path_output=os.path.join(path_landscape_files, f"meta_gene_{segmentation_parameters['segmentation_approach']}.parquet"))
+
+    cbg_custom = pd.read_parquet(os.path.join(path_segmentation_files, "cell_by_gene_matrix.parquet"))
+
+    save_cbg_gene_parquets(path_landscape_files,
+                           cbg=cbg_custom,
+                           verbose=True,
+                           custom_segmentation_approach=f"{segmentation_parameters['segmentation_approach']}")
+
+    make_meta_cell_image_coord(technology=segmentation_parameters['technology'],
+                               path_transformation_matrix=os.path.join(path_landscape_files, 'transformation_matrix.csv'),
+                               path_meta_cell_micron=os.path.join(path_segmentation_files, 'cell_metadata_micron_space.parquet'),
+                               path_meta_cell_image=os.path.join(path_landscape_files, f"cell_metadata_{segmentation_parameters['segmentation_approach']}.parquet"),
+                               image_scale=image_scale)
+
+    tile_bounds = make_trx_tiles(technology=segmentation_parameters['technology'],
+                                 path_trx=os.path.join(path_segmentation_files, 'transcripts.parquet'))
+
+    make_cell_boundary_tiles(technology=segmentation_parameters['technology'],
+                             path_cell_boundaries=os.path.join(path_segmentation_files, "cell_polygons.parquet"),
+                             path_output=os.path.join(path_landscape_files, f"cell_segmentation_{segmentation_parameters['segmentation_approach']}"),
+                             tile_size=tile_size,
+                             tile_bounds=tile_bounds,
+                             image_scale=image_scale)
+
+    calc_cluster_signatures(path_landscape_files=path_landscape_files,
+                            segmentation_parameters=segmentation_parameters,
+                            cbg=cbg_custom,
+                            use_default_clustering=use_default_clustering,
+                            use_custom_clustering=use_custom_clustering)
+
+    save_landscape_parameters(technology=segmentation_parameters['technology'],
+                              path_landscape_files=path_landscape_files,
+                              image_name="dapi_files",
+                              tile_size=1000, image_format='.webp',
+                              segmentation_approach=segmentation_parameters['segmentation_approach'])
 
 def to_geometry(coord_list):
     """
@@ -410,12 +478,12 @@ def to_geometry(coord_list):
 
     # If already a Point or Polygon, return it directly
     if isinstance(coord_list, (Point, Polygon)):
        return coord_list
-    
+
    # If it's a single coordinate pair, create a Point
    if isinstance(coord_list[0], (int, float)):
        # Single coordinate pair
        return Point(coord_list)
-    
+
    # If it's a list of coordinate pairs, create a Polygon
    return Polygon(coord_list)
 
 __all__ = ["landscape", "trx_tile", "boundary_tile"]
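The coordinate step in make_meta_cell_image_coord is a standard homogeneous affine transform. A toy worked example with a fabricated 3x3 matrix — the PR routes the same product through scipy.sparse.csr_matrix, which yields identical values:

```python
# Homogeneous-coordinate affine transform on cell centroids (toy data).
import numpy as np

transformation_matrix = np.array([
    [2.0, 0.0, 10.0],   # scale x by 2, translate x by 10
    [0.0, 2.0, 20.0],   # scale y by 2, translate y by 20
    [0.0, 0.0, 1.0],
])

centers = np.array([[1.0, 1.0], [5.0, 3.0]])               # micron coords
points = np.hstack([centers, np.ones((len(centers), 1))])  # add "ones" column

transformed = np.dot(transformation_matrix, points.T).T[:, :2]
print(transformed)   # [[12. 22.] [20. 26.]] -> image-space coords
```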
(coarse_tile["center_y"] >= fine_tile_y_min) & (coarse_tile["center_y"] < fine_tile_y_max) @@ -144,16 +145,16 @@ def get_cell_polygons(technology, path_cell_boundaries, path_output=None, path_m def make_cell_boundary_tiles( technology, path_cell_boundaries, - path_meta_cell_micron, - path_transformation_matrix, path_output, + path_meta_cell_micron=None, + path_transformation_matrix=None, coarse_tile_factor=20, tile_size=250, tile_bounds=None, image_scale=1, max_workers=1 ): - + """ Processes cell boundary data and divides it into spatial tiles based on the provided technology. @@ -188,19 +189,29 @@ def make_cell_boundary_tiles( None """ - transformation_matrix = pd.read_csv(path_transformation_matrix, header=None, sep=" ").values - # Ensure the output directory exists if not os.path.exists(path_output): os.makedirs(path_output) - gdf_cells = get_cell_polygons(technology, path_cell_boundaries, path_output, path_meta_cell_micron) + if technology == 'custom': + cells_orig = gpd.read_parquet(path_cell_boundaries) + cells_orig.rename(columns={'geometry_image_space': 'GEOMETRY'}, inplace=True) + + cells_orig["center_x"] = cells_orig["GEOMETRY"].apply(lambda geom: geom.centroid.x) + cells_orig["center_y"] = cells_orig["GEOMETRY"].apply(lambda geom: geom.centroid.y) + + else: + transformation_matrix = pd.read_csv(path_transformation_matrix, header=None, sep=" ").values + + cells_orig = get_cell_polygons(technology, path_cell_boundaries, path_output, path_meta_cell_micron) + + # Transform geometries + cells_orig["GEOMETRY"] = batch_transform_geometries(cells_orig["geometry"], transformation_matrix, image_scale) + + cells_orig["center_x"] = cells_orig["GEOMETRY"].apply(lambda geom: geom.centroid.x) + cells_orig["center_y"] = cells_orig["GEOMETRY"].apply(lambda geom: geom.centroid.y) - # Transform geometries - gdf_cells["GEOMETRY"] = batch_transform_geometries(gdf_cells["geometry"], transformation_matrix, image_scale) - gdf_cells["GEOMETRY"] = gdf_cells["GEOMETRY"].apply(lambda x: x.wkt) - gdf_cells["center_x"] = gdf_cells.geometry.centroid.x - gdf_cells["center_y"] = gdf_cells.geometry.centroid.y + cells_orig["GEOMETRY"] = cells_orig["GEOMETRY"].apply(lambda x: x.wkt) # Calculate tile bounds and fine/coarse tiles x_min, x_max = tile_bounds["x_min"], tile_bounds["x_max"] @@ -219,9 +230,9 @@ def make_cell_boundary_tiles( coarse_tile_y_min = y_min + j * (coarse_tile_factor * tile_size) coarse_tile_y_max = coarse_tile_y_min + (coarse_tile_factor * tile_size) - coarse_tile = gdf_cells[ - (gdf_cells["center_x"] >= coarse_tile_x_min) & (gdf_cells["center_x"] < coarse_tile_x_max) & - (gdf_cells["center_y"] >= coarse_tile_y_min) & (gdf_cells["center_y"] < coarse_tile_y_max) + coarse_tile = cells_orig[ + (cells_orig["center_x"] >= coarse_tile_x_min) & (cells_orig["center_x"] < coarse_tile_x_max) & + (cells_orig["center_y"] >= coarse_tile_y_min) & (cells_orig["center_y"] < coarse_tile_y_max) ] if not coarse_tile.empty: process_fine_boundaries(coarse_tile, i, j, coarse_tile_x_min, coarse_tile_x_max, coarse_tile_y_min, coarse_tile_y_max, tile_size, path_output, x_min, y_min, n_fine_tiles_x, n_fine_tiles_y, max_workers) diff --git a/src/celldega/pre/landscape.py b/src/celldega/pre/landscape.py index 434fa8ea..c12f6a5f 100644 --- a/src/celldega/pre/landscape.py +++ b/src/celldega/pre/landscape.py @@ -12,6 +12,14 @@ # save_cbg_gene_parquets : Save the cell-by-gene matrix as gene-specific Parquet files. 
diff --git a/src/celldega/pre/landscape.py b/src/celldega/pre/landscape.py
index 434fa8ea..c12f6a5f 100644
--- a/src/celldega/pre/landscape.py
+++ b/src/celldega/pre/landscape.py
@@ -12,6 +12,14 @@
 # save_cbg_gene_parquets : Save the cell-by-gene matrix as gene-specific Parquet files.
 # =============================================================================
 
+def to_dense_if_sparse(df):
+    """Convert sparse DataFrame columns to dense only if they are sparse"""
+
+    for col in df.columns:
+        if isinstance(df[col].dtype, pd.SparseDtype):
+            df[col] = df[col].sparse.to_dense()
+    return df
+
 def calc_meta_gene_data(cbg):
     """
     Calculate gene metadata from the cell-by-gene matrix
@@ -75,14 +83,15 @@ def convert_to_dense(series):
     proportion_nonzero = (cbg != 0).sum(axis=0) / len(cbg)
 
     # Create a DataFrame to hold all these metrics
+
     meta_gene = pd.DataFrame(
         {
-            "mean": mean_expression.sparse.to_dense(),
+            "mean": mean_expression.sparse.to_dense() if isinstance(mean_expression.dtype, pd.SparseDtype) else mean_expression,
             "std": std_deviation,
-            "max": max_expression.sparse.to_dense(),
-            "non-zero": proportion_nonzero.sparse.to_dense(),
+            "max": max_expression.sparse.to_dense() if isinstance(max_expression.dtype, pd.SparseDtype) else max_expression,
+            "non-zero": proportion_nonzero.sparse.to_dense() if isinstance(proportion_nonzero.dtype, pd.SparseDtype) else proportion_nonzero,
         }
     )
 
     meta_gene_clean = pd.DataFrame(meta_gene.values, index=meta_gene.index.tolist(), columns=meta_gene.columns)
@@ -127,7 +135,7 @@
 
     return cbg
 
-def save_cbg_gene_parquets(base_path, cbg, verbose=False):
+def save_cbg_gene_parquets(base_path, cbg, verbose=False, custom_segmentation_approach=None):
     """
     Save the cell-by-gene matrix as gene-specific Parquet files.
 
@@ -144,7 +152,7 @@
     -------
     None
     """
-    output_dir = os.path.join(base_path, "cbg")
+    output_dir = os.path.join(base_path, f"cbg{f'_{custom_segmentation_approach}' if custom_segmentation_approach else ''}")
 
     os.makedirs(output_dir, exist_ok=True)
 
     for index, gene in enumerate(cbg.columns):
@@ -155,7 +163,8 @@
         col_df = cbg[[gene]].copy()
 
         # Convert to dense and integer type
-        col_df = col_df.sparse.to_dense().astype(int)
+
+        col_df = to_dense_if_sparse(col_df).astype(int)
 
         # Create a DataFrame necessary to prevent error in to_parquet
         inst_df = pd.DataFrame(
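A quick demonstration of the dtype check behind to_dense_if_sparse, with a fabricated mixed-dtype frame:

```python
# Mixed sparse/dense columns come back fully dense; already-dense
# columns pass through untouched.
import pandas as pd

df = pd.DataFrame({
    "sparse_col": pd.arrays.SparseArray([0, 0, 3]),
    "dense_col": [1, 2, 3],
})
print(df.dtypes)                  # Sparse[int64, 0], int64

for col in df.columns:            # same logic as to_dense_if_sparse
    if isinstance(df[col].dtype, pd.SparseDtype):
        df[col] = df[col].sparse.to_dense()

print(df.dtypes)                  # int64, int64
```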
diff --git a/src/celldega/pre/trx_tile.py b/src/celldega/pre/trx_tile.py
index 9cdab9dc..d58408f8 100644
--- a/src/celldega/pre/trx_tile.py
+++ b/src/celldega/pre/trx_tile.py
@@ -4,6 +4,7 @@
 from tqdm import tqdm
 import concurrent.futures
 import pandas as pd
+from scipy.sparse import csr_matrix
 
 def process_coarse_tile(trx, i, j, coarse_tile_x_min, coarse_tile_x_max, coarse_tile_y_min, coarse_tile_y_max, tile_size, path_trx_tiles, x_min, y_min, n_fine_tiles_x, n_fine_tiles_y, max_workers=1):
     # Filter the entire dataset for the current coarse tile
@@ -14,14 +15,14 @@
 
     if not coarse_tile.is_empty():
         # Now process fine tiles using global fine tile indices
-        process_fine_tiles(coarse_tile, i, j, coarse_tile_x_min, coarse_tile_x_max, coarse_tile_y_min, coarse_tile_y_max, tile_size, path_trx_tiles, x_min, y_min, n_fine_tiles_x, n_fine_tiles_y, max_workers) 
+        process_fine_tiles(coarse_tile, i, j, coarse_tile_x_min, coarse_tile_x_max, coarse_tile_y_min, coarse_tile_y_max, tile_size, path_trx_tiles, x_min, y_min, n_fine_tiles_x, n_fine_tiles_y, max_workers)
 
 
 def process_fine_tiles(coarse_tile, coarse_i, coarse_j, coarse_tile_x_min, coarse_tile_x_max, coarse_tile_y_min, coarse_tile_y_max, tile_size, path_trx_tiles, x_min, y_min, n_fine_tiles_x, n_fine_tiles_y, max_workers=1):
     # Use ThreadPoolExecutor for parallel processing of fine-grain tiles within the coarse tile
     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = []
-        
+
         # Iterate over fine-grain tiles within the global bounds
         for fine_i in range(n_fine_tiles_x):
             fine_tile_x_min = x_min + fine_i * tile_size
@@ -41,7 +42,7 @@
 
                 # Submit the task for each fine tile to process in parallel
                 futures.append(executor.submit(
-                    filter_and_save_fine_tile, coarse_tile, coarse_i, coarse_j, fine_i, fine_j, 
+                    filter_and_save_fine_tile, coarse_tile, coarse_i, coarse_j, fine_i, fine_j,
                     fine_tile_x_min, fine_tile_x_max, fine_tile_y_min, fine_tile_y_max, path_trx_tiles
                 ))
 
@@ -56,7 +57,7 @@
         (pl.col("transformed_x") >= fine_tile_x_min) & (pl.col("transformed_x") < fine_tile_x_max) &
         (pl.col("transformed_y") >= fine_tile_y_min) & (pl.col("transformed_y") < fine_tile_y_max)
     )
-    
+
     if not fine_tile_trx.is_empty():
         # Add geometry column as a list of [x, y] pairs
         fine_tile_trx = fine_tile_trx.with_columns(
@@ -95,11 +96,14 @@ def transform_transcript_coordinates(technology, path_trx, chunk_size, transform
 
     all_chunks = []
 
+    # Build the sparse form of the 3x3 affine matrix once, outside the chunk loop
+    sparse_matrix = csr_matrix(transformation_matrix)
+
     for start_row in tqdm(range(0, trx_ini.height, chunk_size), desc="Processing chunks"):
         chunk = trx_ini.slice(start_row, chunk_size)
 
         # Apply transformation matrix to the coordinates
         points = np.hstack([chunk.select(["x", "y"]).to_numpy(), np.ones((chunk.height, 1))])
-        transformed_points = np.dot(points, transformation_matrix.T)[:, :2]
+        transformed_points = sparse_matrix.dot(points.T).T[:, :2]
 
         # Create new transformed columns and drop original x, y columns
         transformed_chunk = chunk.with_columns([
@@ -110,14 +115,14 @@
 
     # Concatenate all chunks after processing
     trx = pl.concat(all_chunks)
-    
+
     return trx
 
 def make_trx_tiles(
     technology,
     path_trx,
-    path_transformation_matrix,
-    path_trx_tiles,
+    path_transformation_matrix=None,
+    path_trx_tiles=None,
     coarse_tile_factor=10,
     tile_size=250,
     chunk_size=1000000,
@@ -158,55 +163,75 @@
         A dictionary containing the bounds of the processed data in both x and y directions.
     """
 
-    # Ensure the output directory exists
-    if not os.path.exists(path_trx_tiles):
-        os.makedirs(path_trx_tiles)
-
-    transformation_matrix = np.loadtxt(path_transformation_matrix)
-
-    trx = transform_transcript_coordinates(technology, path_trx, chunk_size, transformation_matrix, image_scale)
-
-    # Get min and max x, y values
-    x_min, y_min = 0, 0
-    x_max, y_max = trx.select([
-        pl.col("transformed_x").max().alias("x_max"),
-        pl.col("transformed_y").max().alias("y_max")
-    ]).row(0)
-
-    # Calculate the number of fine-grain tiles globally
-    n_fine_tiles_x = int(np.ceil((x_max - x_min) / tile_size))
-    n_fine_tiles_y = int(np.ceil((y_max - y_min) / tile_size))
-
-    # Calculate the number of coarse-grain tiles
-    n_coarse_tiles_x = int(np.ceil((x_max - x_min) / (coarse_tile_factor * tile_size)))
-    n_coarse_tiles_y = int(np.ceil((y_max - y_min) / (coarse_tile_factor * tile_size)))
-
-    # Use ThreadPoolExecutor for parallel processing of coarse-grain tiles
-    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-        futures = []
-        for i in range(n_coarse_tiles_x):
-            coarse_tile_x_min = x_min + i * (coarse_tile_factor * tile_size)
-            coarse_tile_x_max = coarse_tile_x_min + (coarse_tile_factor * tile_size)
-
-            for j in range(n_coarse_tiles_y):
-                coarse_tile_y_min = y_min + j * (coarse_tile_factor * tile_size)
-                coarse_tile_y_max = coarse_tile_y_min + (coarse_tile_factor * tile_size)
-
-                # Submit each coarse tile for parallel processing
-                futures.append(executor.submit(
-                    process_coarse_tile, trx, i, j, coarse_tile_x_min, coarse_tile_x_max, coarse_tile_y_min, coarse_tile_y_max, tile_size, path_trx_tiles, x_min, y_min, n_fine_tiles_x, n_fine_tiles_y, max_workers
-                ))
-
-        # Wait for all coarse tiles to complete
-        for future in tqdm(concurrent.futures.as_completed(futures), desc="Processing coarse tiles", unit="tile"):
-            future.result()  # Raise exceptions if any occurred during execution
-
-    # Return the tile bounds
-    tile_bounds = {
-        "x_min": x_min,
-        "x_max": x_max,
-        "y_min": y_min,
-        "y_max": y_max,
-    }
-
-    return tile_bounds
\ No newline at end of file
+    if technology == 'custom':
+
+        # Get min and max x, y values
+        x_min, y_min = 0, 0
+        x_max, y_max = pl.read_parquet(path_trx).select([
+            pl.col("x_image_coords").max().alias("x_max"),
+            pl.col("y_image_coords").max().alias("y_max")
+        ]).row(0)
+
+        # Return the tile bounds
+        tile_bounds = {
+            "x_min": x_min,
+            "x_max": x_max,
+            "y_min": y_min,
+            "y_max": y_max,
+        }
+
+        return tile_bounds
+
+    else:
+
+        # Ensure the output directory exists
+        if not os.path.exists(path_trx_tiles):
+            os.makedirs(path_trx_tiles)
+
+        transformation_matrix = np.loadtxt(path_transformation_matrix)
+
+        trx = transform_transcript_coordinates(technology, path_trx, chunk_size, transformation_matrix, image_scale)
+
+        # Get min and max x, y values
+        x_min, y_min = 0, 0
+        x_max, y_max = trx.select([
+            pl.col("transformed_x").max().alias("x_max"),
+            pl.col("transformed_y").max().alias("y_max")
+        ]).row(0)
+
+        # Calculate the number of fine-grain tiles globally
+        n_fine_tiles_x = int(np.ceil((x_max - x_min) / tile_size))
+        n_fine_tiles_y = int(np.ceil((y_max - y_min) / tile_size))
+
+        # Calculate the number of coarse-grain tiles
+        n_coarse_tiles_x = int(np.ceil((x_max - x_min) / (coarse_tile_factor * tile_size)))
+        n_coarse_tiles_y = int(np.ceil((y_max - y_min) / (coarse_tile_factor * tile_size)))
+
+        # Use ThreadPoolExecutor for parallel processing of coarse-grain tiles
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = []
+            for i in range(n_coarse_tiles_x):
+                coarse_tile_x_min = x_min + i * (coarse_tile_factor * tile_size)
+                coarse_tile_x_max = coarse_tile_x_min + (coarse_tile_factor * tile_size)
+
+                for j in range(n_coarse_tiles_y):
+                    coarse_tile_y_min = y_min + j * (coarse_tile_factor * tile_size)
+                    coarse_tile_y_max = coarse_tile_y_min + (coarse_tile_factor * tile_size)
+
+                    # Submit each coarse tile for parallel processing
+                    futures.append(executor.submit(
+                        process_coarse_tile, trx, i, j, coarse_tile_x_min, coarse_tile_x_max, coarse_tile_y_min, coarse_tile_y_max, tile_size, path_trx_tiles, x_min, y_min, n_fine_tiles_x, n_fine_tiles_y, max_workers
+                    ))
+
+            # Wait for all coarse tiles to complete
+            for future in tqdm(concurrent.futures.as_completed(futures), desc="Processing coarse tiles", unit="tile"):
+                future.result()  # Raise exceptions if any occurred during execution
+
+        # Return the tile bounds
+        tile_bounds = {
+            "x_min": x_min,
+            "x_max": x_max,
+            "y_min": y_min,
+            "y_max": y_max,
+        }
+
+        return tile_bounds
diff --git a/src/celldega/viz/widget.py b/src/celldega/viz/widget.py
index 988f3cda..2e1bf83e 100644
--- a/src/celldega/viz/widget.py
+++ b/src/celldega/viz/widget.py
@@ -56,6 +56,8 @@ class Landscape(anywidget.AnyWidget):
     update_trigger = traitlets.Dict().tag(sync=True)
     cell_clusters = traitlets.Dict().tag(sync=True)
 
+    segmentation = traitlets.Unicode("default").tag(sync=True)
+
     width = traitlets.Int(0).tag(sync=True)
     height = traitlets.Int(800).tag(sync=True)
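Notebook-side usage of the new trait might look like the following. This is a sketch only: it assumes Landscape accepts trait values as constructor keywords (standard traitlets behavior); base_url is a placeholder and any other required arguments are omitted:

```python
from celldega.viz.widget import Landscape

landscape = Landscape(
    base_url="https://example.com/landscape_files",  # placeholder
    segmentation="default",  # or a custom segmentation_approach name
)
landscape.segmentation  # -> "default", kept in sync with viz_state.seg.version in JS
```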