Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEGA-137-Custom-segment-arg-for-Landscape-view #64

Draft
wants to merge 17 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 182 additions & 21 deletions src/celldega/pre/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@
import os
import hashlib
import base64
import tifffile
from shapely.affinity import affine_transform
from shapely.geometry import MultiPolygon

import matplotlib.pyplot as plt
from matplotlib.colors import to_hex

import json
import shutil

from .landscape import *
from .trx_tile import *
Expand Down Expand Up @@ -209,7 +211,7 @@ def make_meta_cell_image_coord(
path_transformation_matrix,
path_meta_cell_micron,
path_meta_cell_image,
image_scale
image_scale=1
):
"""
Apply an affine transformation to the cell coordinates in microns and save
Expand All @@ -234,14 +236,14 @@ def make_meta_cell_image_coord(
--------
>>> make_meta_cell_image_coord(
... technology='Xenium',
... path_transformation_matrix='data/transformation_matrix.txt',
... path_transformation_matrix='data/xenium_transform.txt',
... path_meta_cell_micron='data/meta_cell_micron.csv',
... path_meta_cell_image='data/meta_cell_image.parquet'
... )

"""

transformation_matrix = pd.read_csv(
xenium_transform = pd.read_csv(
path_transformation_matrix, header=None, sep=" "
).values

Expand All @@ -250,20 +252,28 @@ def make_meta_cell_image_coord(
meta_cell = convert_long_id_to_short(meta_cell)
meta_cell["name"] = meta_cell["cell_id"]
meta_cell = meta_cell.set_index('cell_id')

elif technology == "Xenium":
usecols = ["cell_id", "x_centroid", "y_centroid"]
meta_cell = pd.read_csv(path_meta_cell_micron, index_col=0, usecols=usecols)
meta_cell.columns = ["center_x", "center_y"]
meta_cell["name"] = pd.Series(meta_cell.index, index=meta_cell.index)

elif technology == "custom":
meta_cell = gpd.read_parquet(path_meta_cell_micron)
meta_cell['center_x'] = meta_cell.centroid.x
meta_cell['center_y'] = meta_cell.centroid.y
meta_cell["name"] = pd.Series(meta_cell.index, index=meta_cell.index)
meta_cell.drop(['area', 'centroid'], axis=1, inplace=True)

# Adding a ones column to accommodate for affine transformation
meta_cell["ones"] = 1

# Preparing the data for matrix multiplication
points = meta_cell[["center_x", "center_y", "ones"]].values

# Applying the transformation matrix
transformed_points = np.dot(transformation_matrix, points.T).T
transformed_points = np.dot(xenium_transform, points.T).T

# Updating the DataFrame with transformed coordinates
meta_cell["center_x"] = transformed_points[:, 0]
Expand Down Expand Up @@ -322,7 +332,10 @@ def make_meta_gene(technology, path_cbg, path_output):
# genes = pd.read_csv(path_cbg + 'features.tsv.gz', sep='\t', header=None)[1].values.tolist()
cbg = read_cbg_mtx(path_cbg)
genes = cbg.columns.tolist()

elif technology == "custom":
cbg = pd.read_parquet(path_cbg)
genes = cbg.columns.tolist()

# Get all categorical color palettes from Matplotlib and flatten them into a single list of colors
palettes = [plt.get_cmap(name).colors for name in plt.colormaps() if "tab" in name]
flat_colors = [color for palette in palettes for color in palette]
Expand Down Expand Up @@ -377,31 +390,179 @@ def get_max_zoom_level(path_image_pyramid):
return max_pyramid_zoom


def clustering(path_landscape_files, segmentation_parameters, cbg):
    """
    Build per-cell cluster assignments, cluster metadata (color + count), and
    per-cluster mean-expression gene signatures, saved under the
    LandscapeFiles directory with a per-approach suffix.

    Parameters
    ----------
    path_landscape_files : str
        Root LandscapeFiles directory. Must contain
        ``analysis/clustering/gene_expression_graphclust/clusters.csv``,
        ``cell_metadata.parquet``, and a ``cell_clusters/`` directory.
    segmentation_parameters : dict
        Must contain the key ``'segmentation_approach'``, used to suffix all
        output file names so multiple segmentations can coexist.
    cbg : pandas.DataFrame
        Cell-by-gene matrix (cells as rows, genes as columns); may have
        sparse or dense columns.

    Returns
    -------
    None
        Writes ``cell_clusters/cluster_<approach>.parquet``,
        ``cell_clusters/meta_cluster_<approach>.parquet`` and
        ``df_sig_<approach>.parquet``.
    """
    approach = segmentation_parameters['segmentation_approach']

    # Read the graph-based clustering once (previously this file was read twice).
    path_clusters_csv = os.path.join(
        path_landscape_files,
        'analysis/clustering/gene_expression_graphclust/clusters.csv',
    )
    raw_clustering = pd.read_csv(path_clusters_csv, index_col=0)

    default_clustering_ini = pd.DataFrame(
        raw_clustering.values,
        index=raw_clustering.index.tolist(),
        columns=['cluster'],
    )
    default_clustering_ini['cluster'] = default_clustering_ini['cluster'].astype('string')

    # Align cluster labels to the full cell-metadata index; cells without an
    # assignment are left as NaN.
    meta_cell = pd.read_parquet(os.path.join(path_landscape_files, 'cell_metadata.parquet'))
    default_clustering = pd.DataFrame(index=meta_cell.index.tolist())
    default_clustering.loc[default_clustering_ini.index.tolist(), 'cluster'] = (
        default_clustering_ini['cluster']
    )
    default_clustering.to_parquet(
        os.path.join(path_landscape_files, f"cell_clusters/cluster_{approach}.parquet")
    )

    # Assign a categorical color to each cluster; "Blank" clusters get white.
    ser_counts = default_clustering['cluster'].value_counts()
    clusters = ser_counts.index.tolist()
    palettes = [plt.get_cmap(name).colors for name in plt.colormaps() if "tab" in name]
    flat_colors = [color for palette in palettes for color in palette]
    flat_colors_hex = [to_hex(color) for color in flat_colors]

    colors = [
        flat_colors_hex[i % len(flat_colors_hex)] if "Blank" not in cluster else "#FFFFFF"
        for i, cluster in enumerate(clusters)
    ]

    meta_cluster = pd.DataFrame(pd.Series(colors, index=clusters, name='color'))
    meta_cluster['count'] = ser_counts
    meta_cluster.to_parquet(
        os.path.join(path_landscape_files, f"cell_clusters/meta_cluster_{approach}.parquet")
    )

    # Mean expression per cluster (reuses the clustering read above; index
    # alignment matches the original two-read behavior).
    meta_cell['cluster'] = default_clustering_ini['cluster']

    list_ser = []
    for inst_cat in meta_cell['cluster'].unique().tolist():
        if inst_cat is not None:
            inst_cells = meta_cell[meta_cell['cluster'] == inst_cat].index.tolist()
            inst_ser = cbg.loc[inst_cells].sum() / len(inst_cells)
            inst_ser.name = inst_cat
            list_ser.append(inst_ser)

    # Single concat (this line was accidentally duplicated before).
    df_sig = pd.concat(list_ser, axis=1)
    df_sig.columns = df_sig.columns.tolist()
    df_sig.index = df_sig.index.tolist()

    # Drop control / unassigned probes from the signature matrix.
    keep_genes = [
        x for x in df_sig.index.tolist()
        if 'Unassigned' not in x
        and 'NegControl' not in x
        and 'DeprecatedCodeword' not in x
    ]
    df_sig = df_sig.loc[keep_genes, clusters]

    # Densify only sparse columns: calling ``.sparse`` on an already-dense
    # frame (e.g. a custom cbg loaded from Parquet) raises AttributeError.
    df_sig = df_sig.apply(
        lambda col: col.sparse.to_dense() if isinstance(col.dtype, pd.SparseDtype) else col
    )
    df_sig.to_parquet(os.path.join(path_landscape_files, f"df_sig_{approach}.parquet"))

def save_landscape_parameters(
    technology, path_landscape_files, image_name="dapi_files", tile_size=1000,
    image_info=None, image_format='.webp', segmentation_approach="default"
):
    """
    Save (or update) the landscape parameters JSON file, keyed by
    segmentation approach so multiple segmentations can be registered.

    Parameters
    ----------
    technology : str
        Technology identifier stored in the parameters entry.
    path_landscape_files : str
        Root LandscapeFiles directory; the JSON is written to
        ``<path>/landscape_parameters.json``.
    image_name : str, optional
        Subdirectory of ``pyramid_images`` used to determine the max zoom.
    tile_size : int, optional
        Image tile size recorded in the entry.
    image_info : dict, optional
        Extra image metadata; defaults to an empty dict (``None`` avoids a
        shared mutable default).
    image_format : str, optional
        Image file extension, e.g. ``'.webp'``.
    segmentation_approach : str, optional
        Key under which this entry is stored in the JSON.

    Returns
    -------
    None
    """
    if image_info is None:
        image_info = {}

    # Compute the pyramid zoom up front: the previous version only computed
    # it in the "file does not exist" branch, so updating an existing
    # landscape_parameters.json raised NameError on max_pyramid_zoom.
    path_image_pyramid = f"{path_landscape_files}/pyramid_images/{image_name}"
    print(path_image_pyramid)
    max_pyramid_zoom = get_max_zoom_level(path_image_pyramid)

    path_landscape_parameters = f"{path_landscape_files}/landscape_parameters.json"

    # Start from the existing file when present so other approaches' entries
    # are preserved; otherwise start fresh.
    if os.path.isdir(path_landscape_files) and os.path.exists(path_landscape_parameters):
        with open(path_landscape_parameters, "r") as file:
            landscape_parameters = json.load(file)
    else:
        landscape_parameters = {}

    landscape_parameters[segmentation_approach] = {
        "technology": technology,
        "max_pyramid_zoom": max_pyramid_zoom,
        "tile_size": tile_size,
        "image_info": image_info,
        "image_format": image_format,
    }

    with open(path_landscape_parameters, "w") as file:
        json.dump(landscape_parameters, file, indent=4)

def add_custom_segmentation(path_landscape_files, path_segmentation_files, image_scale=1, tile_size=250):
    """
    Add a custom segmentation result to an existing LandscapeFiles directory.

    Reads ``segmentation_parameters.json`` from ``path_segmentation_files``
    and suffixes every derived output with its ``segmentation_approach``
    value, so the custom segmentation sits alongside the default one.

    Parameters
    ----------
    path_landscape_files : str
        Existing LandscapeFiles directory; must already contain
        ``xenium_transform.csv`` and ``cell_metadata.parquet``.
    path_segmentation_files : str
        Directory with the custom segmentation outputs:
        ``segmentation_parameters.json``, ``cell_by_gene_matrix.parquet``,
        ``cell_metadata_micron_space.parquet``, ``transcripts.parquet``,
        ``cell_polygons.parquet``.
    image_scale : float, optional
        Scale factor applied when mapping micron coordinates to image space.
    tile_size : int, optional
        Tile size used for transcript and cell-boundary tiles.

    Returns
    -------
    None
        All results are written beneath ``path_landscape_files``.
    """

    # Segmentation metadata (technology, segmentation_approach, ...) drives
    # the naming of every derived file below.
    with open(f"{path_segmentation_files}/segmentation_parameters.json", "r") as file:
        segmentation_parameters = json.load(file)

    # Gene metadata for this approach (e.g. meta_gene_<approach>.parquet).
    make_meta_gene(technology=segmentation_parameters['technology'],
                   path_cbg=os.path.join(path_segmentation_files, "cell_by_gene_matrix.parquet"),
                   path_output=os.path.join(path_landscape_files, f"meta_gene_{segmentation_parameters['segmentation_approach']}.parquet"))

    cbg_custom = pd.read_parquet(os.path.join(path_segmentation_files, "cell_by_gene_matrix.parquet"))

    # Per-gene Parquet files go into a suffixed directory (cbg_<approach>)
    # so the default ``cbg`` directory is left untouched.
    save_cbg_gene_parquets(path_landscape_files,
                           cbg=cbg_custom,
                           verbose=True,
                           custom_segmentation_approach=f"_{segmentation_parameters['segmentation_approach']}")

    # Map cell centroids from micron to image space.
    # NOTE(review): assumes a Xenium-style transform file is present even for
    # custom technologies — confirm this holds for all inputs.
    make_meta_cell_image_coord(technology = segmentation_parameters['technology'],
                               path_transformation_matrix = os.path.join(path_landscape_files, 'xenium_transform.csv'),
                               path_meta_cell_micron = os.path.join(path_segmentation_files, 'cell_metadata_micron_space.parquet'),
                               path_meta_cell_image = os.path.join(path_landscape_files, 'cell_metadata.parquet'),
                               image_scale=image_scale)

    # Transcript tiles; the returned bounds are reused below so boundary
    # tiles line up with transcript tiles.
    tile_bounds = make_trx_tiles(technology = segmentation_parameters['technology'],
                                 path_trx = os.path.join(path_segmentation_files, 'transcripts.parquet'),
                                 path_transformation_matrix = os.path.join(path_landscape_files, 'xenium_transform.csv'),
                                 path_trx_tiles = os.path.join(path_landscape_files, 'transcript_tiles'),
                                 tile_size=tile_size,
                                 image_scale=image_scale)

    # Cell-boundary tiles, saved under a per-approach directory.
    make_cell_boundary_tiles(technology = segmentation_parameters['technology'],
                             path_cell_boundaries = os.path.join(path_segmentation_files, "cell_polygons.parquet"),
                             path_meta_cell_micron = os.path.join(path_segmentation_files, 'cell_metadata_micron_space.parquet'),
                             path_transformation_matrix = os.path.join(path_landscape_files, 'xenium_transform.csv'),
                             path_output = os.path.join(path_landscape_files, f"cell_segmentation_{segmentation_parameters['segmentation_approach']}"),
                             tile_size=tile_size,
                             tile_bounds=tile_bounds,
                             image_scale=image_scale)

    # Cluster assignments, cluster colors/counts, and gene signatures.
    clustering(path_landscape_files=path_landscape_files,
               segmentation_parameters=segmentation_parameters,
               cbg=cbg_custom)

    # Register this approach in landscape_parameters.json.
    # NOTE(review): tile_size=1000 here is the image tile size, distinct from
    # this function's data tile_size argument — confirm this is intended.
    save_landscape_parameters(technology=segmentation_parameters['technology'],
                              path_landscape_files=path_landscape_files,
                              image_name="dapi_files",
                              tile_size=1000, image_format='.webp',
                              segmentation_approach=segmentation_parameters['segmentation_approach'])

max_pyramid_zoom = get_max_zoom_level(path_image_pyramid)
__all__ = ["landscape", "trx_tile", "boundary_tile"]

landscape_parameters = {
"technology": technology,
"max_pyramid_zoom": max_pyramid_zoom,
"tile_size": tile_size,
"image_info": image_info,
"image_format": image_format
}

path_landscape_parameters = f"{path_landscape_files}/landscape_parameters.json"
"""
This has three checklist items:

with open(path_landscape_parameters, "w") as file:
json.dump(landscape_parameters, file, indent=4)
The files meta_gene.parquet and gene_metadata.parquet are identical; saving the same data twice is a bug.

You'll need a new cell_metadata.parquet, df_sig.parquet, meta_gene.parquet, cbg directory (maybe cbg_my-seg-method)
But not transcript tiles

__all__ = ["landscape", "trx_tile", "boundary_tile"]
the cell_clusters directory and files too -
I would do something like cell_clusters_my-seg-method and leave the files within the same.

1 make a dega.pre method for adding custom segmentation results into the LandscapeFiles
(you can decide where to save them - there will be a new cbg file for instance called cbg_some-custom-segmentation-name),

2 decide on the organization of the LandscapeFiles (just create new adjacent files so we don't break any backwards compatibility -
we can reorganize this when we do a 1.0 release), and

3 make an argument in the Landscape method that lets the user select
which segmentation approach to visualize (the landscape_parameters.json can also have a default segmentation result set up to
establish a default behavior when no argument is given)

"""
5 changes: 4 additions & 1 deletion src/celldega/pre/boundary_tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,10 @@ def make_cell_boundary_tiles(
if not os.path.exists(path_output):
os.makedirs(path_output)

cells_orig = get_cell_polygons(technology, path_cell_boundaries, path_output, path_meta_cell_micron)
if technology == 'custom':
cells_orig = gpd.read_parquet(path_cell_boundaries)
else:
cells_orig = get_cell_polygons(technology, path_cell_boundaries, path_output, path_meta_cell_micron)

# Transform geometries
cells_orig["GEOMETRY"] = batch_transform_geometries(cells_orig["geometry"], transformation_matrix, image_scale)
Expand Down
27 changes: 20 additions & 7 deletions src/celldega/pre/landscape.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,18 @@
# read_cbg_mtx : Read the cell-by-gene matrix from the mtx files.
# save_cbg_gene_parquets : Save the cell-by-gene matrix as gene-specific Parquet files.
# =============================================================================
def to_dense(series):
    """Return a dense copy of ``series`` when it is sparse; otherwise return it unchanged."""
    if isinstance(series.dtype, pd.SparseDtype):
        return series.sparse.to_dense()
    return series

def to_dense_if_sparse(df):
    """Densify any sparse columns of ``df`` in place and return the same frame."""
    sparse_cols = [c for c in df.columns if isinstance(df[c].dtype, pd.SparseDtype)]
    for c in sparse_cols:
        df[c] = df[c].sparse.to_dense()
    return df

def calc_meta_gene_data(cbg):
"""
Expand Down Expand Up @@ -75,14 +87,14 @@ def convert_to_dense(series):
proportion_nonzero = (cbg != 0).sum(axis=0) / len(cbg)

# Create a DataFrame to hold all these metrics

meta_gene = pd.DataFrame(
{
"mean": mean_expression.sparse.to_dense(),
"mean": to_dense(mean_expression),
"std": std_deviation,
"max": max_expression.sparse.to_dense(),
"non-zero": proportion_nonzero.sparse.to_dense(),
"max": to_dense(max_expression),
"non-zero": to_dense(proportion_nonzero),
}

)

meta_gene_clean = pd.DataFrame(meta_gene.values, index=meta_gene.index.tolist(), columns=meta_gene.columns)
Expand Down Expand Up @@ -127,7 +139,7 @@ def read_cbg_mtx(base_path):

return cbg

def save_cbg_gene_parquets(base_path, cbg, verbose=False):
def save_cbg_gene_parquets(base_path, cbg, verbose=False, custom_segmentation_approach=""):
"""
Save the cell-by-gene matrix as gene-specific Parquet files.

Expand All @@ -144,7 +156,7 @@ def save_cbg_gene_parquets(base_path, cbg, verbose=False):
-------
None
"""
output_dir = os.path.join(base_path, "cbg")
output_dir = os.path.join(base_path, f"cbg{custom_segmentation_approach}")
os.makedirs(output_dir, exist_ok=True)

for index, gene in enumerate(cbg.columns):
Expand All @@ -155,7 +167,8 @@ def save_cbg_gene_parquets(base_path, cbg, verbose=False):
col_df = cbg[[gene]].copy()

# Convert to dense and integer type
col_df = col_df.sparse.to_dense().astype(int)

col_df = to_dense_if_sparse(col_df).astype(int)

# Create a DataFrame necessary to prevent error in to_parquet
inst_df = pd.DataFrame(
Expand Down
13 changes: 12 additions & 1 deletion src/celldega/pre/trx_tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,21 @@ def transform_transcript_coordinates(technology, path_trx, chunk_size, transform
pl.col("y_location").alias("y")
])

elif technology == "custom":
trx_ini = pl.read_parquet(path_trx).select([
pl.col("cell_index"),
pl.col("transcript_index"),
pl.col("gene").alias("name"),
pl.col("x"),
pl.col("y")
])


# Process the data in chunks and apply transformations
all_chunks = []

for start_row in tqdm(range(0, trx_ini.height, chunk_size), desc="Processing chunks"):

chunk = trx_ini.slice(start_row, chunk_size)

# Apply transformation matrix to the coordinates
Expand Down Expand Up @@ -163,7 +174,7 @@ def make_trx_tiles(
os.makedirs(path_trx_tiles)

transformation_matrix = np.loadtxt(path_transformation_matrix)

trx = transform_transcript_coordinates(technology, path_trx, chunk_size, transformation_matrix, image_scale)

# Get min and max x, y values
Expand Down
Loading