Skip to content

Commit

Permalink
Bug fixes in MtxDirReader and to_mtx. Added tests for mtxtozarr and sparsetozarr
Browse files Browse the repository at this point in the history
  • Loading branch information
parashardhapola committed Jul 18, 2021
1 parent e0ae195 commit 2abcee8
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 4 deletions.
5 changes: 3 additions & 2 deletions scarf/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,14 +354,15 @@ class MtxDirReader(CrReader):
loc: Path for the directory containing the cellranger output.
matFn: The file name for the matrix file.
"""
def __init__(self, loc, file_type: str = None):
def __init__(self, loc, file_type: str = None, mtx_separator: str = ' '):
    """
    Set up the reader's path and separator, then initialise the parent class.

    Args:
        loc (str): Path for the directory containing the cellranger output.
        file_type (str): Type of sequencing data ('rna' | 'atac')
        mtx_separator (str): Separator stored on the reader as ``self.sep``.
            Defaults to a single space.
    """
    # Strip any trailing slashes, then add exactly one back.
    trimmed_loc = loc.rstrip('/')
    self.loc: str = trimmed_loc + '/'
    self.sep = mtx_separator
    # matFn is initialised to None before `_handle_version()` is evaluated.
    self.matFn = None
    super().__init__(self._handle_version(), file_type)

def _handle_version(self):
Expand Down Expand Up @@ -436,7 +437,7 @@ def _subset_by_assay(self, v, assay) -> List:
# noinspection DuplicatedCode
def consume(self, batch_size: int, lines_in_mem: int = int(1e5)) -> \
Generator[List[np.ndarray], None, None]:
stream = pd.read_csv(self.matFn, skiprows=3, sep='\t',
stream = pd.read_csv(self.matFn, skiprows=3, sep=self.sep,
header=None, chunksize=lines_in_mem)
start = 1
dfs = []
Expand Down
28 changes: 28 additions & 0 deletions scarf/tests/test_writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,15 @@ def test_crtozarr_fromdir(crdir_reader):
remove(fn)


def test_mtxtozarr(mtx_reader):
    """Dump the `mtx_reader` fixture to a Zarr file via MtxToZarr, then delete it."""
    from ..writers import MtxToZarr

    zarr_path = full_path('1K_pbmc_citeseq_dir_mtx.zarr')
    MtxToZarr(mtx_reader, zarr_fn=zarr_path).dump()
    # Clean up the output written by this test.
    remove(zarr_path)


def test_h5adtozarr(h5ad_reader):
from ..writers import H5adToZarr

Expand All @@ -37,6 +46,25 @@ def test_loomtozarr(loom_reader):
remove(fn)


def test_sparsetozarr():
    """Dump a small hand-built 10 x 3 CSR matrix to Zarr via SparseToZarr.

    The matrix has 10 rows and 3 columns and is written with 10 feature ids
    and 3 cell ids. The output file is removed once the dump has finished.
    """
    from ..writers import SparseToZarr
    from scipy.sparse import csr_matrix

    # Nine non-zero entries in coordinate form: values[k] sits at
    # (row_idx[k], col_idx[k]).
    values = [1, 10, 15, 10, 20, 2, 3, 1, 5]
    row_idx = [1, 3, 8, 2, 3, 1, 2, 8, 9]
    col_idx = [0, 0, 0, 1, 1, 1, 2, 2, 2]
    sparse_mat = csr_matrix((values, (row_idx, col_idx)), shape=(10, 3))

    cell_names = [f"cell_{x}" for x in range(3)]
    feature_names = [f"feat_{x}" for x in range(10)]
    zarr_path = full_path('dummy_sparse.zarr')

    SparseToZarr(
        sparse_mat,
        zarr_fn=zarr_path,
        cell_ids=cell_names,
        feature_ids=feature_names,
    ).dump()
    # Clean up the output written by this test.
    remove(zarr_path)


def test_to_h5ad(datastore):
# TODO: Evaluate the resulting H5ad file
from ..writers import to_h5ad
Expand Down
4 changes: 2 additions & 2 deletions scarf/writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -931,7 +931,7 @@ def to_mtx(assay, mtx_directory: str, compress: bool = False):
features_fn = 'features.tsv.gz'
h = gzip.open(os.path.join(mtx_directory, 'matrix.mtx.gz'), 'wt')
else:
barcodes_fn = 'barcodes.tsv.gz'
barcodes_fn = 'barcodes.tsv'
features_fn = 'genes.tsv'
h = open(os.path.join(mtx_directory, 'matrix.mtx'), 'w')
h.write("%%MatrixMarket matrix coordinate integer general\n% Generated by Scarf\n")
Expand All @@ -940,7 +940,7 @@ def to_mtx(assay, mtx_directory: str, compress: bool = False):
for i in tqdm(assay.rawData.blocks, total=assay.rawData.numblocks[0]):
i = coo_matrix((i.compute()))
df = pd.DataFrame({'col': i.col + 1, 'row': i.row + s + 1, 'd': i.data})
df.to_csv(h, sep=' ', header=False, index=False, mode='a')
df.to_csv(h, sep=' ', header=False, index=False, mode='a', line_terminator='\n')
s += i.shape[0]
h.close()
assay.cells.to_pandas_dataframe(['ids']).to_csv(
Expand Down

0 comments on commit 2abcee8

Please sign in to comment.