bug fixes in MtxDirReader and to_mtx. Added tests for mtxtozarr and sparsetozarr
parashardhapola · Jul 18, 2021 · 2abcee8 · 2abcee8
1 parent e0ae195
commit 2abcee8
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 4 deletions.
diff --git a/scarf/readers.py b/scarf/readers.py
@@ -354,14 +354,15 @@ class MtxDirReader(CrReader):
         loc: Path for the directory containing the cellranger output.
         matFn: The file name for the matrix file.
     """
-    def __init__(self, loc, file_type: str = None):
+    def __init__(self, loc, file_type: str = None, mtx_separator: str = ' '):
         """
         Args:
             loc (str): Path for the directory containing the cellranger output.
             file_type (str): Type of sequencing data ('rna' | 'atac')
         """
         self.loc: str = loc.rstrip('/') + '/'
         self.matFn = None
+        self.sep = mtx_separator
         super().__init__(self._handle_version(), file_type)
 
     def _handle_version(self):
@@ -436,7 +437,7 @@ def _subset_by_assay(self, v, assay) -> List:
     # noinspection DuplicatedCode
     def consume(self, batch_size: int, lines_in_mem: int = int(1e5)) -> \
             Generator[List[np.ndarray], None, None]:
-        stream = pd.read_csv(self.matFn, skiprows=3, sep='\t',
+        stream = pd.read_csv(self.matFn, skiprows=3, sep=self.sep,
                              header=None, chunksize=lines_in_mem)
         start = 1
         dfs = []

diff --git a/scarf/tests/test_writers.py b/scarf/tests/test_writers.py
@@ -19,6 +19,15 @@ def test_crtozarr_fromdir(crdir_reader):
     remove(fn)
 
 
+def test_mtxtozarr(mtx_reader):
+    from ..writers import MtxToZarr
+
+    fn = full_path('1K_pbmc_citeseq_dir_mtx.zarr')
+    writer = MtxToZarr(mtx_reader, zarr_fn=fn)
+    writer.dump()
+    remove(fn)
+
+
 def test_h5adtozarr(h5ad_reader):
     from ..writers import H5adToZarr
 
@@ -37,6 +46,25 @@ def test_loomtozarr(loom_reader):
     remove(fn)
 
 
+def test_sparsetozarr():
+    from ..writers import SparseToZarr
+    from scipy.sparse import csr_matrix
+
+    cols = [1, 3, 8, 2, 3, 1, 2, 8, 9]
+    rows = [0, 0, 0, 1, 1, 1, 2, 2, 2]
+    data = [1, 10, 15, 10, 20, 2, 3, 1, 5]
+    mat = (data, (cols, rows))
+    mat = csr_matrix(mat, shape=(10, 3))
+
+    fn = full_path('dummy_sparse.zarr')
+
+    writer = SparseToZarr(mat, zarr_fn=fn,
+                          cell_ids=[f"cell_{x}" for x in range(3)],
+                          feature_ids=[f"feat_{x}" for x in range(10)])
+    writer.dump()
+    remove(fn)
+
+
 def test_to_h5ad(datastore):
     # TODO: Evaluate the resulting H5ad file
     from ..writers import to_h5ad

diff --git a/scarf/writers.py b/scarf/writers.py
@@ -931,7 +931,7 @@ def to_mtx(assay, mtx_directory: str, compress: bool = False):
         features_fn = 'features.tsv.gz'
         h = gzip.open(os.path.join(mtx_directory, 'matrix.mtx.gz'), 'wt')
     else:
-        barcodes_fn = 'barcodes.tsv.gz'
+        barcodes_fn = 'barcodes.tsv'
         features_fn = 'genes.tsv'
         h = open(os.path.join(mtx_directory, 'matrix.mtx'), 'w')
     h.write("%%MatrixMarket matrix coordinate integer general\n% Generated by Scarf\n")
@@ -940,7 +940,7 @@ def to_mtx(assay, mtx_directory: str, compress: bool = False):
     for i in tqdm(assay.rawData.blocks, total=assay.rawData.numblocks[0]):
         i = coo_matrix((i.compute()))
         df = pd.DataFrame({'col': i.col + 1, 'row': i.row + s + 1, 'd': i.data})
-        df.to_csv(h, sep=' ', header=False, index=False, mode='a')
+        df.to_csv(h, sep=' ', header=False, index=False, mode='a', line_terminator='\n')
         s += i.shape[0]
     h.close()
     assay.cells.to_pandas_dataframe(['ids']).to_csv(