Skip to content

Commit

Permalink
added CSV reader and writer. message display fix in mark_hvgs
Browse files Browse the repository at this point in the history
  • Loading branch information
parashardhapola committed Aug 12, 2022
1 parent 8bb9601 commit 6ab32e5
Show file tree
Hide file tree
Showing 4 changed files with 326 additions and 22 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.19.8
0.20.0
2 changes: 1 addition & 1 deletion scarf/assay.py
Original file line number Diff line number Diff line change
Expand Up @@ -944,7 +944,7 @@ def col_renamer(x):
for i in slots:
i = col_renamer(i)
if i not in self.feats.columns:
raise KeyError("ERROR: {i} not found in feature metadata")
raise KeyError(f"ERROR: {i} not found in feature metadata")
c_var = self.feats.remove_trend(
col_renamer("avg"), col_renamer("sigmas"), n_bins, lowess_frac
)
Expand Down
185 changes: 185 additions & 0 deletions scarf/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
"H5adReader",
"NaboH5Reader",
"LoomReader",
"CSVReader",
]


Expand Down Expand Up @@ -920,3 +921,187 @@ def consume(self, batch_size: int = 1000) -> Generator[np.ndarray, None, None]:
last_n = i
if last_n < self.nCells:
yield self.h5[self.matrixKey][:, last_n:].astype(self.matrixDtype).T


class CSVReader:
"""
A class to read in data from a CSV file
Args:
csv_fn: Path to the CSV file
has_header: Does the CSV file has a header. (Default value: True)
id_column: The column number which contains row name. (Default value: None)
rows_are_cells: If True then each row represents a cell and hence each column is a feature. If False then each
row is feature and each column in a cell
sep: The column separator in the CSV file (Default value: ',')
skip_rows: Number of rows to skip from the top of the file. (Default value: 0)
skip_cols: Names of columns to skip. Must be provided as a list even if just one column.
(Default value: None)
cell_data_cols: Names of columns to include in cell metadata rather than count matrix. Must be provided as a
list even if just one column. (Default value: None)
batch_size: Number of lines to read at a time. Decrease this value if you have too many columns.
(Default value: 50,000)
pandas_kwargs: A dictionary of keyword arguments to be passed to Pandas read_csv function.
Attributes:
nFeatures: Number of features in dataset.
nCells: Number of cells in dataset.
"""

def __init__(
self,
csv_fn: str,
has_header: bool = True,
id_column: Optional[int] = None,
rows_are_cells: bool = True,
sep: str = ",",
skip_rows: int = 0,
skip_cols: Optional[List[str]] = None,
cell_data_cols: Optional[List[str]] = None,
batch_size=10000,
pandas_kwargs: Optional[dict] = None,
):
self._fn = csv_fn
if rows_are_cells is False:
raise NotImplementedError(
"Currently Scarf supports only those CSV files where cells are along the rows"
)
if pandas_kwargs is None:
pandas_kwargs = {}
else:
if type(pandas_kwargs) != dict:
logger.error("")
if has_header is False:
has_header = None
else:
has_header = 0
self.pandas_kwargs = pandas_kwargs
self.pandas_kwargs["sep"] = sep
self.pandas_kwargs["header"] = has_header
self.pandas_kwargs["skiprows"] = skip_rows
self.pandas_kwargs["chunksize"] = batch_size
self.pandas_kwargs["index_col"] = id_column

if skip_cols is None:
self.skipCols = []
else:
self.skipCols = skip_cols
if cell_data_cols is None:
self.cellDataCols = []
else:
self.cellDataCols = cell_data_cols
(
self.nCells,
self.nFeatures,
self.cellIds,
self.featureIds,
self.keepCols,
self.cellDataDtypes,
self.cellDataIdx,
) = self._consistency_check()

def _get_streamer(self) -> Generator:
return pd.read_csv(self._fn, **self.pandas_kwargs)

def _consistency_check(
self,
) -> Tuple[
int,
int,
Optional[np.ndarray],
Optional[np.ndarray],
Optional[List[int]],
Optional[List[np.dtype]],
Optional[List[int]],
]:
stream = self._get_streamer()
n_cells = 0
n_features = 0
feature_ids = None
cell_data_dtypes = None
cell_data_idx = None
if self.pandas_kwargs["index_col"] is None:
cell_ids = None
else:
cell_ids = []
for df in tqdmbar(stream, desc="Performing CSV file consistency check"):
n_cells += df.shape[0]
if n_features == 0:
n_features = df.shape[1]
if self.pandas_kwargs["header"] is not None:
feature_ids = df.columns.values
if len(feature_ids) != n_features:
raise ValueError(
"Header length not same as number of features. This can happen if you did not"
" skip the right number of rows."
)
if len(self.cellDataCols) > 0:
cell_data_dtypes = list(df[self.cellDataCols].dtypes.values)
cell_data_idx = [
n
for n, x in enumerate(feature_ids)
if x in self.cellDataCols
]
else:
if n_features != df.shape[1]:
raise ValueError(
"Number of columns changed in the CSV during consistency check."
" Maybe a problem with the delimiter."
)
if cell_ids is not None:
cell_ids = np.ndarray(cell_ids)
keep_cols = None
if feature_ids is not None:
skip_names = list(set(self.skipCols).union(self.cellDataCols))
if len(skip_names) > 0:
keep_cols = [
n for n, x in enumerate(feature_ids) if x not in skip_names
]
feature_ids = feature_ids[keep_cols]
n_features = len(keep_cols)
return (
n_cells,
n_features,
cell_ids,
feature_ids,
keep_cols,
cell_data_dtypes,
cell_data_idx,
)

def _n_features(self):
return np.array([f"feature_{x}" for x in range(self.nFeatures)])

def cell_ids(self) -> np.ndarray:
"""
Returns a list of cell IDs.
"""
if self.cellIds is None:
return np.array([f"cell_{x}" for x in range(self.nCells)])
else:
return self.cellIds

def feature_ids(self) -> np.ndarray:
"""
Returns a list of feature IDs.
"""
if self.featureIds is None:
return np.array([f"feature_{x}" for x in range(self.nFeatures)])
else:
return self.featureIds

def consume(self) -> Generator[Tuple[np.ndarray, Optional[np.ndarray]], None, None]:
"""
Returns a generator that yield chunks of data.
"""
stream = self._get_streamer()
if self.keepCols is None:
for df in stream:
yield df.values, None
else:
if self.cellDataIdx is not None:
for df in stream:
yield df.values[:, self.keepCols], df.values[:, self.cellDataIdx]
else:
for df in stream:
yield df.values[:, self.keepCols], None
Loading

0 comments on commit 6ab32e5

Please sign in to comment.