Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Move inferring of spectrum path to parsing psm list? #99

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions ms2rescore/feature_generators/ms2pip.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
from rich.progress import track

from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException
from ms2rescore.utils import infer_spectrum_path

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -184,12 +183,10 @@ def add_features(self, psm_list: PSMList) -> None:
f"Running MS²PIP for PSMs from run ({current_run}/{total_runs}) `{run}`..."
)
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
spectrum_filename = infer_spectrum_path(self.spectrum_path, run)
logger.debug(f"Using spectrum file `{spectrum_filename}`")
try:
ms2pip_results = correlate(
psms=psm_list_run,
spectrum_file=spectrum_filename,
spectrum_file=run, # Run has already been mapped to a path
spectrum_id_pattern=self.spectrum_id_pattern,
model=self.model,
ms2_tolerance=self.ms2_tolerance,
Expand Down
86 changes: 85 additions & 1 deletion ms2rescore/parse_psms.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import logging
import re
from typing import Dict, Union
from typing import Dict, Union, Optional
from pathlib import Path
from glob import glob

import numpy as np
import psm_utils.io
from psm_utils import PSMList

Expand Down Expand Up @@ -52,6 +55,17 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: s
new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
psm_list["spectrum_id"] = new_ids

# Add filename if all values are none
# if (psm_list["run"] == None).all(): # noqa: E711
# # Map inferred spectrum paths
spectrum_path_mapping = {
run: infer_spectrum_path(configured_path=config["spectrum_path"], run_name=run)
for run in set(psm_list["run"])
}
logger.debug(f"Mapped PSM list runs to spectrum file paths: {spectrum_path_mapping}")
psm_list["run"] = np.vectorize(spectrum_path_mapping.get)(psm_list["run"])
exit()

# TODO: Temporary fix until implemented in psm_utils
# Ensure that spectrum IDs are strings (Pydantic 2.0 does not coerce int to str)
psm_list["spectrum_id"] = [str(spec_id) for spec_id in psm_list["spectrum_id"]]
Expand Down Expand Up @@ -117,3 +131,73 @@ def _match_psm_ids(old_id, regex_pattern):
"`psm_id_pattern` could not be matched to all PSM spectrum IDs."
" Ensure that the regex contains a capturing group?"
)


def infer_spectrum_path(
    configured_path: Union[str, Path, None],
    run_name: Optional[str] = None,
) -> Union[str, Path]:
    """
    Infer spectrum path from passed path and expected filename (e.g. from PSM file).

    Parameters
    ----------
    configured_path: str, Path, None
        User-defined path to spectrum file or directory containing spectrum file
    run_name : str, optional
        MS run name (stem of spectrum filename), e.g., as expected from PSM file.

    Returns
    -------
    str
        POSIX-style path to the resolved spectrum file. If the resolved path does not
        already end in ``.mgf`` or ``.mzml`` (case-insensitive), the first existing
        file (in sorted order) that starts with the resolved path and has one of
        these extensions is returned.

    Raises
    ------
    MS2RescoreConfigurationError
        If neither a usable ``configured_path`` nor a ``run_name`` is available, if
        ``configured_path`` is a directory but no ``run_name`` is given, if
        ``configured_path`` is neither an existing file nor an existing directory, or
        if no file with a supported extension can be found.

    """
    # Local import: only the ``glob`` function itself is imported at module level.
    from glob import escape as glob_escape

    # If no spectrum path configured, use expected run_name in default dir
    if not configured_path:
        if not run_name:
            raise MS2RescoreConfigurationError(
                "Could not resolve spectrum file name: No spectrum path configured "
                "and no run name in PSM file found."
            )
        resolved_path = Path(".").joinpath(run_name)

    else:
        configured_path = Path(configured_path)
        # If passed path is directory, join with run name
        if configured_path.is_dir():
            if not run_name:
                raise MS2RescoreConfigurationError(
                    "Could not resolve spectrum file name: Spectrum path is directory "
                    "but no run name in PSM file found."
                )
            resolved_path = configured_path.joinpath(run_name)

        # If passed path is file, use that, but warn if basename doesn't match expected
        elif configured_path.is_file():
            if run_name and configured_path.stem != Path(run_name).stem:
                logger.warning(
                    "Passed spectrum path (`%s`) does not match run name found in PSM "
                    "file (`%s`). Continuing with passed spectrum path.",
                    configured_path,
                    run_name,
                )
            resolved_path = configured_path
        else:
            raise MS2RescoreConfigurationError(
                "Configured `spectrum_path` must be `None` or a path to an existing file "
                "or directory. If `None` or path to directory, spectrum run information "
                "should be present in the PSM file."
            )

    # ``re`` and string concatenation need ``str``; ``resolved_path`` is a ``Path`` here.
    resolved_str = str(resolved_path)
    # ``re.search`` with an escaped dot: the extension must be matched at the END of
    # the path (``re.match`` would anchor the pattern at the start of the string).
    extension_pattern = re.compile(r"\.(mgf|mzml)$", flags=re.IGNORECASE)

    # Match with file extension if not in resolved_path yet
    if not extension_pattern.search(resolved_str):
        # Escape glob metacharacters in the path itself (e.g. `[` in run names) and
        # sort the candidates so that the selected file is deterministic.
        candidates = sorted(glob(glob_escape(resolved_str) + "*"))
        matching_file = next(
            (name for name in candidates if extension_pattern.search(name)), None
        )
        if matching_file is None:
            raise MS2RescoreConfigurationError(
                f"Resolved spectrum filename '{resolved_str}' does not contain a supported file "
                "extension (mgf or mzml) and could not find any matching existing "
                "files."
            )
        resolved_str = matching_file

    return Path(resolved_str).as_posix()
78 changes: 0 additions & 78 deletions ms2rescore/utils.py

This file was deleted.

Loading