
Commit 2675856

Merge pull request #12 from snipe-bio/minor_fixes_qc
2 parents: 3082e02 + a46d927

4 files changed: +116 −38 lines changed

src/snipe/api/reference_QC.py

Lines changed: 18 additions & 4 deletions
@@ -10,7 +10,6 @@
 import os
 import requests
 from tqdm import tqdm
-import cgi
 from urllib.parse import urlparse
 from typing import Optional
 import sourmash
@@ -1027,7 +1026,7 @@ def saturation_model(x, a, b):
         predicted_coverage = min(predicted_coverage, max_coverage)

         self.logger.debug("Predicted coverage at %.2f-fold increase: %f", extra_fold, predicted_coverage)
-        return predicted_coverage
+        return float(predicted_coverage)

     def calculate_chromosome_metrics(self, chr_to_sig: Dict[str, SnipeSig]) -> Dict[str, Any]:
         r"""
@@ -1110,8 +1109,23 @@ def calculate_chromosome_metrics(self, chr_to_sig: Dict[str, SnipeSig]) -> Dict[
             chr_stats = chr_sample_sig.get_sample_stats
             chr_to_mean_abundance[chr_name] = chr_stats["mean_abundance"]
             self.logger.debug("\t-Mean abundance for %s: %f", chr_name, chr_stats["mean_abundance"])
-
-        self.chrs_stats.update(chr_to_mean_abundance)
+
+        # chromosomes are numbered from 1 to ..., sort them by number (might be string for sex chromosomes) then prefix them with chr-
+        def sort_chromosomes(chr_name):
+            try:
+                # Try to convert to integer for numeric chromosomes
+                return (0, int(chr_name))
+            except ValueError:
+                # Non-numeric chromosomes (like 'x', 'y', 'z', etc.)
+                return (1, chr_name)
+
+        # Create a new dictionary with sorted chromosome names prefixed with 'chr-'
+        sorted_chr_to_mean_abundance = {
+            f"chr-{chr_name}": chr_to_mean_abundance[chr_name]
+            for chr_name in sorted(chr_to_mean_abundance, key=sort_chromosomes)
+        }
+
+        self.chrs_stats.update(sorted_chr_to_mean_abundance)

         # chr_to_mean_abundance but without any chr with partial name sex
         autosomal_chr_to_mean_abundance = {}
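
For context, a minimal sketch of the ordering this sort key produces, using a made-up chromosome list (the names below are illustrative, not taken from the codebase):

# Illustrative only: shows the numeric-first, then lexical ordering produced
# by a sort key like the one added in calculate_chromosome_metrics.
def sort_chromosomes(chr_name):
    try:
        return (0, int(chr_name))   # numeric chromosomes first, sorted by value
    except ValueError:
        return (1, chr_name)        # non-numeric (e.g. sex chromosomes) after, alphabetically

chromosomes = ["10", "2", "x", "1", "y"]
print([f"chr-{c}" for c in sorted(chromosomes, key=sort_chromosomes)])
# ['chr-1', 'chr-2', 'chr-10', 'chr-x', 'chr-y']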

src/snipe/api/snipe_sig.py

Lines changed: 31 additions & 6 deletions
@@ -1,10 +1,12 @@
 import heapq
 import logging
+
+import sourmash.save_load
 from snipe.api.enums import SigType
 from typing import Any, Dict, Iterator, List, Optional, Union
 import numpy as np
 import sourmash
-
+import os

 # Configure the root logger to CRITICAL to suppress unwanted logs by default
 logging.basicConfig(level=logging.CRITICAL)
@@ -150,6 +152,13 @@ def __init__(self, *, sourmash_sig: Union[str, sourmash.signature.SourmashSignat
         self._name = _sourmash_sig.name
         self._filename = _sourmash_sig.filename
         self._track_abundance = _sourmash_sig.minhash.track_abundance
+
+        if self._name.endswith("-snipesample"):
+            self._name = self._name.replace("-snipesample", "")
+            self.logger.debug("Found a sample signature with the snipe suffix `-snipesample`. Restoring original name `%s`.", self._name)
+        elif self._name.endswith("-snipeamplicon"):
+            self._name = self._name.replace("-snipeamplicon", "")
+            self.logger.debug("Found an amplicon signature with the snipe suffix `-snipeamplicon`. Restoring original name `%s`.", self._name)

         # If the signature does not track abundance, assume abundance of 1 for all hashes
         if not self._track_abundance:
@@ -485,16 +494,34 @@ def _convert_to_sourmash_signature(self):
         self.sourmash_sig = sourmash.signature.SourmashSignature(mh, name=self._name, filename=self._filename)
         self.logger.debug("Conversion to sourmash.signature.SourmashSignature completed.")

-    def export(self, path) -> None:
+    def export(self, path, force=False) -> None:
         r"""
         Export the signature to a file.

         Parameters:
             path (str): The path to save the signature to.
+            force (bool): Flag to overwrite the file if it already exists.
         """
         self._convert_to_sourmash_signature()
-        with open(str(path), "wb") as fp:
-            sourmash.signature.save_signatures_to_json([self.sourmash_sig], fp)
+        if path.endswith(".sig"):
+            self.logger.debug("Exporting signature to a .sig file.")
+            with open(str(path), "wb") as fp:
+                sourmash.signature.save_signatures_to_json([self.sourmash_sig], fp)
+                # sourmash.save_load.SaveSignatures_SigFile
+
+        elif path.endswith(".zip"):
+            if os.path.exists(path):
+                raise FileExistsError("Output file already exists.")
+            try:
+                with sourmash.save_load.SaveSignatures_ZipFile(path) as save_sigs:
+                    save_sigs.add(self.sourmash_sig)
+            except Exception as e:
+                logging.error("Failed to export signatures to zip: %s", e)
+                raise Exception(f"Failed to export signatures to zip: {e}") from e
+        else:
+            raise ValueError("Output file must be either a .sig or .zip file.")
+
+

     def export_to_string(self):
         r"""
@@ -924,8 +951,6 @@ def sum_signatures(cls, signatures: List['SnipeSig'], name: str = "summed_signat
         for sig in signatures[1:]:
             if sig.ksize != ksize or sig.scale != scale:
                 raise ValueError("All signatures must have the same ksize and scale.")
-            if sig.track_abundance != track_abundance:
-                raise ValueError("All signatures must have the same track_abundance setting.")

         # Initialize iterators for each signature's hashes and abundances
         iterators = []
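
As a usage sketch of the extended export method, which now dispatches on the output extension (illustrative only; `sig` is assumed to be an existing SnipeSig instance and the file names are hypothetical):

# Assuming `sig` is an already-constructed SnipeSig instance.
sig.export("sample_copy.sig")   # writes a JSON .sig file via save_signatures_to_json
sig.export("sample_copy.zip")   # writes a sourmash zip collection via SaveSignatures_ZipFile;
                                # raises FileExistsError if the path already exists
# sig.export("sample_copy.txt") would raise ValueError: only .sig and .zip are accepted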

src/snipe/cli/cli_qc.py

Lines changed: 66 additions & 27 deletions
@@ -14,6 +14,8 @@
 from snipe.api.snipe_sig import SnipeSig
 from snipe.api.reference_QC import ReferenceQC

+import concurrent.futures
+import signal

 def process_sample(sample_path: str, ref_path: str, amplicon_path: Optional[str],
                    advanced: bool, roi: bool, debug: bool,
@@ -426,7 +428,7 @@ def qc(ref: str, sample: List[str], samples_from_file: Optional[str],
     if samples_from_file:
         logger.debug(f"Reading samples from file: {samples_from_file}")
         try:
-            with open(samples_from_file, encoding='utf-8') as f:
+            with open(samples_from_file, 'r', encoding='utf-8') as f:
                 file_samples = {line.strip() for line in f if line.strip()}
                 samples_set.update(file_samples)
                 logger.debug(f"Collected {len(file_samples)} samples from file.")
@@ -492,32 +494,58 @@ def qc(ref: str, sample: List[str], samples_from_file: Optional[str],
             "ychr": ychr,
             "vars_paths": vars_paths
         })
+
+    results = []

+    # Define a handler for graceful shutdown
+    def shutdown(signum, frame):
+        logger.warning("Shutdown signal received. Terminating all worker processes...")
+        executor.shutdown(wait=False, cancel_futures=True)
+        sys.exit(1)
+
+    # Register signal handlers
+    signal.signal(signal.SIGINT, shutdown)
+    signal.signal(signal.SIGTERM, shutdown)

-    # Process samples in parallel with progress bar
-    results = []
-    with concurrent.futures.ProcessPoolExecutor(max_workers=cores) as executor:
-        # Submit all tasks
-        futures = {
-            executor.submit(process_sample, **args): args for args in dict_process_args
-        }
-        # Iterate over completed futures with a progress bar
-        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing samples"):
-            sample = futures[future]
-            try:
-                result = future.result()
-                results.append(result)
-            except Exception as exc:
-                logger.error(f"Sample {sample} generated an exception: {exc}")
-                results.append({
-                    "sample": os.path.splitext(os.path.basename(sample))[0],
-                    "file_path": sample,
-                    "QC_Error": str(exc)
-                })
-
-    # Create pandas DataFrame
-    logger.info("Aggregating results into DataFrame.")
-    df = pd.DataFrame(results)
+    try:
+        with concurrent.futures.ProcessPoolExecutor(max_workers=cores) as executor:
+            futures = {
+                executor.submit(process_sample, **args): args for args in dict_process_args
+            }
+
+            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing samples"):
+                sample = futures[future]
+                try:
+                    result = future.result()
+                    results.append(result)
+                except Exception as exc:
+                    logger.error(f"Sample {sample['sample_path']} generated an exception: {exc}")
+                    results.append({
+                        "sample": os.path.splitext(os.path.basename(sample['sample_path']))[0],
+                        "file_path": sample['sample_path'],
+                        "QC_Error": str(exc)
+                    })
+    except KeyboardInterrupt:
+        logger.warning("KeyboardInterrupt received. Shutting down...")
+        sys.exit(1)
+    except Exception as e:
+        logger.error(f"An unexpected error occurred: {e}")
+        sys.exit(1)
+
+    # Separate successful and failed results
+    succeeded = [res for res in results if "QC_Error" not in res]
+    failed = [res for res in results if "QC_Error" in res]
+
+    # Handle complete failure
+    if len(succeeded) == 0:
+        logger.error("All samples failed during QC processing. Output TSV will not be generated.")
+        sys.exit(1)
+
+    # Prepare the command-line invocation for comments
+    command_invocation = ' '.join(sys.argv)
+
+    # Create pandas DataFrame for succeeded samples
+    df = pd.DataFrame(succeeded)

     # Reorder columns to have 'sample' and 'file_path' first, if they exist
     cols = list(df.columns)
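
The reworked parallel section follows a common pattern: register SIGINT/SIGTERM handlers that cancel pending futures, then submit work to a ProcessPoolExecutor. A standalone sketch of that pattern (the worker function and values here are placeholders, not the CLI's own code):

import concurrent.futures
import signal
import sys

def do_work(x):
    # Placeholder task standing in for process_sample; must be module-level so it is picklable.
    return x * x

def main():
    # Create the executor first so the shutdown closure can reference it.
    executor = concurrent.futures.ProcessPoolExecutor(max_workers=2)

    def shutdown(signum, frame):
        # cancel_futures requires Python >= 3.9
        executor.shutdown(wait=False, cancel_futures=True)
        sys.exit(1)

    signal.signal(signal.SIGINT, shutdown)
    signal.signal(signal.SIGTERM, shutdown)

    futures = {executor.submit(do_work, i): i for i in range(10)}
    results = [f.result() for f in concurrent.futures.as_completed(futures)]
    executor.shutdown()
    print(sorted(results))

if __name__ == "__main__":
    main()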
@@ -529,14 +557,25 @@ def qc(ref: str, sample: List[str], samples_from_file: Optional[str],
         reordered_cols += cols
         df = df[reordered_cols]

-    # Export to TSV
+    # Export to TSV with comments
     try:
-        df.to_csv(output, sep='\t', index=False)
+        with open(output, 'w', encoding='utf-8') as f:
+            # Write comment with command invocation
+            f.write(f"# Command: {command_invocation}\n")
+            # Write the DataFrame to the file
+            df.to_csv(f, sep='\t', index=False)
         logger.info(f"QC results successfully exported to {output}")
     except Exception as e:
         logger.error(f"Failed to export QC results to {output}: {e}")
         sys.exit(1)

+    # Report failed samples if any
+    if failed:
+        failed_samples = [res['sample'] for res in failed]
+        logger.warning(f"The following {len(failed_samples)} sample(s) failed during QC processing:")
+        for sample in failed_samples:
+            logger.warning(f"- {sample}")
+
     end_time = time.time()
     elapsed_time = end_time - start_time
     logger.info(f"QC process completed in {elapsed_time:.2f} seconds.")

tests/api/test_reference_qc.py

Lines changed: 1 addition & 1 deletion
@@ -611,7 +611,7 @@ def test_predict_coverage_already_full_coverage(self):
         )
         current_coverage = qc.genome_stats["Genome coverage index"]
         self.assertEqual(current_coverage, 1.0)
-        predicted_coverage = qc.predict_coverage(extra_fold=1.0, n=10)
+        predicted_coverage = qc.predict_coverage(extra_fold=2.0)
         # Predicted coverage should still be 1.0
         self.assertEqual(predicted_coverage, 1.0)
