Skip to content

Commit d008ce6

Browse files
committed
modified repfree variance
1 parent a774267 commit d008ce6

File tree

1 file changed

+29
-2
lines changed

1 file changed

+29
-2
lines changed

src/snipe/api/multisig_reference_QC.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -680,11 +680,25 @@ def process_sample(self, sample_sig, predict_extra_folds: Optional[List[int]] =
680680
sample_genome_non_repetitive = sample_genome.copy()
681681
sample_genome_non_repetitive &= self.reference_without_repeats
682682
abundance_based_sample_genome_stats = sample_genome_non_repetitive.get_sample_stats
683+
684+
### --------- variance calculation heuristics -------------
685+
_projected_repfree_genomic_abundance_array_length = len(self.reference_without_repeats)
686+
# Extend np array with zeros to match reference length
687+
pad_length = _projected_repfree_genomic_abundance_array_length - len(sample_genome_non_repetitive.abundances)
688+
_custom_genomic_repfree_abundances = np.concatenate([
689+
sample_genome_non_repetitive.abundances,
690+
np.zeros(pad_length)
691+
])
692+
# Remove top 1% of the abundances by value
693+
_custom_genomic_repfree_abundances = np.sort(_custom_genomic_repfree_abundances)
694+
_custom_genomic_repfree_abundances = _custom_genomic_repfree_abundances[:int(len(_custom_genomic_repfree_abundances) * 0.99)]
695+
696+
683697
genome_stats.update({
684698
"Genomic repfree unique k-mers": abundance_based_sample_genome_stats["num_hashes"],
685699
"Genomic repfree k-mers total abundance": abundance_based_sample_genome_stats["total_abundance"],
686-
"Genomic repfree k-mers mean abundance": (abundance_based_sample_genome_stats["total_abundance"] / len(self.reference_without_repeats)
687-
if len(self.reference_without_repeats) > 0 and abundance_based_sample_genome_stats["total_abundance"] is not None else 0),
700+
"Genomic repfree k-mers mean abundance": (abundance_based_sample_genome_stats["total_abundance"] / len(self.reference_without_repeats) if len(self.reference_without_repeats) > 0 and abundance_based_sample_genome_stats["total_abundance"] is not None else 0),
701+
"Genomic repfree k-mers variance abundance": np.var(_custom_genomic_repfree_abundances),
688702
"Genomic repfree k-mers mean abundance - no_zero_cov": abundance_based_sample_genome_stats["mean_abundance"],
689703
"Genomic repfree k-mers median abundance - no_zero_cov": abundance_based_sample_genome_stats["median_abundance"],
690704
"Genomic repfree k-mers coverage index": (abundance_based_sample_genome_stats["num_hashes"] / len(self.reference_without_repeats)
@@ -814,12 +828,25 @@ def process_sample(self, sample_sig, predict_extra_folds: Optional[List[int]] =
814828
self.amplicon_without_repeats = self.amplicon_sig & self.reference_without_repeats
815829
abundance_based_sample_amplicon_stats = sample_amplicon_non_repetitive.get_sample_stats
816830

831+
### --------- amplicon variance calculation heuristics -------------
832+
_projected_repfree_amplicon_abundance_array_length = len(self.amplicon_without_repeats)
833+
# Extend np array with zeros to match amplicon reference length
834+
pad_length = _projected_repfree_amplicon_abundance_array_length - len(sample_amplicon_non_repetitive.abundances)
835+
_custom_amplicon_repfree_abundances = np.concatenate([
836+
sample_amplicon_non_repetitive.abundances,
837+
np.zeros(pad_length)
838+
])
839+
# Remove top 1% of the abundances by value
840+
_custom_amplicon_repfree_abundances = np.sort(_custom_amplicon_repfree_abundances)
841+
_custom_amplicon_repfree_abundances = _custom_amplicon_repfree_abundances[:int(len(_custom_amplicon_repfree_abundances) * 0.99)]
842+
817843
amplicon_stats["Amplicon repfree k-mers total abundance"] = \
818844
abundance_based_sample_amplicon_stats["total_abundance"]
819845
amplicon_stats["Amplicon repfree k-mers mean abundance"] = (
820846
abundance_based_sample_amplicon_stats["total_abundance"] / len(self.amplicon_without_repeats)
821847
if len(self.amplicon_without_repeats) > 0 and abundance_based_sample_amplicon_stats["total_abundance"] is not None else 0
822848
)
849+
amplicon_stats["Amplicon repfree k-mers variance abundance"] = np.var(_custom_amplicon_repfree_abundances)
823850
amplicon_stats["Amplicon repfree k-mers mean abundance - no_zero_cov"] = \
824851
abundance_based_sample_amplicon_stats["mean_abundance"]
825852
amplicon_stats["Amplicon repfree k-mers median abundance - no_zero_cov"] = \

0 commit comments

Comments
 (0)