@@ -680,11 +680,25 @@ def process_sample(self, sample_sig, predict_extra_folds: Optional[List[int]] =
680
680
sample_genome_non_repetitive = sample_genome .copy ()
681
681
sample_genome_non_repetitive &= self .reference_without_repeats
682
682
abundance_based_sample_genome_stats = sample_genome_non_repetitive .get_sample_stats
683
+
684
+ ### --------- variance calculation heuristics -------------
685
+ _projected_repfree_genomic_abundance_array_length = len (self .reference_without_repeats )
686
+ # Extend np array with zeros to match reference length
687
+ pad_length = _projected_repfree_genomic_abundance_array_length - len (sample_genome_non_repetitive .abundances )
688
+ _custom_genomic_repfree_abundances = np .concatenate ([
689
+ sample_genome_non_repetitive .abundances ,
690
+ np .zeros (pad_length )
691
+ ])
692
+ # Remove top 1% of the abundances by value
693
+ _custom_genomic_repfree_abundances = np .sort (_custom_genomic_repfree_abundances )
694
+ _custom_genomic_repfree_abundances = _custom_genomic_repfree_abundances [:int (len (_custom_genomic_repfree_abundances ) * 0.99 )]
695
+
696
+
683
697
genome_stats .update ({
684
698
"Genomic repfree unique k-mers" : abundance_based_sample_genome_stats ["num_hashes" ],
685
699
"Genomic repfree k-mers total abundance" : abundance_based_sample_genome_stats ["total_abundance" ],
686
- "Genomic repfree k-mers mean abundance" : (abundance_based_sample_genome_stats ["total_abundance" ] / len (self .reference_without_repeats )
687
- if len ( self . reference_without_repeats ) > 0 and abundance_based_sample_genome_stats [ "total_abundance" ] is not None else 0 ),
700
+ "Genomic repfree k-mers mean abundance" : (abundance_based_sample_genome_stats ["total_abundance" ] / len (self .reference_without_repeats ) if len ( self . reference_without_repeats ) > 0 and abundance_based_sample_genome_stats [ "total_abundance" ] is not None else 0 ),
701
+ "Genomic repfree k-mers variance abundance" : np . var ( _custom_genomic_repfree_abundances ),
688
702
"Genomic repfree k-mers mean abundance - no_zero_cov" : abundance_based_sample_genome_stats ["mean_abundance" ],
689
703
"Genomic repfree k-mers median abundance - no_zero_cov" : abundance_based_sample_genome_stats ["median_abundance" ],
690
704
"Genomic repfree k-mers coverage index" : (abundance_based_sample_genome_stats ["num_hashes" ] / len (self .reference_without_repeats )
@@ -814,12 +828,25 @@ def process_sample(self, sample_sig, predict_extra_folds: Optional[List[int]] =
814
828
self .amplicon_without_repeats = self .amplicon_sig & self .reference_without_repeats
815
829
abundance_based_sample_amplicon_stats = sample_amplicon_non_repetitive .get_sample_stats
816
830
831
+ ### --------- amplicon variance calculation heuristics -------------
832
+ _projected_repfree_amplicon_abundance_array_length = len (self .amplicon_without_repeats )
833
+ # Extend np array with zeros to match amplicon reference length
834
+ pad_length = _projected_repfree_amplicon_abundance_array_length - len (sample_amplicon_non_repetitive .abundances )
835
+ _custom_amplicon_repfree_abundances = np .concatenate ([
836
+ sample_amplicon_non_repetitive .abundances ,
837
+ np .zeros (pad_length )
838
+ ])
839
+ # Remove top 1% of the abundances by value
840
+ _custom_amplicon_repfree_abundances = np .sort (_custom_amplicon_repfree_abundances )
841
+ _custom_amplicon_repfree_abundances = _custom_amplicon_repfree_abundances [:int (len (_custom_amplicon_repfree_abundances ) * 0.99 )]
842
+
817
843
amplicon_stats ["Amplicon repfree k-mers total abundance" ] = \
818
844
abundance_based_sample_amplicon_stats ["total_abundance" ]
819
845
amplicon_stats ["Amplicon repfree k-mers mean abundance" ] = (
820
846
abundance_based_sample_amplicon_stats ["total_abundance" ] / len (self .amplicon_without_repeats )
821
847
if len (self .amplicon_without_repeats ) > 0 and abundance_based_sample_amplicon_stats ["total_abundance" ] is not None else 0
822
848
)
849
+ amplicon_stats ["Amplicon repfree k-mers variance abundance" ] = np .var (_custom_amplicon_repfree_abundances )
823
850
amplicon_stats ["Amplicon repfree k-mers mean abundance - no_zero_cov" ] = \
824
851
abundance_based_sample_amplicon_stats ["mean_abundance" ]
825
852
amplicon_stats ["Amplicon repfree k-mers median abundance - no_zero_cov" ] = \
0 commit comments