Skip to content

Commit 1f34958

Browse files
committed
Split speech features normalize
1 parent b62dd46 commit 1f34958

2 files changed

Lines changed: 158 additions & 136 deletions

File tree

shared/api/speech_features.hpp

Lines changed: 6 additions & 136 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
#include <math/dlib/stft_norm.hpp>
88
#include "nemo_mel_spectrogram.h"
99

10+
#include "speech_features_normalize.hpp"
11+
1012
#ifndef M_PI
1113
#define M_PI 3.14159265358979323846
1214
#endif
@@ -660,141 +662,9 @@ class Phi4AudioEmbed {
660662
int64_t qformer_compression_rate_{1};
661663
};
662664

663-
// Per-feature (per-mel-bin) normalization: for each feature row,
664-
// compute mean and std across time, then normalize.
665-
// Input: [1, num_features, num_frames] (feature_first) or [1, num_frames, num_features]
666-
// Output: same shape, normalized.
667-
class PerFeatureNormalize {
668-
public:
669-
template <typename DictT>
670-
OrtxStatus Init(const DictT& attrs) {
671-
for (const auto& [key, value] : attrs) {
672-
if (key == "eps") {
673-
eps_ = static_cast<float>(std::get<double>(value));
674-
} else if (key == "feature_first") {
675-
feature_first_ = std::get<int64_t>(value);
676-
} else if (key != "_comment") {
677-
return {kOrtxErrorInvalidArgument, "[PerFeatureNormalize]: Invalid key in the JSON configuration."};
678-
}
679-
}
680-
return {};
681-
}
682-
683-
OrtxStatus Compute(const ortc::Tensor<float>& input, ortc::Tensor<float>& output) {
684-
const auto& shape = input.Shape();
685-
int64_t num_features, num_frames;
686-
687-
if (shape.size() == 2) {
688-
// 2D: [features, frames] or [frames, features]
689-
num_features = feature_first_ ? shape[0] : shape[1];
690-
num_frames = feature_first_ ? shape[1] : shape[0];
691-
} else if (shape.size() == 3 && shape[0] == 1) {
692-
// 3D: [1, features, frames] or [1, frames, features]
693-
num_features = feature_first_ ? shape[1] : shape[2];
694-
num_frames = feature_first_ ? shape[2] : shape[1];
695-
} else {
696-
return {kOrtxErrorInvalidArgument, "[PerFeatureNormalize]: Expected input shape [features, frames] or [1, features, frames]."};
697-
}
698-
699-
const float* in_data = input.Data();
700-
float* out_data = output.Allocate(shape);
701-
702-
// Copy input to output first
703-
std::memcpy(out_data, in_data, num_features * num_frames * sizeof(float));
665+
// PerFeatureNormalize and NemoLogMel have been moved to
666+
// "speech_features_normalize.hpp" (included above). They remain available
667+
// to consumers of this header unchanged.
704668

705-
// Need at least 2 frames for sample std (N-1 denominator)
706-
if (num_frames <= 1) {
707-
// Single frame or empty: output zeros (value - mean = 0 for constant)
708-
std::memset(out_data, 0, num_features * num_frames * sizeof(float));
709-
return {};
710-
}
711-
712-
for (int64_t f = 0; f < num_features; ++f) {
713-
// Compute mean
714-
float sum = 0.0f;
715-
for (int64_t t = 0; t < num_frames; ++t) {
716-
int64_t idx = feature_first_ ? (f * num_frames + t) : (t * num_features + f);
717-
sum += out_data[idx];
718-
}
719-
float mean = sum / static_cast<float>(num_frames);
720-
721-
// Compute std (sample std, divide by N-1)
722-
float var_sum = 0.0f;
723-
for (int64_t t = 0; t < num_frames; ++t) {
724-
int64_t idx = feature_first_ ? (f * num_frames + t) : (t * num_features + f);
725-
float d = out_data[idx] - mean;
726-
var_sum += d * d;
727-
}
728-
float std_val = std::sqrt(var_sum / static_cast<float>(num_frames - 1)) + eps_;
729-
730-
// Normalize
731-
for (int64_t t = 0; t < num_frames; ++t) {
732-
int64_t idx = feature_first_ ? (f * num_frames + t) : (t * num_features + f);
733-
out_data[idx] = (out_data[idx] - mean) / std_val;
734-
}
735-
}
736-
737-
return {};
738-
}
739-
740-
private:
741-
float eps_{1e-5f};
742-
int64_t feature_first_{1}; // 1 = [1, features, frames], 0 = [1, frames, features]
743-
};
744-
745-
// NeMo-compatible log-mel spectrogram kernel.
746-
// Wraps nemo_mel::NemoComputeLogMelBatch for use in the SpeechFeatureExtractor pipeline.
747-
// Input: [num_samples] or [1, num_samples] float32 PCM audio
748-
// Output: [num_mels, num_frames] float32 log-mel spectrogram per example;
749-
// StackTensors adds the batch dimension later in the pipeline.
750-
class NemoLogMel {
751-
public:
752-
template <typename DictT>
753-
OrtxStatus Init(const DictT& attrs) {
754-
for (const auto& [key, value] : attrs) {
755-
if (key == "num_mels") {
756-
cfg_.num_mels = static_cast<int>(std::get<int64_t>(value));
757-
} else if (key == "fft_size") {
758-
cfg_.fft_size = static_cast<int>(std::get<int64_t>(value));
759-
} else if (key == "hop_length") {
760-
cfg_.hop_length = static_cast<int>(std::get<int64_t>(value));
761-
} else if (key == "win_length") {
762-
cfg_.win_length = static_cast<int>(std::get<int64_t>(value));
763-
} else if (key == "sample_rate") {
764-
cfg_.sample_rate = static_cast<int>(std::get<int64_t>(value));
765-
} else if (key == "preemph") {
766-
cfg_.preemph = static_cast<float>(std::get<double>(value));
767-
} else if (key == "log_eps") {
768-
cfg_.log_eps = static_cast<float>(std::get<double>(value));
769-
} else if (key != "_comment") {
770-
return {kOrtxErrorInvalidArgument, "[NemoLogMel]: Invalid key in the JSON configuration."};
771-
}
772-
}
773-
return {};
774-
}
775-
776-
OrtxStatus Compute(const ortc::Tensor<float>& pcm, ortc::Tensor<float>& logmel) {
777-
const auto& shape = pcm.Shape();
778-
size_t num_samples;
779-
if (shape.size() == 1) {
780-
num_samples = static_cast<size_t>(shape[0]);
781-
} else if (shape.size() == 2 && shape[0] == 1) {
782-
num_samples = static_cast<size_t>(shape[1]);
783-
} else {
784-
return {kOrtxErrorInvalidArgument, "[NemoLogMel]: Expected input shape [num_samples] or [1, num_samples]."};
785-
}
786-
787-
int num_frames = 0;
788-
auto mel_data = nemo_mel::NemoComputeLogMelBatch(pcm.Data(), num_samples, cfg_, num_frames);
789-
790-
// Output [num_mels, num_frames] (no batch dim) — StackTensors adds the batch dim
791-
auto* out = logmel.Allocate({cfg_.num_mels, num_frames});
792-
std::memcpy(out, mel_data.data(), mel_data.size() * sizeof(float));
793-
return {};
794-
}
795-
796-
private:
797-
nemo_mel::NemoMelConfig cfg_{128, 512, 160, 400, 16000, 0.97f, 5.96046448e-08f};
798-
};
799669

800-
} // namespace ort_extensions
670+
} // namespace ort_extensions
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
#pragma once
5+
6+
#include "ocos.h"
7+
#include "nemo_mel_spectrogram.h"
8+
9+
#include <cmath>
10+
#include <cstring>
11+
#include <cstdint>
12+
13+
namespace ort_extensions {
14+
15+
// Per-feature (per-mel-bin) normalization: for each feature row,
16+
// compute mean and std across time, then normalize.
17+
// Input: [1, num_features, num_frames] (feature_first) or [1, num_frames, num_features]
18+
// Output: same shape, normalized.
19+
class PerFeatureNormalize {
20+
public:
21+
template <typename DictT>
22+
OrtxStatus Init(const DictT& attrs) {
23+
for (const auto& [key, value] : attrs) {
24+
if (key == "eps") {
25+
eps_ = static_cast<float>(std::get<double>(value));
26+
} else if (key == "feature_first") {
27+
feature_first_ = std::get<int64_t>(value);
28+
} else if (key != "_comment") {
29+
return {kOrtxErrorInvalidArgument, "[PerFeatureNormalize]: Invalid key in the JSON configuration."};
30+
}
31+
}
32+
return {};
33+
}
34+
35+
OrtxStatus Compute(const ortc::Tensor<float>& input, ortc::Tensor<float>& output) {
36+
const auto& shape = input.Shape();
37+
int64_t num_features, num_frames;
38+
39+
if (shape.size() == 2) {
40+
// 2D: [features, frames] or [frames, features]
41+
num_features = feature_first_ ? shape[0] : shape[1];
42+
num_frames = feature_first_ ? shape[1] : shape[0];
43+
} else if (shape.size() == 3 && shape[0] == 1) {
44+
// 3D: [1, features, frames] or [1, frames, features]
45+
num_features = feature_first_ ? shape[1] : shape[2];
46+
num_frames = feature_first_ ? shape[2] : shape[1];
47+
} else {
48+
return {kOrtxErrorInvalidArgument, "[PerFeatureNormalize]: Expected input shape [features, frames] or [1, features, frames]."};
49+
}
50+
51+
const float* in_data = input.Data();
52+
float* out_data = output.Allocate(shape);
53+
54+
// Copy input to output first
55+
std::memcpy(out_data, in_data, num_features * num_frames * sizeof(float));
56+
57+
// Need at least 2 frames for sample std (N-1 denominator)
58+
if (num_frames <= 1) {
59+
// Single frame or empty: output zeros (value - mean = 0 for constant)
60+
std::memset(out_data, 0, num_features * num_frames * sizeof(float));
61+
return {};
62+
}
63+
64+
for (int64_t f = 0; f < num_features; ++f) {
65+
// Compute mean
66+
float sum = 0.0f;
67+
for (int64_t t = 0; t < num_frames; ++t) {
68+
int64_t idx = feature_first_ ? (f * num_frames + t) : (t * num_features + f);
69+
sum += out_data[idx];
70+
}
71+
float mean = sum / static_cast<float>(num_frames);
72+
73+
// Compute std (sample std, divide by N-1)
74+
float var_sum = 0.0f;
75+
for (int64_t t = 0; t < num_frames; ++t) {
76+
int64_t idx = feature_first_ ? (f * num_frames + t) : (t * num_features + f);
77+
float d = out_data[idx] - mean;
78+
var_sum += d * d;
79+
}
80+
float std_val = std::sqrt(var_sum / static_cast<float>(num_frames - 1)) + eps_;
81+
82+
// Normalize
83+
for (int64_t t = 0; t < num_frames; ++t) {
84+
int64_t idx = feature_first_ ? (f * num_frames + t) : (t * num_features + f);
85+
out_data[idx] = (out_data[idx] - mean) / std_val;
86+
}
87+
}
88+
89+
return {};
90+
}
91+
92+
private:
93+
float eps_{1e-5f};
94+
int64_t feature_first_{1}; // 1 = [1, features, frames], 0 = [1, frames, features]
95+
};
96+
97+
// NeMo-compatible log-mel spectrogram kernel.
98+
// Wraps nemo_mel::NemoComputeLogMelBatch for use in the SpeechFeatureExtractor pipeline.
99+
// Input: [num_samples] or [1, num_samples] float32 PCM audio
100+
// Output: [num_mels, num_frames] float32 log-mel spectrogram per example;
101+
// StackTensors adds the batch dimension later in the pipeline.
102+
class NemoLogMel {
103+
public:
104+
template <typename DictT>
105+
OrtxStatus Init(const DictT& attrs) {
106+
for (const auto& [key, value] : attrs) {
107+
if (key == "num_mels") {
108+
cfg_.num_mels = static_cast<int>(std::get<int64_t>(value));
109+
} else if (key == "fft_size") {
110+
cfg_.fft_size = static_cast<int>(std::get<int64_t>(value));
111+
} else if (key == "hop_length") {
112+
cfg_.hop_length = static_cast<int>(std::get<int64_t>(value));
113+
} else if (key == "win_length") {
114+
cfg_.win_length = static_cast<int>(std::get<int64_t>(value));
115+
} else if (key == "sample_rate") {
116+
cfg_.sample_rate = static_cast<int>(std::get<int64_t>(value));
117+
} else if (key == "preemph") {
118+
cfg_.preemph = static_cast<float>(std::get<double>(value));
119+
} else if (key == "log_eps") {
120+
cfg_.log_eps = static_cast<float>(std::get<double>(value));
121+
} else if (key != "_comment") {
122+
return {kOrtxErrorInvalidArgument, "[NemoLogMel]: Invalid key in the JSON configuration."};
123+
}
124+
}
125+
return {};
126+
}
127+
128+
OrtxStatus Compute(const ortc::Tensor<float>& pcm, ortc::Tensor<float>& logmel) {
129+
const auto& shape = pcm.Shape();
130+
size_t num_samples;
131+
if (shape.size() == 1) {
132+
num_samples = static_cast<size_t>(shape[0]);
133+
} else if (shape.size() == 2 && shape[0] == 1) {
134+
num_samples = static_cast<size_t>(shape[1]);
135+
} else {
136+
return {kOrtxErrorInvalidArgument, "[NemoLogMel]: Expected input shape [num_samples] or [1, num_samples]."};
137+
}
138+
139+
int num_frames = 0;
140+
auto mel_data = nemo_mel::NemoComputeLogMelBatch(pcm.Data(), num_samples, cfg_, num_frames);
141+
142+
// Output [num_mels, num_frames] (no batch dim) — StackTensors adds the batch dim
143+
auto* out = logmel.Allocate({cfg_.num_mels, num_frames});
144+
std::memcpy(out, mel_data.data(), mel_data.size() * sizeof(float));
145+
return {};
146+
}
147+
148+
private:
149+
nemo_mel::NemoMelConfig cfg_{128, 512, 160, 400, 16000, 0.97f, 5.96046448e-08f};
150+
};
151+
152+
} // namespace ort_extensions

0 commit comments

Comments
 (0)