|
2 | 2 | import os |
3 | 3 | import time |
4 | 4 | import subprocess |
5 | | -from collections import Counter |
6 | | - |
7 | | -import requests |
8 | | - |
9 | 5 | from noisereduce import reduce_noise |
10 | 6 | from scipy.io import wavfile |
11 | 7 | from telegram import Update |
|
20 | 16 | from utilities.wrapper import send_text |
21 | 17 | from databases.db import push_user_survey_progress, init_user, get_user_audio |
22 | 18 |
|
23 | | -from env_config import (DEBUG_MODE, |
24 | | - DEBUG_ON, DEBUG_OFF, TOKEN) |
| 19 | +from env_config import (DEBUG_MODE, DEBUG_ON) |
25 | 20 | from kafka.kafka_producer import produce_message |
26 | 21 |
|
27 | 22 |
|
28 | | -def split_audio(wav_filename, unique_file_id, min_chunk_length=30000, max_chunk_length=40000, silence_thresh=-40, min_silence_len=500): |
29 | | - audio = AudioSegment.from_wav(wav_filename) |
30 | | - chunk_dir_name = os.path.join('emotion_recognition', 'input_files') |
31 | | - if not os.path.exists(chunk_dir_name): |
32 | | - os.makedirs(chunk_dir_name) |
33 | | - chunk_filenames = [] |
34 | | - chunk_start_times = [] |
35 | | - |
36 | | - if len(audio) <= min_chunk_length: |
37 | | - chunk_filename = os.path.join(chunk_dir_name, unique_file_id + "_chunk_0.wav") |
38 | | - audio.export(chunk_filename, format="wav") |
39 | | - return [chunk_filename], [0] |
40 | | - |
def get_silence_points(audio, min_silence_len, silence_thresh):
    """Return the midpoint of every silent range detected in *audio*.

    Thin wrapper around pydub's ``detect_silence``; each returned value is
    the float midpoint ``(start + end) / 2`` of one silent range, so callers
    must truncate/round before slicing the audio segment.
    """
    ranges = detect_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
    )
    midpoints = []
    for range_start, range_end in ranges:
        midpoints.append((range_start + range_end) / 2)
    return midpoints
43 | 26 |
|
| 27 | + |
def generate_chunks(audio, silence_points, min_len, max_len):
    """Cut *audio* into chunks bounded by silence points.

    Chunks preferably end at a silence midpoint; spans longer than
    ``max_len`` are carved into ``max_len``-sized pieces. Spans shorter
    than ``min_len`` are merged into the following chunk.

    Bug fixed vs. the previous version: ``max_len`` was only enforced once
    per silence point and never on the trailing segment, so chunks longer
    than ``max_len`` could slip through. Lengths are presumably
    milliseconds (pydub's time unit) — units only matter to callers.

    Args:
        audio: sliceable sequence (e.g. a pydub AudioSegment).
        silence_points: ascending positions (may be floats) to cut at.
        min_len: minimal desired chunk length.
        max_len: maximal allowed chunk length (now strictly enforced).

    Returns:
        (chunks, starts): the chunk slices and each chunk's start offset.
    """
    chunks = []
    starts = []
    start = 0

    for silence in silence_points:
        silence = int(silence)
        # Carve off max_len-sized pieces while the span up to the next
        # silence point is still too long.
        while silence - start > max_len:
            split_point = start + max_len
            chunks.append(audio[start:split_point])
            starts.append(start)
            start = split_point
        if min_len <= silence - start:
            chunks.append(audio[start:silence])
            starts.append(start)
            start = silence

    # Enforce max_len on the trailing segment as well.
    while len(audio) - start > max_len:
        split_point = start + max_len
        chunks.append(audio[start:split_point])
        starts.append(start)
        start = split_point
    if start < len(audio):
        chunks.append(audio[start:])
        starts.append(start)

    return chunks, starts
| 50 | + |
62 | 51 |
|
def export_chunks(chunks, base_dir, file_id):
    """Persist every chunk as a wav file and return the created paths.

    Files are named ``<file_id>_chunk_<index>.wav`` inside *base_dir*,
    indexed in chunk order.
    """
    exported = []
    for index, segment in enumerate(chunks):
        target = os.path.join(base_dir, f"{file_id}_chunk_{index}.wav")
        segment.export(target, format="wav")
        exported.append(target)
    return exported
67 | 59 |
|
68 | | - return chunk_filenames, chunk_start_times |
| 60 | + |
def split_audio(wav_filename, unique_file_id, min_chunk_length=30000, max_chunk_length=40000, silence_thresh=-40,
                min_silence_len=500):
    """Split a wav file into chunks at silence points and export them.

    Args:
        wav_filename: path of the source wav file.
        unique_file_id: prefix used for the exported chunk filenames.
        min_chunk_length: minimal desired chunk length (presumably ms,
            pydub's time unit — defaults suggest ~30 s).
        max_chunk_length: maximal desired chunk length.
        silence_thresh: silence threshold passed to silence detection.
        min_silence_len: minimal silence duration that counts as a cut point.

    Returns:
        (filenames, start_times): exported chunk paths and each chunk's
        start offset within the original recording.
    """
    audio = AudioSegment.from_wav(wav_filename)
    chunk_dir = os.path.join('emotion_recognition', 'input_files')
    # exist_ok avoids the check-then-create race of the previous
    # os.path.exists() / os.makedirs() pair.
    os.makedirs(chunk_dir, exist_ok=True)

    # Short recordings become a single chunk starting at offset 0.
    if len(audio) <= min_chunk_length:
        filename = os.path.join(chunk_dir, f"{unique_file_id}_chunk_0.wav")
        audio.export(filename, format="wav")
        return [filename], [0]

    silence_points = get_silence_points(audio, min_silence_len, silence_thresh)
    chunks, start_times = generate_chunks(audio, silence_points, min_chunk_length, max_chunk_length)
    filenames = export_chunks(chunks, chunk_dir, unique_file_id)

    return filenames, start_times
69 | 78 |
|
70 | 79 |
|
71 | 80 | def download_voice(update: Update): |
@@ -121,46 +130,63 @@ def work_with_audio(update: Update, context: CallbackContext): |
121 | 130 | produce_message('stt', json.dumps(message)) |
122 | 131 |
|
123 | 132 |
|
124 | | -def audio_to_text(filename, ogg_filename, chunk_filenames, chunk_start_times, update_id, user): |
125 | | - processing_start_time = time.time() |
126 | | - input_sentence, stats_sentence = "", "" |
127 | | - emotions = Counter() |
128 | | - audio_emotions_statistics = [] |
def process_chunk(chunk_filename, start_time):
    """Transcribe one audio chunk and pair it with its detected emotion.

    Sends the chunk to the speech-recognition service and, on success,
    associates the recognized text with an emotion/word pair.

    Returns:
        dict with keys ``text``, ``stats``, ``emotion``, ``word``,
        ``start_time`` and ``filename``, or ``None`` when the service does
        not answer with HTTP 200.
    """
    response = get_att_whisper(chunk_filename)

    if response.status_code == 200:
        sentence = RecognizedSentence(response.json())
        base_name = os.path.basename(chunk_filename)
        word, emotion = associate_words_with_emotions(base_name, sentence.get_text())
        return {
            "text": sentence.get_text(),
            "stats": sentence.generate_stats(),
            "emotion": emotion,
            "word": word,
            "start_time": start_time,
            "filename": chunk_filename,
        }

    return None
|
143 | | - audio_emotions_statistics.append({"filename": chunk_filename, "emotion": emotion, "word": word, "text": text, "start_time": start_time}) |
def audio_to_text(filename, ogg_filename, chunk_filenames, chunk_start_times, update_id, user):
    """Transcribe all chunks of a voice message and persist the results.

    Processes the chunks in order, accumulating recognized text, per-chunk
    stats and emotion records, then stores everything via
    ``push_user_survey_progress`` and removes the temporary ogg file.
    Aborts silently (without cleanup) if any chunk fails recognition —
    mirroring the previous behavior.

    Args:
        filename: wav filename (unused here; kept for interface compatibility).
        ogg_filename: original ogg file; uploaded with the answer, then deleted.
        chunk_filenames: wav chunk paths produced by split_audio().
        chunk_start_times: start offset of each chunk within the recording.
        update_id: telegram update identifier for the survey record.
        user: telegram user the message belongs to.
    """
    processing_started = time.time()
    full_text = []
    stats_blocks = []
    emotion_stats = []

    for chunk_filename, chunk_start in zip(chunk_filenames, chunk_start_times):
        result = process_chunk(chunk_filename, chunk_start)
        if not result:
            # Recognition failed for this chunk; abort without persisting.
            return

        full_text.append(result["text"])
        stats_blocks.append(result["stats"])
        emotion_stats.append({
            "filename": result["filename"],
            "emotion": result["emotion"],
            "word": result["word"],
            "text": result["text"],
            "start_time": result["start_time"]
        })

    if DEBUG_MODE == DEBUG_ON:
        elapsed = time.time() - processing_started
        send_text(user.id, f"Processing time: {elapsed:.2f} seconds")

    # Close the upload handle deterministically instead of leaking it (the
    # old code relied on GC). Assumes push_user_survey_progress consumes the
    # handle during the call — TODO confirm it does not store it for later.
    with open(ogg_filename, 'rb') as audio_file:
        push_user_survey_progress(
            user=user,
            focus=init_user(user).get_last_focus(),
            id_=update_id,
            user_answer="".join(full_text),
            stats="\n".join(stats_blocks),
            audio_file=audio_file,
            audio_emotions_statistics=emotion_stats
        )

    os.remove(ogg_filename)

    if DEBUG_MODE == DEBUG_ON:
        print(get_user_audio(user))
        send_text(user.id, "ID записи с твоим аудиосообщением в базе данных: " +
                  str(json.loads(json_util.dumps(get_user_audio(user)))))
0 commit comments