-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.c
280 lines (226 loc) · 10.1 KB
/
main.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include <sndfile.h>
#include <rnnoise.h>
// Sample rate (in Hz) that an input file must have: main() compares the
// opened file's samplerate against this value and refuses anything else.
#define RNNOISE_SAMPLE_RATE 48000
// Long-option table handed to getopt_long(). Every entry takes a mandatory
// argument and maps onto the short option letter handled in main(); the
// all-zero entry terminates the table.
static struct option long_options[] = {
    { .name = "model",   .has_arg = required_argument, .flag = NULL, .val = 'm' },
    { .name = "amplify", .has_arg = required_argument, .flag = NULL, .val = 'a' },
    { .name = "prefeed", .has_arg = required_argument, .flag = NULL, .val = 'p' },
    { .name = NULL,      .has_arg = 0,                 .flag = NULL, .val = 0   }
};
// Everything main() keeps per audio channel while denoising.
struct channel_state {
    DenoiseState *ds; // denoiser instance obtained from rnnoise_create()
    float *input;     // one frame of samples passed to rnnoise_process_frame()
    float *output;    // one frame of samples written by rnnoise_process_frame()
};
/*
 * Print the command-line usage summary to stdout.
 *
 * Takes no arguments and returns nothing. Declared with an explicit (void)
 * parameter list so that the compiler rejects calls that pass arguments.
 * The text contains no conversions, so it is written with fputs() rather
 * than being interpreted as a printf() format string.
 */
void print_help(void) {
    fputs(
        "Usage: denoiseit [OPTIONS...] INPUT OUTPUT\n"
        "\n"
        "Denoise the INPUT audio file with RNNoise and save the result to OUTPUT.\n"
        "OUTPUT must have the same extension/file format as INPUT.\n"
        "\n"
        "Options:\n"
        " -m, --model Path to the custom RNNoise model\n"
        " -a, --amplify Amplify the input before denoising (default: 1.0)\n"
        " -p, --prefeed Number of seconds to read ahead to teach RNNoise\n"
        " what noise to remove (useful for noisy starts)\n"
        "\n"
        "More info: https://github.com/DragoonAethis/DenoiseIt\n",
        stdout);
}
int main(int argc, char ** argv) {
int option_index, option;
float amplify_factor = 1.0f;
float prefeed_seconds = 0.0f;
RNNModel * rnnoise_model = NULL;
do {
option = getopt_long(argc, argv, "m:a:p:", long_options, &option_index);
if (option == -1) break;
switch (option) {
case 'm': {
printf("Trying to use RNNoise model: %s\n", optarg);
FILE * model_file = fopen(optarg, "r");
if (model_file == NULL) {
printf("Could not read the provided RNNoise model: %s\n", strerror(errno));
return -1;
}
rnnoise_model = rnnoise_model_from_file(model_file);
if (rnnoise_model == NULL) {
printf("RNNoise could not load the provided file as a valid model.\n");
return -1;
}
fclose(model_file);
} break;
case 'a': {
amplify_factor = strtof(optarg, NULL);
if (errno != 0) {
printf("Provided amplification factor is not a valid floating point value.\n");
return -1;
}
printf("Using amplification factor: %f\n", amplify_factor);
} break;
case 'p': {
prefeed_seconds = strtof(optarg, NULL);
if (errno != 0) {
printf("Provided prefeed seconds value is not a valid floating point value.\n");
return -1;
}
printf("Prefeeding RNNoise with %f seconds of audio\n", amplify_factor);
} break;
default: {
// getopt_long prints an error already, so...
print_help();
return -1;
}
}
} while (1);
int remaining_options = argc - optind;
if (remaining_options != 2) {
if (remaining_options < 2) {
printf("Error: Not enough arguments given.\n");
} else {
printf("Error: Too many arguments given.\n");
}
print_help();
return -1;
}
const char * input_path = argv[optind];
const char * output_path = argv[optind + 1];
// RNNoise can only process single-channel frames with this many samples:
int rnnoise_frame_size = rnnoise_get_frame_size();
SF_INFO input_info = {0};
SNDFILE * input_file = sf_open(input_path, SFM_READ, &input_info);
if (input_file == NULL) {
printf("Could not open the input file: %s\n", sf_strerror(input_file));
return -1;
}
if (input_info.seekable == 0) {
printf("Input file is not seekable and cannot be processed.\n");
return -1;
}
if (input_info.samplerate != RNNOISE_SAMPLE_RATE) {
printf("Input file sample rate is %dHz, RNNoise can process only %dHz.\n",
input_info.samplerate, RNNOISE_SAMPLE_RATE);
printf("(Try using ffmpeg or sox to convert it to WAV first.)\n");
return -1;
}
if (!sf_format_check(&input_info)) {
printf("Provided input file format cannot be used for writing the output file.\n");
printf("(Try using ffmpeg or sox to convert it to WAV first.)\n");
return -1;
}
sf_count_t input_frames = sf_seek(input_file, 0, SEEK_END);
if (input_frames < rnnoise_frame_size) {
printf("Input file is too short to be processed with RNNoise.\n");
return -1;
}
// Rewind the input to the start:
sf_seek(input_file, 0, SEEK_SET);
SF_INFO output_info = {0};
memcpy(&output_info, &input_info, sizeof(SF_INFO));
SNDFILE * output_file = sf_open(output_path, SFM_WRITE, &output_info);
if (output_file == NULL) {
printf("Could not open the output file: %s\n", sf_strerror(output_file));
return -1;
}
// Create the per-channel denoising state:
int channels = input_info.channels;
struct channel_state state[channels];
for (int ch = 0; ch < channels; ch++) {
state[ch].ds = rnnoise_create(rnnoise_model);
state[ch].input = calloc(sizeof(float), rnnoise_frame_size);
state[ch].output = calloc(sizeof(float), rnnoise_frame_size);
}
// Create the multi-channel frame:
float * full_frame = calloc(sizeof(float), rnnoise_frame_size * channels);
int prefeed_stages_remaining = 0;
if (prefeed_seconds > 0.0f) prefeed_stages_remaining = 2;
sf_count_t prefeed_frames_left = (sf_count_t)(prefeed_seconds * (float)input_info.samplerate);
printf("Needs %ld frames for prefeed\n", prefeed_frames_left);
if (prefeed_frames_left > input_frames) {
printf("Prefeed exceeds file length - capping to %ld frames\n", input_frames);
prefeed_frames_left = input_frames;
}
sf_count_t current_frame = 0;
while (current_frame < input_frames) {
sf_count_t write_from = 0;
sf_count_t write_frames = rnnoise_frame_size;
if ((current_frame + rnnoise_frame_size) > input_frames) {
// Because we're not able to consume the whole frame for RNNoise,
// sometimes we need to "borrow" some audio from the previous
// frame and write only the missing trailer.
write_frames = input_frames - current_frame;
// For RNNoise frame size = 480, if we have 300 frames to
// write, we need to start writing to the output from the
// 180th sndfile frame. This is the full_frame offset:
write_from = (rnnoise_frame_size - write_frames) * channels;
// And seek to the (end - frame size) for the actual "borrow":
sf_seek(input_file, -rnnoise_frame_size, SEEK_END);
}
sf_readf_float(input_file, full_frame, rnnoise_frame_size);
// RNNoise can only operate on a single channel at any given time.
// Split the read audio into separate channel buffers and process
// each buffer accordingly:
for (int ch = 0; ch < channels; ch++) {
for (int sample = 0; sample < rnnoise_frame_size; sample++) {
// RNNoise needs really high values for its inputs...
// Amplify the input here - don't divide by preamp later.
state[ch].input[sample] = full_frame[ch + (sample * channels)] * 32768.0f * amplify_factor;
}
rnnoise_process_frame(state[ch].ds, state[ch].output, state[ch].input);
bzero(state[ch].input, rnnoise_frame_size * sizeof(float));
if (!prefeed_stages_remaining) {
// We still need the original audio, don't denoise the prefed period twice!
for (int sample = 0; sample < rnnoise_frame_size; sample++) {
full_frame[ch + (sample * channels)] = state[ch].output[sample] / 32768.0f;
}
}
}
if (!prefeed_stages_remaining) {
// At this point, full_frame contains denoised audio (hopefully).
// Write it to the output and move our frame pointer forward.
if (sf_writef_float(output_file, full_frame+write_from, write_frames) != write_frames) {
// Out of disk space? Something else?
printf("Failed to write enough frames to the output file!\n");
return -1;
}
} else {
prefeed_frames_left -= write_frames;
if (prefeed_frames_left <= 0) {
// Move to the next prefeed/processing stage, rewind the input file.
// The reason we're doing 2 prefeed stages is that you usually want
// prefeed due to the audio file starting with noise to be removed.
// If we prefeed a few seconds of audio to the denoiser, it'll be
// able to detect that noise, but when the denoiser input changes
// from someone speaking to noise in an instant, it'll output a
// short "pop". The 2nd prefeed stage feeds it some raw noise, so
// that this "pop" is not present in the output. It's still not
// completely perfect, but masks the issue well enough.
prefeed_stages_remaining -= 1;
prefeed_frames_left = rnnoise_frame_size;
current_frame = 0;
sf_seek(input_file, 0, SEEK_SET);
printf("Prefeed stage completed (%d remaining)\n", prefeed_stages_remaining);
continue; // Don't bump current frames below at this point.
}
}
current_frame += write_frames;
}
// And we're done - audio denoised, output written. Close everything cleanly.
for (int ch = 0; ch < channels; ch++) {
rnnoise_destroy(state[ch].ds);
free(state[ch].output);
free(state[ch].input);
}
free(full_frame);
sf_close(output_file);
sf_close(input_file);
if (rnnoise_model != NULL) {
rnnoise_model_free(rnnoise_model);
}
printf("Done, processed %ld frames.\n", current_frame);
return 0;
}