forked from ReaLLMASIC/nanoGPT
Merge pull request ReaLLMASIC#312 from xinyixuu/add_zh_snac
Updated scripts to extract zh snac tokens
Showing 2 changed files with 240 additions and 0 deletions.
@@ -0,0 +1,89 @@
#!/bin/bash

# Set strict error handling
set -euo pipefail

# Install Python dependencies for the Hugging Face CLI
pip install -U "huggingface_hub[cli]"

# Authentication with Hugging Face
# Replace with your own Hugging Face token
##### You can find and create your own tokens here: https://huggingface.co/settings/tokens ######
##### "Token Type" of "Read" is recommended. ########
HF_TOKEN=""

# Authenticate with Hugging Face
echo "Authenticating with Hugging Face..."
huggingface-cli login --token "${HF_TOKEN}"

# Get current script directory
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"

url="https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0"
out_dir="transcription"

if [[ ! -d "${out_dir}" ]]; then
  mkdir -p "${out_dir}"
fi

# Download transcription files into the "transcription" directory.
pushd "${out_dir}"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "dev.tsv" "${url}/resolve/main/transcript/zh-CN/dev.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "other.tsv" "${url}/resolve/main/transcript/zh-CN/other.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "test.tsv" "${url}/resolve/main/transcript/zh-CN/test.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "train.tsv" "${url}/resolve/main/transcript/zh-CN/train.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "validated.tsv" "${url}/resolve/main/transcript/zh-CN/validated.tsv?download=true"

echo "Transcripts downloaded and saved to ${out_dir}."
popd

audio_zip_dir="zh_tar_audio"
audio_dir="zh_audio"

if [[ ! -d "${audio_zip_dir}" ]]; then
  mkdir -p "${audio_zip_dir}"
fi

# Download the audio .tar archives into the "zh_tar_audio" directory.
pushd "${audio_zip_dir}"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "zh-CN_dev_0.tar" "${url}/resolve/main/audio/zh-CN/dev/zh-CN_dev_0.tar?download=true"
for i in $(seq 0 14); do
  wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "zh-CN_other_${i}.tar" "${url}/resolve/main/audio/zh-CN/other/zh-CN_other_${i}.tar?download=true"
done
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "zh-CN_test_0.tar" "${url}/resolve/main/audio/zh-CN/test/zh-CN_test_0.tar?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "zh-CN_train_0.tar" "${url}/resolve/main/audio/zh-CN/train/zh-CN_train_0.tar?download=true"

# Create a directory to store all the extracted audio files
if [[ ! -d "${audio_dir}" ]]; then
  mkdir -p "${audio_dir}"
fi

# Loop through each .tar file and extract it into "${audio_dir}"
for tarfile in *.tar; do
  # Check if the .tar file exists (handles the case where no .tar files are present)
  if [ -f "$tarfile" ]; then
    echo "Extracting $tarfile..."
    tar --strip-components=1 -xvf "$tarfile" -C "${audio_dir}" > /dev/null
  fi
done
popd

json_dir="json_outs"

if [[ ! -d "${json_dir}" ]]; then
  mkdir -p "${json_dir}"
fi

# Run the converter to produce a combined SNAC-token/text JSON file for each split
for tsvfile in "$out_dir"/*.tsv; do
  # Check if the .tsv file exists (handles the case where no .tsv files are present)
  if [ -f "$tsvfile" ]; then
    echo "Processing $tsvfile..."
    # Get the filename without the extension for the output filename
    filename=$(basename "${tsvfile%.tsv}")
    output_file="$json_dir/$filename.json"
    python3 snac_text_zh.py "$audio_zip_dir/$audio_dir" "$tsvfile" "$output_file"
  fi
done

echo "All .tsv files have been processed."
@@ -0,0 +1,151 @@
import argparse
import tempfile
import json
import csv
import sys
import os
from pydub import AudioSegment
from rich import print
from rich.progress import Progress, track
from dragonmapper import hanzi
from snac_converter import (
    SpeechTokenizer,
    preprocess_audio_to_24khz,
    load_mp3_as_tensor,
)


def save_audio_temp(audio_segment, format="mp3"):
    """Save the given audio segment to a temporary file and return its path"""
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f".{format}")
    audio_segment.export(temp_file.name, format=format)
    return temp_file.name


def append_to_json_file(file_path, data):
    """Append data to a JSON file incrementally"""
    if os.path.exists(file_path):
        with open(file_path, "r+") as file:
            existing_data = json.load(file)
            existing_data.append(data)
            file.seek(0)
            json.dump(existing_data, file, indent=4)
    else:
        with open(file_path, "w") as file:
            json.dump([data], file, indent=4)


def flatten_tensors(tensors):
    """Flatten the three SNAC code levels into a single token sequence"""
    flattened = []
    separator_token = 4097
    i = 0

    while i < len(tensors[0][0]):
        if i < len(tensors[0][0]):
            flattened.append(tensors[0][0][i].item())
        if 2 * i + 1 < len(tensors[1][0]):
            flattened.extend(
                [tensors[1][0][2 * i].item(), tensors[1][0][2 * i + 1].item()]
            )
        if 4 * i + 3 < len(tensors[2][0]):
            flattened.extend(
                [
                    tensors[2][0][4 * i].item(),
                    tensors[2][0][4 * i + 1].item(),
                    tensors[2][0][4 * i + 2].item(),
                    tensors[2][0][4 * i + 3].item(),
                ]
            )
        flattened.append(separator_token)
        i += 1

    return flattened
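
# Layout note: writing c0, c1, c2 for tensors[0][0], tensors[1][0], tensors[2][0]
# (the three code levels produced by snac_model.encode() below), frame i of the
# flattened stream is laid out as
#   c0[i], c1[2*i], c1[2*i + 1], c2[4*i], c2[4*i + 1], c2[4*i + 2], c2[4*i + 3], 4097
# i.e. one coarse code, two mid codes, four fine codes, then the separator token.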


parser = argparse.ArgumentParser(description="Encode and decode audio using SNAC")
parser.add_argument("input", help="Input file path or directory (for encode)")
parser.add_argument("transcription", help="Input file path or directory for transcription outputs")
parser.add_argument("output", help="Output file path for the new JSON")

args = parser.parse_args()

snac_model = SpeechTokenizer("cuda")

data = []

with open(args.transcription, "r") as file:
    reader = csv.reader(file, delimiter='\t')

    next(reader)  # skip the TSV header row
    for row in reader:
        out = {
            "text": row[3],
            "path": row[1]
        }
        data.append(out)

# Initialize the temporary file paths so the cleanup in the `finally` block below
# does not raise a NameError if an exception occurs before they are assigned.
temp_path = " "
temp_wav_path = " "

with Progress() as progress:
    task = progress.add_task(
        "[cyan]Processing transcription entries...", total=len(data)
    )

    for entry in data:
        # Encode the audio segment into SNAC tokens and save the results
        try:
            filename = str(entry['path'])
            file_path = os.path.join(args.input, filename)
            text = entry["text"]
            audio_section = AudioSegment.from_mp3(file_path)
            temp_path = save_audio_temp(audio_section)

            temp_wav_path = "temp.wav"
            preprocess_audio_to_24khz(temp_path, temp_wav_path)

            # Load and process the audio segment
            audio_snac = load_mp3_as_tensor(temp_wav_path)
            audio_snac = audio_snac.to(snac_model.device)
            codes = snac_model.encode(audio_snac)
            code_list = [c.tolist() for c in codes]

            # Flatten the tensors using the specified pattern
            sequential_snac_tokens = flatten_tensors(codes)

            # Print token lengths
            snac_token_len = len(sequential_snac_tokens)
            text_len = len(text)

            print(f"Snac token Length [bold]{snac_token_len}[/bold]")
            print(f"Text char Length [bold]{text_len}[/bold]")
            print(f"Text [bold]{text}[/bold]")

            # Collect results
            result = {
                "snac_tokens": code_list,
                "sequential_snac_tokens": sequential_snac_tokens,
                "snac_token_len": snac_token_len,
                "text": text,
                "text_len": text_len
            }

            # Append result to JSON file
            append_to_json_file(args.output, result)

        except Exception as e:
            print(
                f"[red]Error processing audio {entry['path']}:[/red] {e}"
            )
        finally:
            # Ensure temporary files are deleted
            if os.path.exists(temp_path):
                os.remove(temp_path)
            if os.path.exists(temp_wav_path):
                os.remove(temp_wav_path)

        progress.update(task, advance=1)

print(f"[blue]Results saved to {args.output}[/blue]")