Merge pull request ReaLLMASIC#312 from xinyixuu/add_zh_snac
Updated scripts to extract zh snac tokens
gkielian authored Dec 5, 2024
2 parents 4e12fae + 990d7bf commit 721db7b
Showing 2 changed files with 240 additions and 0 deletions.
89 changes: 89 additions & 0 deletions data/snac/get_zh_snac.sh
@@ -0,0 +1,89 @@
#!/bin/bash
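
# Downloads the Common Voice 17.0 zh-CN transcripts and audio archives from
# Hugging Face, extracts the audio, and converts each split into a JSON file
# of SNAC tokens plus transcript text via snac_text_zh.py.
#
# Usage (run from data/snac after filling in HF_TOKEN below):
#   bash get_zh_snac.sh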

# Set strict error handling
set -euo pipefail
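# (-e: exit on any error, -u: treat unset variables as errors, -o pipefail: a pipeline fails if any command in it fails)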

# Install the Hugging Face Hub CLI
pip install -U "huggingface_hub[cli]"

# Authentication with Hugging Face
# Replace with your Hugging Face token.
# You can create a token at https://huggingface.co/settings/tokens
# (a token type of "Read" is sufficient).
HF_TOKEN=""

# Authenticate with Hugging Face
echo "Authenticating with Hugging Face..."
huggingface-cli login --token "${HF_TOKEN}"
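# (The wget calls below also pass the token explicitly via an Authorization header,
#  since the dataset is gated and needs authenticated access.)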

# Get current script directory
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"

url="https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0"
out_dir="transcription"

if [[ ! -d "${out_dir}" ]]; then
mkdir -p "${out_dir}"
fi

# Download the transcription files into the "transcription" directory.
pushd "${out_dir}"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "dev.tsv" "${url}/resolve/main/transcript/zh-CN/dev.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "other.tsv" "${url}/resolve/main/transcript/zh-CN/other.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "test.tsv" "${url}/resolve/main/transcript/zh-CN/test.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "train.tsv" "${url}/resolve/main/transcript/zh-CN/train.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "validated.tsv" "${url}/resolve/main/transcript/zh-CN/validated.tsv?download=true"

echo "transcripts downloaded and saved to transcription."
popd

audio_zip_dir="zh_tar_audio"
audio_dir="zh_audio"

if [[ ! -d "${audio_zip_dir}" ]]; then
mkdir -p "${audio_zip_dir}"
fi

# Download the audio archives into the "zh_tar_audio" directory.
pushd "${audio_zip_dir}"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "zh-CN_dev_0.tar" "${url}/resolve/main/audio/zh-CN/dev/zh-CN_dev_0.tar?download=true"
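# The "other" split spans multiple archives; this loop fetches zh-CN_other_0.tar through zh-CN_other_14.tar.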
for i in $(seq 0 14); do
    wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "zh-CN_other_${i}.tar" "${url}/resolve/main/audio/zh-CN/other/zh-CN_other_${i}.tar?download=true"
done
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "zh-CN_test_0.tar" "${url}/resolve/main/audio/zh-CN/test/zh-CN_test_0.tar?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "zh-CN_train_0.tar" "${url}/resolve/main/audio/zh-CN/train/zh-CN_train_0.tar?download=true"

# Create directory to store all the audio files
if [[ ! -d "${audio_dir}" ]]; then
mkdir -p "${audio_dir}"
fi

# Loop through each .tar file and extract it into the audio directory
for tarfile in *.tar; do
    # Check if the .tar file exists (handles the case where no .tar files are present)
    if [[ -f "$tarfile" ]]; then
        echo "Extracting $tarfile..."
        tar --strip-components=1 -xvf "$tarfile" -C "${audio_dir}" > /dev/null
    fi
done
popd

json_dir="json_outs"

if [[ ! -d "${json_dir}" ]]; then
mkdir -p "${json_dir}"
fi

# Run the conversion script to produce a combined SNAC-token and text JSON file for each split.
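# snac_text_zh.py expects three positional arguments:
#   <audio directory> <transcription .tsv> <output .json>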
for tsvfile in "$out_dir"/*.tsv; do
# Check if the .tsv file exists (handles the case where no .tsv files are present)
if [ -f "$tsvfile" ]; then
echo "Processing $tsvfile..."
# Get the filename without the extension for output filename
filename=$(basename "${tsvfile%.tsv}")
output_file="$json_dir/$filename.json"
python3 snac_text_zh.py "$audio_zip_dir/$audio_dir" "$tsvfile" "$output_file"
fi
done

echo "All .tsv files have been processed."
151 changes: 151 additions & 0 deletions data/snac/snac_text_zh.py
@@ -0,0 +1,151 @@
import argparse
import tempfile
import json
import csv
import sys
import os
from pydub import AudioSegment
from rich import print
from rich.progress import Progress, track
from dragonmapper import hanzi
from snac_converter import (
    SpeechTokenizer,
    preprocess_audio_to_24khz,
    load_mp3_as_tensor,
)
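
# Pipeline: read a Common Voice transcript TSV, locate each referenced mp3 clip,
# resample it to 24 kHz, encode it with SNAC, flatten the hierarchical codes into
# a single token sequence, and append the result (tokens plus text) to a JSON file.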


def save_audio_temp(audio_segment, format="mp3"):
"""Save the specific audio segment temporarily"""
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f".{format}")
audio_segment.export(temp_file.name, format=format)
return temp_file.name


def append_to_json_file(file_path, data):
"""Append data to a JSON file incrementally"""
if os.path.exists(file_path):
with open(file_path, "r+") as file:
existing_data = json.load(file)
existing_data.append(data)
file.seek(0)
json.dump(existing_data, file, indent=4)
else:
with open(file_path, "w") as file:
json.dump([data], file, indent=4)


def flatten_tensors(tensors):
"""Flatten the tensors using the specified pattern"""
    flattened = []
    separator_token = 4097
    i = 0

    while i < len(tensors[0][0]):
        if i < len(tensors[0][0]):
            flattened.append(tensors[0][0][i].item())
        if 2 * i + 1 < len(tensors[1][0]):
            flattened.extend(
                [tensors[1][0][2 * i].item(), tensors[1][0][2 * i + 1].item()]
            )
        if 4 * i + 3 < len(tensors[2][0]):
            flattened.extend(
                [
                    tensors[2][0][4 * i].item(),
                    tensors[2][0][4 * i + 1].item(),
                    tensors[2][0][4 * i + 2].item(),
                    tensors[2][0][4 * i + 3].item(),
                ]
            )
        flattened.append(separator_token)
        i += 1

    return flattened


parser = argparse.ArgumentParser(description="Encode audio into SNAC tokens paired with transcription text")
parser.add_argument("input", help="Directory containing the extracted audio clips")
parser.add_argument("transcription", help="Path to the Common Voice transcription .tsv file")
parser.add_argument("output", help="Output file path for the combined JSON")

args = parser.parse_args()

snac_model = SpeechTokenizer("cuda")
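# Note: the tokenizer is constructed on "cuda", so a CUDA-capable GPU is required as written.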

data = []

with open(args.transcription, "r") as file:
    reader = csv.reader(file, delimiter='\t')

    # Skip the TSV header row.
    next(reader)
    for row in reader:
        # Common Voice transcript TSVs store the clip filename in column 1
        # and the sentence text in column 3.
        out = {
            "text": row[3],
            "path": row[1]
        }
        data.append(out)

# Initialize the temp-file paths so the cleanup in the finally block below
# cannot raise a NameError before the first file is created.
temp_path = ""
temp_wav_path = ""

with Progress() as progress:
    task = progress.add_task(
        "[cyan]Processing transcription entries...", total=len(data)
    )

    for entry in data:
        # Encode the audio segment into SNAC tokens and save the results
        try:
            filename = str(entry['path'])
            file_path = os.path.join(args.input, filename)
            text = entry["text"]
            audio_section = AudioSegment.from_mp3(file_path)
            temp_path = save_audio_temp(audio_section)

            # Resample to 24 kHz before SNAC encoding (see preprocess_audio_to_24khz)
            temp_wav_path = "temp.wav"
            preprocess_audio_to_24khz(temp_path, temp_wav_path)

            # Load and process the audio segment
            audio_snac = load_mp3_as_tensor(temp_wav_path)
            audio_snac = audio_snac.to(snac_model.device)
            codes = snac_model.encode(audio_snac)
            code_list = [c.tolist() for c in codes]

            # Flatten the tensors using the specified pattern
            sequential_snac_tokens = flatten_tensors(codes)

            # Print token and text lengths
            snac_token_len = len(sequential_snac_tokens)
            text_len = len(text)

            print(f"SNAC token length [bold]{snac_token_len}[/bold]")
            print(f"Text char length [bold]{text_len}[/bold]")
            print(f"Text [bold]{text}[/bold]")

            # Collect results
            result = {
                "snac_tokens": code_list,
                "sequential_snac_tokens": sequential_snac_tokens,
                "snac_token_len": snac_token_len,
                "text": text,
                "text_len": text_len
            }

            # Append the result to the output JSON file
            append_to_json_file(args.output, result)

        except Exception as e:
            print(
                f"[red]Error processing audio {entry['path']}:[/red] {e}"
            )
        finally:
            # Ensure temporary files are deleted
            if os.path.exists(temp_path):
                os.remove(temp_path)
            if os.path.exists(temp_wav_path):
                os.remove(temp_wav_path)

        progress.update(task, advance=1)

print(f"[blue]Results saved to {args.output}[/blue]")
