forked from ReaLLMASIC/nanoGPT
Merge pull request ReaLLMASIC#312 from xinyixuu/add_zh_snac
Updated scripts to extract zh snac tokens
Showing 2 changed files with 240 additions and 0 deletions.
@@ -0,0 +1,89 @@
#!/bin/bash

# Set strict error handling
set -euo pipefail

# Install Python dependencies for the Hugging Face CLI
pip install -U "huggingface_hub[cli]"

# Authentication with Hugging Face
# Replace with your own Hugging Face token
##### You can find and create your own tokens here: https://huggingface.co/settings/tokens ######
##### "Token Type" of "Read" is recommended. ########
HF_TOKEN=""

# Authenticate with Hugging Face
echo "Authenticating with Hugging Face..."
huggingface-cli login --token "${HF_TOKEN}"

# Get current script directory
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"

url="https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0"
out_dir="transcription"

if [[ ! -d "${out_dir}" ]]; then
  mkdir -p "${out_dir}"
fi

# Download transcription files into the "transcription" directory.
pushd "${out_dir}"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "dev.tsv" "${url}/resolve/main/transcript/zh-CN/dev.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "other.tsv" "${url}/resolve/main/transcript/zh-CN/other.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "test.tsv" "${url}/resolve/main/transcript/zh-CN/test.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "train.tsv" "${url}/resolve/main/transcript/zh-CN/train.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "validated.tsv" "${url}/resolve/main/transcript/zh-CN/validated.tsv?download=true"

echo "Transcripts downloaded and saved to ${out_dir}."
popd

audio_zip_dir="zh_tar_audio"
audio_dir="zh_audio"

if [[ ! -d "${audio_zip_dir}" ]]; then
  mkdir -p "${audio_zip_dir}"
fi

# Download the audio .tar archives into the "zh_tar_audio" directory.
pushd "${audio_zip_dir}"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "zh-CN_dev_0.tar" "${url}/resolve/main/audio/zh-CN/dev/zh-CN_dev_0.tar?download=true"
for i in $(seq 0 14); do
  wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "zh-CN_other_${i}.tar" "${url}/resolve/main/audio/zh-CN/other/zh-CN_other_${i}.tar?download=true"
done
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "zh-CN_test_0.tar" "${url}/resolve/main/audio/zh-CN/test/zh-CN_test_0.tar?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "zh-CN_train_0.tar" "${url}/resolve/main/audio/zh-CN/train/zh-CN_train_0.tar?download=true"

# Create a directory to store all the extracted audio files
if [[ ! -d "${audio_dir}" ]]; then
  mkdir -p "${audio_dir}"
fi

# Loop through each .tar file and extract it into "${audio_dir}"
for tarfile in *.tar; do
  # Check if the .tar file exists (handles the case where no .tar files are present)
  if [ -f "$tarfile" ]; then
    echo "Extracting $tarfile..."
    tar --strip-components=1 -xvf "$tarfile" -C "${audio_dir}" > /dev/null
  fi
done
popd

json_dir="json_outs"

if [[ ! -d "${json_dir}" ]]; then
  mkdir -p "${json_dir}"
fi

# Run the converter to produce a combined SNAC-token/text JSON file for each split
for tsvfile in "$out_dir"/*.tsv; do
  # Check if the .tsv file exists (handles the case where no .tsv files are present)
  if [ -f "$tsvfile" ]; then
    echo "Processing $tsvfile..."
    # Get the filename without the extension for the output filename
    filename=$(basename "${tsvfile%.tsv}")
    output_file="$json_dir/$filename.json"
    python3 snac_text_zh.py "$audio_zip_dir/$audio_dir" "$tsvfile" "$output_file"
  fi
done

echo "All .tsv files have been processed."
@@ -0,0 +1,151 @@
import argparse
import tempfile
import json
import csv
import sys
import os
from pydub import AudioSegment
from rich import print
from rich.progress import Progress, track
from dragonmapper import hanzi
from snac_converter import (
    SpeechTokenizer,
    preprocess_audio_to_24khz,
    load_mp3_as_tensor,
)


def save_audio_temp(audio_segment, format="mp3"):
    """Save the given audio segment to a temporary file and return its path"""
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f".{format}")
    audio_segment.export(temp_file.name, format=format)
    return temp_file.name


def append_to_json_file(file_path, data):
    """Append data to a JSON file incrementally"""
    if os.path.exists(file_path):
        with open(file_path, "r+") as file:
            existing_data = json.load(file)
            existing_data.append(data)
            file.seek(0)
            json.dump(existing_data, file, indent=4)
    else:
        with open(file_path, "w") as file:
            json.dump([data], file, indent=4)


def flatten_tensors(tensors):
    """Flatten the three SNAC code levels into a single token sequence"""
    flattened = []
    separator_token = 4097
    i = 0

    while i < len(tensors[0][0]):
        if i < len(tensors[0][0]):
            flattened.append(tensors[0][0][i].item())
        if 2 * i + 1 < len(tensors[1][0]):
            flattened.extend(
                [tensors[1][0][2 * i].item(), tensors[1][0][2 * i + 1].item()]
            )
        if 4 * i + 3 < len(tensors[2][0]):
            flattened.extend(
                [
                    tensors[2][0][4 * i].item(),
                    tensors[2][0][4 * i + 1].item(),
                    tensors[2][0][4 * i + 2].item(),
                    tensors[2][0][4 * i + 3].item(),
                ]
            )
        flattened.append(separator_token)
        i += 1

    return flattened
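
# Layout note: writing c0, c1, c2 for tensors[0][0], tensors[1][0], tensors[2][0]
# (the three code levels produced by snac_model.encode() below), frame i of the
# flattened stream is laid out as
#   c0[i], c1[2*i], c1[2*i + 1], c2[4*i], c2[4*i + 1], c2[4*i + 2], c2[4*i + 3], 4097
# i.e. one coarse code, two mid codes, four fine codes, then the separator token.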


parser = argparse.ArgumentParser(description="Encode and decode audio using SNAC")
parser.add_argument("input", help="Input file path or directory (for encode)")
parser.add_argument("transcription", help="Input file path or directory for transcription outputs")
parser.add_argument("output", help="Output file path for the new JSON")

args = parser.parse_args()

snac_model = SpeechTokenizer("cuda")

data = []

with open(args.transcription, "r") as file:
    reader = csv.reader(file, delimiter='\t')

    next(reader)  # skip the TSV header row
    for row in reader:
        out = {
            "text": row[3],
            "path": row[1]
        }
        data.append(out)

# Initialize the temporary file paths so the cleanup in the `finally` block below
# does not raise a NameError if an exception occurs before they are assigned.
temp_path = " "
temp_wav_path = " "

with Progress() as progress:
    task = progress.add_task(
        "[cyan]Processing transcription entries...", total=len(data)
    )

    for entry in data:
        # Encode the audio segment into SNAC tokens and save the results
        try:
            filename = str(entry['path'])
            file_path = os.path.join(args.input, filename)
            text = entry["text"]
            audio_section = AudioSegment.from_mp3(file_path)
            temp_path = save_audio_temp(audio_section)

            temp_wav_path = "temp.wav"
            preprocess_audio_to_24khz(temp_path, temp_wav_path)

            # Load and process the audio segment
            audio_snac = load_mp3_as_tensor(temp_wav_path)
            audio_snac = audio_snac.to(snac_model.device)
            codes = snac_model.encode(audio_snac)
            code_list = [c.tolist() for c in codes]

            # Flatten the tensors using the specified pattern
            sequential_snac_tokens = flatten_tensors(codes)

            # Print token lengths
            snac_token_len = len(sequential_snac_tokens)
            text_len = len(text)

            print(f"Snac token Length [bold]{snac_token_len}[/bold]")
            print(f"Text char Length [bold]{text_len}[/bold]")
            print(f"Text [bold]{text}[/bold]")

            # Collect results
            result = {
                "snac_tokens": code_list,
                "sequential_snac_tokens": sequential_snac_tokens,
                "snac_token_len": snac_token_len,
                "text": text,
                "text_len": text_len
            }

            # Append result to JSON file
            append_to_json_file(args.output, result)

        except Exception as e:
            print(
                f"[red]Error processing audio {entry['path']}:[/red] {e}"
            )
        finally:
            # Ensure temporary files are deleted
            if os.path.exists(temp_path):
                os.remove(temp_path)
            if os.path.exists(temp_wav_path):
                os.remove(temp_wav_path)

        progress.update(task, advance=1)

print(f"[blue]Results saved to {args.output}[/blue]")