Commit

update models
Xinjian Li committed May 18, 2023
1 parent 10081d5 commit 676ca0a
Showing 25 changed files with 7,794 additions and 115 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -135,4 +135,5 @@ temp/
alqalign/ctc_segmentation/ctc_segmentation_dyn.c
alqalign/scripts
.DS_Store
scripts/
scripts/
samples/output/
143 changes: 139 additions & 4 deletions README.md
@@ -1,8 +1,10 @@
# ALQAlign

Under Construction
Under construction.

alqalign is a multilingual speech alignment tool
alqalign is a phoneme-based multilingual speech alignment toolkit.

It should be able to handle ~8k languages (at least theoretically). See the [full list](./doc/language.md) of supported languages.

## Install

@@ -12,9 +14,142 @@ pip install git+https://github.com/xinjli/allosaurus.git@v2
python setup.py install
```


## Usage

The basic usage is as follows:

```bash
python -m alqalign.run --audio=<path to an audio file> --text=<path to an text file> --lang=<iso id> --output=<output dir>
python -m alqalign.run --lang=<your target language> --audio=<path to your audio file> --text=<path to your text file> --output=<path to an output directory>
```

where

- `lang`: the target language id; defaults to `eng` (English). See the language section for details.
- `audio`: the path to an audio file or an audio directory. If it is a directory, every audio file in it is processed, and the stem filename is used as the utterance id to match against the text files.
- `text`: the path to a text file or a text directory. If it is a directory, every text file in it is processed, and the stem filename is used as the utterance id to match against the audio files.
- `output`: the path to the output directory. All results/artifacts are stored here. See the output section for details.
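As a sketch, the CLI invocation above can also be assembled programmatically, e.g. for batch jobs. The file names here are hypothetical placeholders, not files shipped with the repository:

```python
import shlex

def build_alqalign_cmd(audio, text, lang="eng", output="out"):
    """Assemble the alqalign CLI invocation described above.

    All paths passed in are hypothetical placeholders.
    """
    return [
        "python", "-m", "alqalign.run",
        f"--lang={lang}",
        f"--audio={audio}",
        f"--text={text}",
        f"--output={output}",
    ]

cmd = build_alqalign_cmd("samples/hello.wav", "samples/hello.txt")
print(shlex.join(cmd))
```

The list form can be handed directly to `subprocess.run` without shell quoting concerns.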

### Output

### Mode

alqalign supports three alignment `modes`; the default is `sentence`:

- `sentence`: align at the sentence level. Each line in the text file is treated as a separate sentence, and a timestamp is computed for each one. If the file contains only a single line, the sentence is split by heuristics.
- `word`: align at the word level. Every word in the text file is aggregated and aligned; sentence boundaries are ignored.
- `phoneme`: align at the phoneme level. Phonemes are derived from each word.
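The three granularities above can be illustrated with a small sketch. This is not alqalign's actual implementation — in particular, the `g2p` callable here is a hypothetical stand-in for the toolkit's phoneme models:

```python
def units_for_mode(lines, mode, g2p=None):
    """Illustrative only: derive alignment units for each mode.

    `g2p` is a hypothetical grapheme-to-phoneme callable; alqalign
    actually derives phonemes with its own multilingual models.
    """
    if mode == "sentence":
        # one unit per non-empty line
        return [ln.strip() for ln in lines if ln.strip()]
    if mode == "word":
        # flatten all lines into words, ignoring sentence boundaries
        return [w for ln in lines for w in ln.split()]
    if mode == "phoneme":
        g2p = g2p or (lambda w: list(w))  # naive letter-level stand-in
        return [p for ln in lines for w in ln.split() for p in g2p(w)]
    raise ValueError(f"unknown mode: {mode}")

print(units_for_mode(["hello world", "good morning"], "word"))
```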

### Slice

Set the `slice` flag to true if you want to extract the aligned audio clip of each sentence/word/phoneme.

When it is true, the `output` directory will contain an `audios` directory, which looks like the following, where each file is an aligned audio clip.

```text
$ ls ./audios
000.wav 003.wav 006.wav 009.wav 012.wav 015.wav 018.wav 021.wav 024.wav 027.wav
001.wav 004.wav 007.wav 010.wav 013.wav 016.wav 019.wav 022.wav 025.wav
002.wav 005.wav 008.wav 011.wav 014.wav 017.wav 020.wav 023.wav 026.wav
```
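The clip files are numbered sequentially with zero-padded names, so they sort in alignment order. A minimal sketch of that naming scheme (the padding width is an assumption based on the listing above):

```python
# Generate sequential clip names like those in the listing: 000.wav .. 027.wav
n_clips = 28
names = [f"{i:03d}.wav" for i in range(n_clips)]
print(names[0], names[-1])  # 000.wav 027.wav
```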

### Language

You can set `lang` to your target language's ISO id. Either the 3-character or the 2-character id works; 2-character ids are automatically remapped to their 3-character form internally (e.g. `en` -> `eng`).
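The remapping behaves roughly like the sketch below. This is illustrative only: the real toolkit resolves ids through its own language tables, and this tiny dict covers just a few example entries:

```python
# Hypothetical subset of the ISO 639-1 -> 639-3 mapping; the actual
# toolkit consults complete language tables.
ISO_2_TO_3 = {"en": "eng", "de": "deu", "fr": "fra", "es": "spa", "zh": "cmn"}

def normalize_lang(lang_id: str) -> str:
    """Return the 3-character id, remapping 2-character ids when known."""
    if len(lang_id) == 2:
        return ISO_2_TO_3.get(lang_id, lang_id)
    return lang_id

print(normalize_lang("en"))   # eng
print(normalize_lang("eng"))  # eng
```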

As mentioned previously, this toolkit should be able to handle ~8k languages, at least theoretically. Unfortunately, I cannot verify every language, so if you find one that is not working properly, please open an issue.

For some common languages, check the following table. For the full list of supported languages, see the [doc here](./doc/language.md).

| ISO id | language name |
|--------------------|--------------------|
| abk | Abkhazian |
| arb | Standard Arabic |
| asm | Assamese |
| ast | Asturian |
| azb | South Azerbaijani |
| bak | Bashkir |
| bas | Basa (Cameroon) |
| bel | Belarusian |
| ben | Bengali |
| bre | Breton |
| bul | Bulgarian |
| cat | Catalan |
| ces | Czech |
| chv | Chuvash |
| ckb | Central Kurdish |
| cmn | Mandarin Chinese |
| cnh | Haka Chin |
| cym | Welsh |
| dan | Danish |
| deu | German |
| div | Dhivehi |
| ekk | Standard Estonian |
| ell | Modern Greek (1453-) |
| eng | English |
| epo | Esperanto |
| eus | Basque |
| fin | Finnish |
| fra | French |
| glg | Galician |
| grn | Guarani (macrolanguage) |
| gug | Paraguayan Guaraní |
| hau | Hausa |
| hin | Hindi |
| hsb | Upper Sorbian |
| hun | Hungarian |
| ibo | Igbo |
| ina | Interlingua (International Auxiliary Language Association) |
| ind | Indonesian |
| ita | Italian |
| jpn | Japanese |
| kab | Kabyle |
| kat | Georgian |
| kaz | Kazakh |
| kin | Kinyarwanda |
| kir | Kirghiz |
| kmr | Northern Kurdish |
| lav | Latvian |
| lit | Lithuanian |
| lug | Ganda |
| mal | Malayalam |
| mar | Marathi |
| mdf | Moksha |
| mhr | Eastern Mari |
| mkd | Macedonian |
| mlt | Maltese |
| mon | Mongolian |
| mrj | Western Mari |
| myv | Erzya |
| nld | Dutch |
| ory | Oriya (individual language) |
| pes | Iranian Persian |
| pol | Polish |
| por | Portuguese |
| ron | Romanian |
| rus | Russian |
| sah | Yakut |
| skr | Saraiki |
| slk | Slovak |
| slv | Slovenian |
| spa | Spanish |
| sro | Campidanese Sardinian |
| srp | Serbian |
| swa | Swahili (macrolanguage) |
| tam | Tamil |
| tat | Tatar |
| tha | Thai |
| tig | Tigre |
| tir | Tigrinya |
| tur | Turkish |
| twi | Twi |
| uig | Uighur |
| ukr | Ukrainian |
| urd | Urdu |
| uzb | Uzbek |
| vie | Vietnamese |
| vot | Votic |


## Samples

There are a few samples in the `samples` directory.
Empty file added alqalign/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion alqalign/config.py
@@ -1,7 +1,7 @@
import logging
from pathlib import Path

logging.basicConfig(level=logging.INFO)
#logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('alqalign')

root_path=Path(__file__).absolute().parent.parent
8 changes: 8 additions & 0 deletions alqalign/ctc_segmentation/ctc_segmentation.py
@@ -149,6 +149,7 @@ def ctc_segmentation(config, lpz, ground_truth):
raise AssertionError("Audio is shorter than text!")
window_size = config.min_window_size
# Try multiple window lengths if it fails

while True:
# Create table of alignment probabilities
table = np.zeros(
@@ -157,6 +158,12 @@
table.fill(config.max_prob)
# Use array to log window offsets per character
offsets = np.zeros([len(ground_truth)], dtype=np.int64)

logger.debug(table.shape)
logger.debug(lpz.shape)
logger.debug(ground_truth)
logger.debug(offsets)

# Run actual alignment of utterances
t, c = cython_fill_table(
table,
@@ -166,6 +173,7 @@
config.blank,
config.flags,
)

if config.backtrack_from_max_t:
t = table.shape[0] - 1
logger.info(
4 changes: 2 additions & 2 deletions alqalign/model.py
@@ -1,7 +1,7 @@
from transphone.tokenizer import read_tokenizer
import panphon
from phonepiece.inventory import read_inventory
from allosaurus.recognizer import read_recognizer
from allosaurus.app import read_recognizer

# models are singletons
am_ = None
@@ -25,7 +25,7 @@ def read_am(lang_id):
if am_ is not None:
return am_

am_ = read_recognizer('23020401')
am_ = read_recognizer()
return am_


17 changes: 11 additions & 6 deletions alqalign/process_alignment.py
@@ -22,6 +22,7 @@

def align(audio_file, text_file, lang_id, data_dir, utt_id=None, mode='sentence', threshold=-100.0, slice=False, verbose=False):

logit_file = data_dir / 'logit.npz'
lpz = np.load(logit_file, allow_pickle=True)

@@ -57,15 +58,18 @@ def align(audio_file, text_file, lang_id, data_dir, utt_id=None, mode='sentence'
config, lpz, ground_truth_mat
)


text = []

for line in open(data_dir / 'postprocess_text.txt', 'r'):
line = line.strip()
if len(line) == 0:
continue
text.append(line)

assert len(id_lst) == len(text), f"text file ({data_dir / 'postprocess_text.txt'}) has {len(text)} lines but id file ({data_dir / 'ids.txt'}) has {len(id_lst)} lines"
logger.debug(text)

if len(id_lst) != len(text):
logger.warning(f"text file ({data_dir / 'postprocess_text.txt'}) has {len(text)} lines but id file ({data_dir / 'ids.txt'}) has {len(id_lst)} lines")

phoneme = []
for line in open(data_dir / 'phonemes.txt', 'r'):
@@ -78,8 +82,8 @@

assert len(text) == len(phoneme), f"text file ({data_dir / 'postprocess_text.txt'}) has {len(text)} lines but phoneme file ({data_dir / 'phonemes.txt'}) has {len(phoneme)} lines"

print(timings)
print(utt_begin_indices)
logger.info(timings)
logger.info(utt_begin_indices)

segments = determine_utterance_segments(
config, utt_begin_indices, char_probs, timings, text, mode='sentence', verbose=verbose
@@ -89,10 +93,10 @@

w_log = open(data_dir / 'log.txt', 'w')

w_ctm = open(data_dir / 'res.ctm', 'w')
w_ctm = open(data_dir / 'result.ctm', 'w')

if slice:
output_dir = Path(data_dir / 'align')
output_dir = Path(data_dir / 'audios')
output_dir.mkdir(exist_ok=True, parents=True)

all_count = len(segments)
@@ -125,5 +129,6 @@

log = f'successfully aligned {success_count} / {all_count}'
print(log)

w_log.write(log+'\n')
w_log.close()
91 changes: 36 additions & 55 deletions alqalign/process_audio.py
@@ -1,18 +1,11 @@
from allosaurus.audio import read_audio, write_audio, split_audio, silent_audio, concatenate_audio, Audio
import numpy as np
from pathlib import Path
from tqdm import tqdm
from alqalign.config import logger
from alqalign.model import read_am
import datetime
import kaldiio
import torch
from alqalign.utils import read_audio_rspecifier


def transcribe_audio(audio_file, lang_id, data_dir, duration=15.0, batch_size=8, verbose=False):
def transcribe_audio(audio_file, lang_id, data_dir, duration=15.0, batch_size=8, force=False):

if (data_dir / f'logit.npz').exists():
if (data_dir / f'logit.npz').exists() and not force:
return

data_dir.mkdir(parents=True, exist_ok=True)
@@ -21,54 +14,42 @@ def transcribe_audio(audio_file, lang_id, data_dir, duration=15.0, batch_size=8,

logger.info(f"total audio duration: {audio.duration()}")

audio_lst = split_audio(audio, duration=duration)

am = read_am(lang_id)

logits, decode_info_lst = am.get_logits_batch(audio_lst, lang_id, batch_size=batch_size)

logit_lst = []

for logit in logits:
lpz = logit[1]
lpz = np.concatenate([lpz, lpz[-1][np.newaxis, :]], axis=0)
logit_lst.append(lpz)

lpz = np.concatenate(logit_lst, axis=0)

lpz.dump(data_dir / f'logit.npz')

w = open(data_dir / f'decoded.txt', 'w')

#print(decode_info_lst)

for i, token_pair in enumerate(decode_info_lst):
utt_id = token_pair[0]
decoded_info = token_pair[1]

assert int(utt_id) == i

chunk_start_time = i*duration
# chunk_end_time = str(datetime.timedelta(seconds=(i+1)*duration))

for phone_info in decoded_info:
start_time = phone_info['start'] + chunk_start_time
duration = phone_info['duration']
phone = phone_info['phone']
prob = phone_info['prob']

w.write(f"{utt_id} {start_time:.3f} {duration:.3f} {phone} {prob}\n")
# dump output results
am.recognize(audio, lang_id, output=data_dir, batch_size=batch_size, segment_duration=duration, verbose=True, logit=True)

w.close()


# if am is None:
# am = read_recognizer('xlsr_transformer', '/home/xinjianl/Git/asr2k/data/model/031901/model_0.231203.pt', 'phone', 'raw')
# logit_lst = []
#
# for logit in logits:
# lpz = logit[1]
# lpz = np.concatenate([lpz, lpz[-1][np.newaxis, :]], axis=0)
# logit_lst.append(lpz)
#
# lpz = np.concatenate(logit_lst, axis=0)
#
# lpz.dump(data_dir / f'logit.npz')
#
# w = open(data_dir / f'decode.txt', 'w')
#
# print(decode_info_lst)
#
# for i, token_pair in enumerate(decode_info_lst):
# utt_id = token_pair[0]
# decoded_info = token_pair[1]
#
# assert int(utt_id[-4:]) == i
#
# chunk_start_time = i*duration
# # chunk_end_time = str(datetime.timedelta(seconds=(i+1)*duration))
#
# for phone_info in decoded_info:
# start_time = phone_info['start'] + chunk_start_time
# duration = phone_info['duration']
# phone = phone_info['phone']
# prob = phone_info['prob']
#
# w.write(f"{utt_id} {start_time:.3f} {duration:.3f} {phone} {prob}\n")
#
# for file in tqdm(sorted(audio_dir.glob('*.wav'))):
# name = file.stem
# print('transcribing audio ', file)
# res = am.get_logits(file, lang_id)
# lpz = res[0][0].cpu().detach().numpy()
# lpz = np.concatenate([lpz, lpz[-1][np.newaxis, :]], axis=0)
# lpz.dump(logit_dir / f'{name}.npz')
# w.close()
