Merge pull request #50 from oxygen-dioxide/diffsinger

fix diffsinger pitch error on short spaces
xunmengshe · Jul 22, 2023 · 289618b · 289618b
2 parents 27090a2 + d49984a
commit 289618b
Show file tree

Hide file tree

Showing 5 changed files with 59 additions and 44 deletions.
diff --git a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
@@ -30,7 +30,7 @@ public class DsConfig {
         public string variance;
         public int hop_size = 512;
         public int sample_rate = 44100;
-
+        public bool predict_dur = true;
         public float frameMs(){
             return 1000f * hop_size / sample_rate;
         }

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs b/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs
@@ -3,6 +3,7 @@
 using System.IO;
 using System.Linq;
 using System.Text;
+using System.Transactions;
 using Microsoft.ML.OnnxRuntime;
 using Microsoft.ML.OnnxRuntime.Tensors;
 
@@ -24,7 +25,6 @@ public class DsPitch
         float frameMs;
         const float headMs = DiffSingerUtils.headMs;
 
-        //Get vocoder by package name
         public DsPitch(string rootPath)
         {
             this.rootPath = rootPath;
@@ -36,9 +36,9 @@ public DsPitch(string rootPath)
             phonemes = File.ReadLines(phonemesPath, Encoding.UTF8).ToList();
             //Load models
             var linguisticModelPath = Path.Join(rootPath, dsConfig.linguistic);
-            linguisticModel = new InferenceSession(linguisticModelPath);
+            linguisticModel = Onnx.getInferenceSession(linguisticModelPath);
             var pitchModelPath = Path.Join(rootPath, dsConfig.pitch);
-            pitchModel = new InferenceSession(pitchModelPath);
+            pitchModel = Onnx.getInferenceSession(pitchModelPath);
             frameMs = 1000f * dsConfig.hop_size / dsConfig.sample_rate;
             //Load g2p
             g2p = LoadG2p(rootPath);
@@ -63,33 +63,44 @@ public RenderPitchResult Process(RenderPhrase phrase){
             var endMs = phrase.notes[^1].endMs;
             int n_frames = (int)(endMs/frameMs)-(int)(startMs/frameMs);
             //Linguistic Encoder
+            var linguisticInputs = new List<NamedOnnxValue>();
             var tokens = phrase.phones
                 .Select(p => (Int64)phonemes.IndexOf(p.phoneme))
                 .Prepend((Int64)phonemes.IndexOf("SP"))
                 .ToArray();
-            var vowelIds = Enumerable.Range(0,phrase.phones.Length)
-                .Where(i=>g2p.IsVowel(phrase.phones[i].phoneme))
-                .Append(phrase.phones.Length)
-                .ToArray();
-            var word_div = vowelIds.Zip(vowelIds.Skip(1),(a,b)=>(Int64)(b-a))
-                .Prepend(vowelIds[0] + 1)
-                .ToArray();
-            var word_dur = vowelIds.Zip(vowelIds.Skip(1),
-                    (a,b)=>(Int64)(phrase.phones[b-1].endMs/frameMs) - (Int64)(phrase.phones[a].positionMs/frameMs))
-                .Prepend((Int64)(phrase.phones[vowelIds[0]].positionMs/frameMs) - (Int64)(startMs/frameMs))
+            var ph_dur = phrase.phones
+                .Select(p=>(Int64)(p.endMs/frameMs) - (Int64)(p.positionMs/frameMs))
+                .Prepend((Int64)(phrase.phones[0].positionMs/frameMs) - (Int64)(startMs/frameMs))
                 .ToArray();
-
-            //Call Diffsinger Linguistic Encoder model
-            var linguisticInputs = new List<NamedOnnxValue>();
             linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("tokens",
                 new DenseTensor<Int64>(tokens, new int[] { tokens.Length }, false)
                 .Reshape(new int[] { 1, tokens.Length })));
-            linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_div",
-                new DenseTensor<Int64>(word_div, new int[] { word_div.Length }, false)
-                .Reshape(new int[] { 1, word_div.Length })));
-            linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_dur",
-                new DenseTensor<Int64>(word_dur, new int[] { word_dur.Length }, false)
-                .Reshape(new int[] { 1, word_dur.Length })));
+            if(dsConfig.predict_dur){
+                //if predict_dur is true, use word encode mode
+                var vowelIds = Enumerable.Range(0,phrase.phones.Length)
+                    .Where(i=>g2p.IsVowel(phrase.phones[i].phoneme))
+                    .Append(phrase.phones.Length)
+                    .ToArray();
+                var word_div = vowelIds.Zip(vowelIds.Skip(1),(a,b)=>(Int64)(b-a))
+                    .Prepend(vowelIds[0] + 1)
+                    .ToArray();
+                var word_dur = vowelIds.Zip(vowelIds.Skip(1),
+                        (a,b)=>(Int64)(phrase.phones[b-1].endMs/frameMs) - (Int64)(phrase.phones[a].positionMs/frameMs))
+                    .Prepend((Int64)(phrase.phones[vowelIds[0]].positionMs/frameMs) - (Int64)(startMs/frameMs))
+                    .ToArray();
+                linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_div",
+                    new DenseTensor<Int64>(word_div, new int[] { word_div.Length }, false)
+                    .Reshape(new int[] { 1, word_div.Length })));
+                linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_dur",
+                    new DenseTensor<Int64>(word_dur, new int[] { word_dur.Length }, false)
+                    .Reshape(new int[] { 1, word_dur.Length })));
+            }else{
+                //if predict_dur is false, use phoneme encode mode
+                linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("ph_dur",
+                    new DenseTensor<Int64>(ph_dur, new int[] { ph_dur.Length }, false)
+                    .Reshape(new int[] { 1, ph_dur.Length })));
+            }
+
             var linguisticOutputs = linguisticModel.Run(linguisticInputs);
             Tensor<float> encoder_out = linguisticOutputs
                 .Where(o => o.Name == "encoder_out")
@@ -104,18 +115,17 @@ public RenderPitchResult Process(RenderPhrase phrase){
                 .Select(n=>(float)n.tone)
                 .Prepend((float)phrase.notes[0].tone)
                 .ToArray();
-            var note_dur = phrase.notes
-                .Select(n=> (Int64)(n.endMs/frameMs) - (Int64)(n.positionMs/frameMs))
+            //use the delta of the positions of the next note and the current note 
+            //to prevent incorrect timing when there is a small space between two notes
+            var note_dur = phrase.notes.Zip(phrase.notes.Skip(1),
+                    (curr,next)=> (Int64)(next.positionMs/frameMs) - (Int64)(curr.positionMs/frameMs))
                 .Prepend((Int64)(phrase.notes[0].positionMs/frameMs) - (Int64)(startMs/frameMs))
+                .Append((Int64)(phrase.notes[^1].endMs/frameMs)-(Int64)(phrase.notes[^1].positionMs/frameMs))
                 .ToArray();
-            var ph_dur = phrase.phones
-                .Select(p=>(Int64)(p.endMs/frameMs) - (Int64)(p.positionMs/frameMs))
-                .Prepend((Int64)(phrase.phones[0].positionMs/frameMs) - (Int64)(startMs/frameMs))
-                .ToArray();
+
             var pitch = Enumerable.Repeat(60f, n_frames).ToArray();
             var retake = Enumerable.Repeat(true, n_frames).ToArray();
             var speedup = Preferences.Default.DiffsingerSpeedup;
-            //Call Diffsinger Pitch Predictor model
             var pitchInputs = new List<NamedOnnxValue>();
             pitchInputs.Add(NamedOnnxValue.CreateFromTensor("encoder_out", encoder_out));
             pitchInputs.Add(NamedOnnxValue.CreateFromTensor("note_midi",

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
@@ -237,10 +237,8 @@ float[] InvokeDiffsinger(RenderPhrase phrase,int speedup) {
             return samples;
         }
 
-
-        //Loading rendered pitch isn't currently supported
         public RenderPitchResult LoadRenderedPitch(RenderPhrase phrase) {
-            return (phrase.singer as DiffSingerSinger).getPitchGenerator().Process(phrase);
+            return (phrase.singer as DiffSingerSinger).getPitchPredictor().Process(phrase);
         }
 
         public UExpressionDescriptor[] GetSuggestedExpressions(USinger singer, URenderSettings renderSettings) {

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerSinger.cs b/OpenUtau.Core/DiffSinger/DiffSingerSinger.cs
@@ -39,7 +39,7 @@ class DiffSingerSinger : USinger {
         public DsConfig dsConfig;
         public InferenceSession acousticSession = null;
         public DsVocoder vocoder = null;
-        public DsPitch pitchGenerator = null;
+        public DsPitch pitchPredictor = null;
         public NDArray speakerEmbeds = null;
 
 
@@ -108,8 +108,7 @@ public override byte[] LoadPortrait() {
 
         public InferenceSession getAcousticSession() {
             if (acousticSession is null) {
-                var acousticModel = File.ReadAllBytes(Path.Combine(Location, dsConfig.acoustic));
-                acousticSession = Onnx.getInferenceSession(acousticModel);
+                acousticSession = Onnx.getInferenceSession(Path.Combine(Location, dsConfig.acoustic));
             }
             return acousticSession;
         }
@@ -121,15 +120,15 @@ public DsVocoder getVocoder() {
             return vocoder;
         }
 
-        public DsPitch getPitchGenerator(){
-            if(pitchGenerator is null) {
+        public DsPitch getPitchPredictor(){
+            if(pitchPredictor is null) {
                 if(File.Exists(Path.Join(Location, "dspitch", "dsconfig.yaml"))){
-                    pitchGenerator = new DsPitch(Path.Join(Location, "dspitch"));
-                    return pitchGenerator;
+                    pitchPredictor = new DsPitch(Path.Join(Location, "dspitch"));
+                    return pitchPredictor;
                 }
-                pitchGenerator = new DsPitch(Location);
+                pitchPredictor = new DsPitch(Location);
             }
-            return pitchGenerator;
+            return pitchPredictor;
         }
 
         public NDArray loadSpeakerEmbed(string speaker) {

diff --git a/OpenUtau.Core/Util/Onnx.cs b/OpenUtau.Core/Util/Onnx.cs
@@ -56,7 +56,7 @@ public static List<GpuInfo> getGpuInfo() {
             return gpuList;
         }
 
-        public static InferenceSession getInferenceSession(byte[] model) {
+        private static SessionOptions getOnnxSessionOptions(){
             SessionOptions options = new SessionOptions();
             List<string> runnerOptions = getRunnerOptions();
             string runner = Preferences.Default.OnnxRunner;
@@ -74,7 +74,15 @@ public static InferenceSession getInferenceSession(byte[] model) {
                     options.AppendExecutionProvider_CoreML(CoreMLFlags.COREML_FLAG_ENABLE_ON_SUBGRAPH);
                     break;
             }
-            return new InferenceSession(model,options);
+            return options;
+        }
+
+        public static InferenceSession getInferenceSession(byte[] model) {
+            return new InferenceSession(model,getOnnxSessionOptions());
+        }
+
+        public static InferenceSession getInferenceSession(string modelPath) {
+            return new InferenceSession(modelPath,getOnnxSessionOptions());
         }
     }
 }