diff --git a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs index 82418747e..2e1ab100d 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs @@ -30,7 +30,7 @@ public class DsConfig { public string variance; public int hop_size = 512; public int sample_rate = 44100; - + public bool predict_dur = true; public float frameMs(){ return 1000f * hop_size / sample_rate; } diff --git a/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs b/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs index faf97373e..07d83e02f 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs @@ -3,6 +3,7 @@ using System.IO; using System.Linq; using System.Text; +using System.Transactions; using Microsoft.ML.OnnxRuntime; using Microsoft.ML.OnnxRuntime.Tensors; @@ -24,7 +25,6 @@ public class DsPitch float frameMs; const float headMs = DiffSingerUtils.headMs; - //Get vocoder by package name public DsPitch(string rootPath) { this.rootPath = rootPath; @@ -36,9 +36,9 @@ public DsPitch(string rootPath) phonemes = File.ReadLines(phonemesPath, Encoding.UTF8).ToList(); //Load models var linguisticModelPath = Path.Join(rootPath, dsConfig.linguistic); - linguisticModel = new InferenceSession(linguisticModelPath); + linguisticModel = Onnx.getInferenceSession(linguisticModelPath); var pitchModelPath = Path.Join(rootPath, dsConfig.pitch); - pitchModel = new InferenceSession(pitchModelPath); + pitchModel = Onnx.getInferenceSession(pitchModelPath); frameMs = 1000f * dsConfig.hop_size / dsConfig.sample_rate; //Load g2p g2p = LoadG2p(rootPath); @@ -63,33 +63,44 @@ public RenderPitchResult Process(RenderPhrase phrase){ var endMs = phrase.notes[^1].endMs; int n_frames = (int)(endMs/frameMs)-(int)(startMs/frameMs); //Linguistic Encoder + var linguisticInputs = new List(); var tokens = phrase.phones .Select(p => (Int64)phonemes.IndexOf(p.phoneme)) 
.Prepend((Int64)phonemes.IndexOf("SP")) .ToArray(); - var vowelIds = Enumerable.Range(0,phrase.phones.Length) - .Where(i=>g2p.IsVowel(phrase.phones[i].phoneme)) - .Append(phrase.phones.Length) - .ToArray(); - var word_div = vowelIds.Zip(vowelIds.Skip(1),(a,b)=>(Int64)(b-a)) - .Prepend(vowelIds[0] + 1) - .ToArray(); - var word_dur = vowelIds.Zip(vowelIds.Skip(1), - (a,b)=>(Int64)(phrase.phones[b-1].endMs/frameMs) - (Int64)(phrase.phones[a].positionMs/frameMs)) - .Prepend((Int64)(phrase.phones[vowelIds[0]].positionMs/frameMs) - (Int64)(startMs/frameMs)) + var ph_dur = phrase.phones + .Select(p=>(Int64)(p.endMs/frameMs) - (Int64)(p.positionMs/frameMs)) + .Prepend((Int64)(phrase.phones[0].positionMs/frameMs) - (Int64)(startMs/frameMs)) .ToArray(); - - //Call Diffsinger Linguistic Encoder model - var linguisticInputs = new List(); linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("tokens", new DenseTensor(tokens, new int[] { tokens.Length }, false) .Reshape(new int[] { 1, tokens.Length }))); - linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_div", - new DenseTensor(word_div, new int[] { word_div.Length }, false) - .Reshape(new int[] { 1, word_div.Length }))); - linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_dur", - new DenseTensor(word_dur, new int[] { word_dur.Length }, false) - .Reshape(new int[] { 1, word_dur.Length }))); + if(dsConfig.predict_dur){ + //if predict_dur is true, use word encode mode + var vowelIds = Enumerable.Range(0,phrase.phones.Length) + .Where(i=>g2p.IsVowel(phrase.phones[i].phoneme)) + .Append(phrase.phones.Length) + .ToArray(); + var word_div = vowelIds.Zip(vowelIds.Skip(1),(a,b)=>(Int64)(b-a)) + .Prepend(vowelIds[0] + 1) + .ToArray(); + var word_dur = vowelIds.Zip(vowelIds.Skip(1), + (a,b)=>(Int64)(phrase.phones[b-1].endMs/frameMs) - (Int64)(phrase.phones[a].positionMs/frameMs)) + .Prepend((Int64)(phrase.phones[vowelIds[0]].positionMs/frameMs) - (Int64)(startMs/frameMs)) + .ToArray(); + 
linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_div", + new DenseTensor(word_div, new int[] { word_div.Length }, false) + .Reshape(new int[] { 1, word_div.Length }))); + linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_dur", + new DenseTensor(word_dur, new int[] { word_dur.Length }, false) + .Reshape(new int[] { 1, word_dur.Length }))); + }else{ + //if predict_dur is false, use phoneme encode mode + linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("ph_dur", + new DenseTensor(ph_dur, new int[] { ph_dur.Length }, false) + .Reshape(new int[] { 1, ph_dur.Length }))); + } + var linguisticOutputs = linguisticModel.Run(linguisticInputs); Tensor encoder_out = linguisticOutputs .Where(o => o.Name == "encoder_out") @@ -104,18 +115,17 @@ public RenderPitchResult Process(RenderPhrase phrase){ .Select(n=>(float)n.tone) .Prepend((float)phrase.notes[0].tone) .ToArray(); - var note_dur = phrase.notes - .Select(n=> (Int64)(n.endMs/frameMs) - (Int64)(n.positionMs/frameMs)) + //use the delta of the positions of the next note and the current note + //to prevent incorrect timing when there is a small space between two notes + var note_dur = phrase.notes.Zip(phrase.notes.Skip(1), + (curr,next)=> (Int64)(next.positionMs/frameMs) - (Int64)(curr.positionMs/frameMs)) .Prepend((Int64)(phrase.notes[0].positionMs/frameMs) - (Int64)(startMs/frameMs)) + .Append((Int64)(phrase.notes[^1].endMs/frameMs)-(Int64)(phrase.notes[^1].positionMs/frameMs)) .ToArray(); - var ph_dur = phrase.phones - .Select(p=>(Int64)(p.endMs/frameMs) - (Int64)(p.positionMs/frameMs)) - .Prepend((Int64)(phrase.phones[0].positionMs/frameMs) - (Int64)(startMs/frameMs)) - .ToArray(); + var pitch = Enumerable.Repeat(60f, n_frames).ToArray(); var retake = Enumerable.Repeat(true, n_frames).ToArray(); var speedup = Preferences.Default.DiffsingerSpeedup; - //Call Diffsinger Pitch Predictor model var pitchInputs = new List(); pitchInputs.Add(NamedOnnxValue.CreateFromTensor("encoder_out", encoder_out)); 
pitchInputs.Add(NamedOnnxValue.CreateFromTensor("note_midi", diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs index 1258c78e0..379d10536 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs @@ -237,10 +237,8 @@ float[] InvokeDiffsinger(RenderPhrase phrase,int speedup) { return samples; } - - //Loading rendered pitch isn't currently supported public RenderPitchResult LoadRenderedPitch(RenderPhrase phrase) { - return (phrase.singer as DiffSingerSinger).getPitchGenerator().Process(phrase); + return (phrase.singer as DiffSingerSinger).getPitchPredictor().Process(phrase); } public UExpressionDescriptor[] GetSuggestedExpressions(USinger singer, URenderSettings renderSettings) { diff --git a/OpenUtau.Core/DiffSinger/DiffSingerSinger.cs b/OpenUtau.Core/DiffSinger/DiffSingerSinger.cs index 7911a2e71..c1478c7c7 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerSinger.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerSinger.cs @@ -39,7 +39,7 @@ class DiffSingerSinger : USinger { public DsConfig dsConfig; public InferenceSession acousticSession = null; public DsVocoder vocoder = null; - public DsPitch pitchGenerator = null; + public DsPitch pitchPredictor = null; public NDArray speakerEmbeds = null; @@ -108,8 +108,7 @@ public override byte[] LoadPortrait() { public InferenceSession getAcousticSession() { if (acousticSession is null) { - var acousticModel = File.ReadAllBytes(Path.Combine(Location, dsConfig.acoustic)); - acousticSession = Onnx.getInferenceSession(acousticModel); + acousticSession = Onnx.getInferenceSession(Path.Combine(Location, dsConfig.acoustic)); } return acousticSession; } @@ -121,15 +120,15 @@ public DsVocoder getVocoder() { return vocoder; } - public DsPitch getPitchGenerator(){ - if(pitchGenerator is null) { + public DsPitch getPitchPredictor(){ + if(pitchPredictor is null) { if(File.Exists(Path.Join(Location, "dspitch", "dsconfig.yaml"))){ - 
pitchGenerator = new DsPitch(Path.Join(Location, "dspitch")); - return pitchGenerator; + pitchPredictor = new DsPitch(Path.Join(Location, "dspitch")); + return pitchPredictor; } - pitchGenerator = new DsPitch(Location); + pitchPredictor = new DsPitch(Location); } - return pitchGenerator; + return pitchPredictor; } public NDArray loadSpeakerEmbed(string speaker) { diff --git a/OpenUtau.Core/Util/Onnx.cs b/OpenUtau.Core/Util/Onnx.cs index 7d95371cd..fb1de5b5f 100644 --- a/OpenUtau.Core/Util/Onnx.cs +++ b/OpenUtau.Core/Util/Onnx.cs @@ -56,7 +56,7 @@ public static List getGpuInfo() { return gpuList; } - public static InferenceSession getInferenceSession(byte[] model) { + private static SessionOptions getOnnxSessionOptions(){ SessionOptions options = new SessionOptions(); List runnerOptions = getRunnerOptions(); string runner = Preferences.Default.OnnxRunner; @@ -74,7 +74,15 @@ public static InferenceSession getInferenceSession(byte[] model) { options.AppendExecutionProvider_CoreML(CoreMLFlags.COREML_FLAG_ENABLE_ON_SUBGRAPH); break; } - return new InferenceSession(model,options); + return options; + } + + public static InferenceSession getInferenceSession(byte[] model) { + return new InferenceSession(model,getOnnxSessionOptions()); + } + + public static InferenceSession getInferenceSession(string modelPath) { + return new InferenceSession(modelPath,getOnnxSessionOptions()); } } }