Skip to content

Commit

Permalink
Merge pull request #50 from oxygen-dioxide/diffsinger
Browse files Browse the repository at this point in the history
fix diffsinger pitch error on short spaces
  • Loading branch information
oxygen-dioxide authored Jul 22, 2023
2 parents 27090a2 + d49984a commit 289618b
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 44 deletions.
2 changes: 1 addition & 1 deletion OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public class DsConfig {
public string variance;
public int hop_size = 512;
public int sample_rate = 44100;

public bool predict_dur = true;
/// <summary>
/// Length of one hop (frame) in milliseconds, derived from the configured
/// <c>hop_size</c> and <c>sample_rate</c>.
/// </summary>
/// <returns><c>1000f * hop_size / sample_rate</c> as a float.</returns>
public float frameMs() => 1000f * hop_size / sample_rate;
Expand Down
68 changes: 39 additions & 29 deletions OpenUtau.Core/DiffSinger/DiffSingerPitch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
using System.IO;
using System.Linq;
using System.Text;
using System.Transactions;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;

Expand All @@ -24,7 +25,6 @@ public class DsPitch
float frameMs;
const float headMs = DiffSingerUtils.headMs;

//Get vocoder by package name
public DsPitch(string rootPath)
{
this.rootPath = rootPath;
Expand All @@ -36,9 +36,9 @@ public DsPitch(string rootPath)
phonemes = File.ReadLines(phonemesPath, Encoding.UTF8).ToList();
//Load models
var linguisticModelPath = Path.Join(rootPath, dsConfig.linguistic);
linguisticModel = new InferenceSession(linguisticModelPath);
linguisticModel = Onnx.getInferenceSession(linguisticModelPath);
var pitchModelPath = Path.Join(rootPath, dsConfig.pitch);
pitchModel = new InferenceSession(pitchModelPath);
pitchModel = Onnx.getInferenceSession(pitchModelPath);
frameMs = 1000f * dsConfig.hop_size / dsConfig.sample_rate;
//Load g2p
g2p = LoadG2p(rootPath);
Expand All @@ -63,33 +63,44 @@ public RenderPitchResult Process(RenderPhrase phrase){
var endMs = phrase.notes[^1].endMs;
int n_frames = (int)(endMs/frameMs)-(int)(startMs/frameMs);
//Linguistic Encoder
var linguisticInputs = new List<NamedOnnxValue>();
var tokens = phrase.phones
.Select(p => (Int64)phonemes.IndexOf(p.phoneme))
.Prepend((Int64)phonemes.IndexOf("SP"))
.ToArray();
var vowelIds = Enumerable.Range(0,phrase.phones.Length)
.Where(i=>g2p.IsVowel(phrase.phones[i].phoneme))
.Append(phrase.phones.Length)
.ToArray();
var word_div = vowelIds.Zip(vowelIds.Skip(1),(a,b)=>(Int64)(b-a))
.Prepend(vowelIds[0] + 1)
.ToArray();
var word_dur = vowelIds.Zip(vowelIds.Skip(1),
(a,b)=>(Int64)(phrase.phones[b-1].endMs/frameMs) - (Int64)(phrase.phones[a].positionMs/frameMs))
.Prepend((Int64)(phrase.phones[vowelIds[0]].positionMs/frameMs) - (Int64)(startMs/frameMs))
var ph_dur = phrase.phones
.Select(p=>(Int64)(p.endMs/frameMs) - (Int64)(p.positionMs/frameMs))
.Prepend((Int64)(phrase.phones[0].positionMs/frameMs) - (Int64)(startMs/frameMs))
.ToArray();

//Call Diffsinger Linguistic Encoder model
var linguisticInputs = new List<NamedOnnxValue>();
linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("tokens",
new DenseTensor<Int64>(tokens, new int[] { tokens.Length }, false)
.Reshape(new int[] { 1, tokens.Length })));
linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_div",
new DenseTensor<Int64>(word_div, new int[] { word_div.Length }, false)
.Reshape(new int[] { 1, word_div.Length })));
linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_dur",
new DenseTensor<Int64>(word_dur, new int[] { word_dur.Length }, false)
.Reshape(new int[] { 1, word_dur.Length })));
if(dsConfig.predict_dur){
//if predict_dur is true, use word encode mode
var vowelIds = Enumerable.Range(0,phrase.phones.Length)
.Where(i=>g2p.IsVowel(phrase.phones[i].phoneme))
.Append(phrase.phones.Length)
.ToArray();
var word_div = vowelIds.Zip(vowelIds.Skip(1),(a,b)=>(Int64)(b-a))
.Prepend(vowelIds[0] + 1)
.ToArray();
var word_dur = vowelIds.Zip(vowelIds.Skip(1),
(a,b)=>(Int64)(phrase.phones[b-1].endMs/frameMs) - (Int64)(phrase.phones[a].positionMs/frameMs))
.Prepend((Int64)(phrase.phones[vowelIds[0]].positionMs/frameMs) - (Int64)(startMs/frameMs))
.ToArray();
linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_div",
new DenseTensor<Int64>(word_div, new int[] { word_div.Length }, false)
.Reshape(new int[] { 1, word_div.Length })));
linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_dur",
new DenseTensor<Int64>(word_dur, new int[] { word_dur.Length }, false)
.Reshape(new int[] { 1, word_dur.Length })));
}else{
//if predict_dur is false, use phoneme encode mode
linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("ph_dur",
new DenseTensor<Int64>(ph_dur, new int[] { ph_dur.Length }, false)
.Reshape(new int[] { 1, ph_dur.Length })));
}

var linguisticOutputs = linguisticModel.Run(linguisticInputs);
Tensor<float> encoder_out = linguisticOutputs
.Where(o => o.Name == "encoder_out")
Expand All @@ -104,18 +115,17 @@ public RenderPitchResult Process(RenderPhrase phrase){
.Select(n=>(float)n.tone)
.Prepend((float)phrase.notes[0].tone)
.ToArray();
var note_dur = phrase.notes
.Select(n=> (Int64)(n.endMs/frameMs) - (Int64)(n.positionMs/frameMs))
//use the delta of the positions of the next note and the current note
//to prevent incorrect timing when there is a small space between two notes
var note_dur = phrase.notes.Zip(phrase.notes.Skip(1),
(curr,next)=> (Int64)(next.positionMs/frameMs) - (Int64)(curr.positionMs/frameMs))
.Prepend((Int64)(phrase.notes[0].positionMs/frameMs) - (Int64)(startMs/frameMs))
.Append((Int64)(phrase.notes[^1].endMs/frameMs)-(Int64)(phrase.notes[^1].positionMs/frameMs))
.ToArray();
var ph_dur = phrase.phones
.Select(p=>(Int64)(p.endMs/frameMs) - (Int64)(p.positionMs/frameMs))
.Prepend((Int64)(phrase.phones[0].positionMs/frameMs) - (Int64)(startMs/frameMs))
.ToArray();

var pitch = Enumerable.Repeat(60f, n_frames).ToArray();
var retake = Enumerable.Repeat(true, n_frames).ToArray();
var speedup = Preferences.Default.DiffsingerSpeedup;
//Call Diffsinger Pitch Predictor model
var pitchInputs = new List<NamedOnnxValue>();
pitchInputs.Add(NamedOnnxValue.CreateFromTensor("encoder_out", encoder_out));
pitchInputs.Add(NamedOnnxValue.CreateFromTensor("note_midi",
Expand Down
4 changes: 1 addition & 3 deletions OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -237,10 +237,8 @@ float[] InvokeDiffsinger(RenderPhrase phrase,int speedup) {
return samples;
}


//Loading rendered pitch isn't currently supported
public RenderPitchResult LoadRenderedPitch(RenderPhrase phrase) {
return (phrase.singer as DiffSingerSinger).getPitchGenerator().Process(phrase);
return (phrase.singer as DiffSingerSinger).getPitchPredictor().Process(phrase);
}

public UExpressionDescriptor[] GetSuggestedExpressions(USinger singer, URenderSettings renderSettings) {
Expand Down
17 changes: 8 additions & 9 deletions OpenUtau.Core/DiffSinger/DiffSingerSinger.cs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class DiffSingerSinger : USinger {
public DsConfig dsConfig;
public InferenceSession acousticSession = null;
public DsVocoder vocoder = null;
public DsPitch pitchGenerator = null;
public DsPitch pitchPredictor = null;
public NDArray speakerEmbeds = null;


Expand Down Expand Up @@ -108,8 +108,7 @@ public override byte[] LoadPortrait() {

public InferenceSession getAcousticSession() {
if (acousticSession is null) {
var acousticModel = File.ReadAllBytes(Path.Combine(Location, dsConfig.acoustic));
acousticSession = Onnx.getInferenceSession(acousticModel);
acousticSession = Onnx.getInferenceSession(Path.Combine(Location, dsConfig.acoustic));
}
return acousticSession;
}
Expand All @@ -121,15 +120,15 @@ public DsVocoder getVocoder() {
return vocoder;
}

public DsPitch getPitchGenerator(){
if(pitchGenerator is null) {
public DsPitch getPitchPredictor(){
if(pitchPredictor is null) {
if(File.Exists(Path.Join(Location, "dspitch", "dsconfig.yaml"))){
pitchGenerator = new DsPitch(Path.Join(Location, "dspitch"));
return pitchGenerator;
pitchPredictor = new DsPitch(Path.Join(Location, "dspitch"));
return pitchPredictor;
}
pitchGenerator = new DsPitch(Location);
pitchPredictor = new DsPitch(Location);
}
return pitchGenerator;
return pitchPredictor;
}

public NDArray loadSpeakerEmbed(string speaker) {
Expand Down
12 changes: 10 additions & 2 deletions OpenUtau.Core/Util/Onnx.cs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ public static List<GpuInfo> getGpuInfo() {
return gpuList;
}

public static InferenceSession getInferenceSession(byte[] model) {
private static SessionOptions getOnnxSessionOptions(){
SessionOptions options = new SessionOptions();
List<string> runnerOptions = getRunnerOptions();
string runner = Preferences.Default.OnnxRunner;
Expand All @@ -74,7 +74,15 @@ public static InferenceSession getInferenceSession(byte[] model) {
options.AppendExecutionProvider_CoreML(CoreMLFlags.COREML_FLAG_ENABLE_ON_SUBGRAPH);
break;
}
return new InferenceSession(model,options);
return options;
}

/// <summary>
/// Creates an ONNX Runtime inference session from an in-memory model,
/// using the session options produced by <c>getOnnxSessionOptions</c>.
/// </summary>
/// <param name="model">The serialized ONNX model bytes.</param>
/// <returns>A new <see cref="InferenceSession"/> for the model.</returns>
public static InferenceSession getInferenceSession(byte[] model) {
    // SessionOptions wraps a native handle and is only consulted while the
    // session is being constructed, so dispose it deterministically instead
    // of leaving it to the finalizer.
    using (SessionOptions options = getOnnxSessionOptions()) {
        return new InferenceSession(model, options);
    }
}

/// <summary>
/// Creates an ONNX Runtime inference session from a model file on disk,
/// using the session options produced by <c>getOnnxSessionOptions</c>.
/// </summary>
/// <param name="modelPath">Path to the ONNX model file.</param>
/// <returns>A new <see cref="InferenceSession"/> for the model.</returns>
public static InferenceSession getInferenceSession(string modelPath) {
    // SessionOptions wraps a native handle and is only consulted while the
    // session is being constructed, so dispose it deterministically instead
    // of leaving it to the finalizer.
    using (SessionOptions options = getOnnxSessionOptions()) {
        return new InferenceSession(modelPath, options);
    }
}
}
}

0 comments on commit 289618b

Please sign in to comment.