diff --git a/pyproject.toml b/pyproject.toml index ac742a44510..3ffb9858c56 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -352,6 +352,20 @@ medhelm = [ "transformers~=4.45,<4.50", ] +proxy_tuning = [ + "transformers~=4.51.0,<4.53.0", #for qwen MoE models + "bitsandbytes>=0.43", + "accelerate~=0.25", + + "crfm-helm[openai]", + "crfm-helm[yandex]", + "crfm-helm[scenarios]", + "lxml~=5.3", + "openpyxl~=3.1", + "python-docx~=1.1", + "lm-format-enforcer~=0.11.3" +] + audiolm = [ "crfm-helm[openai]", "crfm-helm[google]", diff --git a/src/helm/benchmark/presentation/run_entries_medhelm_private_proxy_tuning.conf b/src/helm/benchmark/presentation/run_entries_medhelm_private_proxy_tuning.conf new file mode 100644 index 00000000000..0c10f8b8443 --- /dev/null +++ b/src/helm/benchmark/presentation/run_entries_medhelm_private_proxy_tuning.conf @@ -0,0 +1,192 @@ +# MedHELM RunSpecs for the private benchmarks from Stanford with the inference-time domain adaptation models in the Proxy Tuning class. 
+ +entries: [ + + ########## Clinical Decision Support ########## + + ### Supporting Diagnostic Decisions ### + + #Alcohol Dependence + {description: "clear:condition=alcohol_dependence,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=alcohol_dependence,max_eval_instances=100,model=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=alcohol_dependence,max_eval_instances=100,model=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=alcohol_dependence,max_eval_instances=100,model=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=alcohol_dependence,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=alcohol_dependence,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: 
"clear:condition=alcohol_dependence,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + + #Attention Deficit Hyperactivity Disorder + {description: "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: 
"clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=attention_deficit_hyperactivity_disorder,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + + #Bipolar Disorder + {description: "clear:condition=bipolar_disorder,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=bipolar_disorder,max_eval_instances=100,model=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=bipolar_disorder,max_eval_instances=100,model=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=bipolar_disorder,max_eval_instances=100,model=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: 
"clear:condition=bipolar_disorder,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=bipolar_disorder,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=bipolar_disorder,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + + #Chronic Pain + {description: "clear:condition=chronic_pain,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=chronic_pain,max_eval_instances=100,model=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=chronic_pain,max_eval_instances=100,model=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: 
"clear:condition=chronic_pain,max_eval_instances=100,model=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=chronic_pain,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=chronic_pain,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=chronic_pain,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + + #Homelessness + {description: "clear:condition=homelessness,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=homelessness,max_eval_instances=100,model=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: 
"clear:condition=homelessness,max_eval_instances=100,model=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=homelessness,max_eval_instances=100,model=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=homelessness,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=homelessness,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=homelessness,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + + #Liver Disease + {description: "clear:condition=liver_disease,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: 
"clear:condition=liver_disease,max_eval_instances=100,model=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=liver_disease,max_eval_instances=100,model=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=liver_disease,max_eval_instances=100,model=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=liver_disease,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=liver_disease,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=liver_disease,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + + #Major Depression + {description: 
"clear:condition=major_depression,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=major_depression,max_eval_instances=100,model=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=major_depression,max_eval_instances=100,model=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=major_depression,max_eval_instances=100,model=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=major_depression,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=major_depression,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: 
"clear:condition=major_depression,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + + #Personality Disorder + {description: "clear:condition=personality_disorder,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=personality_disorder,max_eval_instances=100,model=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=personality_disorder,max_eval_instances=100,model=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=personality_disorder,max_eval_instances=100,model=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=personality_disorder,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: 
"clear:condition=personality_disorder,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=personality_disorder,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + + #Post-Traumatic Stress Disorder + {description: "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: 
"clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=post_traumatic_stress_disorder,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + + #Substance Use Disorder + {description: "clear:condition=substance_use_disorder,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=substance_use_disorder,max_eval_instances=100,model=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=substance_use_disorder,max_eval_instances=100,model=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: 
"clear:condition=substance_use_disorder,max_eval_instances=100,model=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=substance_use_disorder,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=substance_use_disorder,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=substance_use_disorder,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + + #Suicidal Behavior + {description: "clear:condition=suicidal_behavior,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=suicidal_behavior,max_eval_instances=100,model=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: 
"clear:condition=suicidal_behavior,max_eval_instances=100,model=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=suicidal_behavior,max_eval_instances=100,model=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=suicidal_behavior,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=suicidal_behavior,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=suicidal_behavior,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + + #Tobacco Dependence + {description: "clear:condition=tobacco_dependence,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: 
"clear:condition=tobacco_dependence,max_eval_instances=100,model=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=tobacco_dependence,max_eval_instances=100,model=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=tobacco_dependence,max_eval_instances=100,model=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=tobacco_dependence,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=tobacco_dependence,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=tobacco_dependence,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + + #Unemployment + {description: 
"clear:condition=unemployment,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=unemployment,max_eval_instances=100,model=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=unemployment,max_eval_instances=100,model=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=unemployment,max_eval_instances=100,model=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=unemployment,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: "clear:condition=unemployment,max_eval_instances=100,model=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + {description: 
"clear:condition=unemployment,max_eval_instances=100,model=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/", priority: 1}, + + ### Planning Treatments ### + + ### Predicting Patient Risks and Outcomes ### + + + ########## Clinical Note Generation ########## + + ### Documenting Patient Visits ### + + ### Recording Procedures ### + + ### Documenting Diagnostic Reports ### + + ### Documenting Care Plans ### + {description: "chw_care_plan:model=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,data_path=/share/pi/nigam/datasets/CHW_Dataset.csv", priority: 1}, + {description: "chw_care_plan:model=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/datasets/CHW_Dataset.csv", priority: 1}, + {description: "chw_care_plan:model=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/datasets/CHW_Dataset.csv", priority: 1}, + {description: "chw_care_plan:model=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/datasets/CHW_Dataset.csv", priority: 1}, + {description: "chw_care_plan:model=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,data_path=/share/pi/nigam/datasets/CHW_Dataset.csv", priority: 1}, + {description: "chw_care_plan:model=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,data_path=/share/pi/nigam/datasets/CHW_Dataset.csv", 
priority: 1}, + {description: "chw_care_plan:model=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,data_path=/share/pi/nigam/datasets/CHW_Dataset.csv", priority: 1}, + + ########## Patient Communication and Education ########## + + ### Providing Patient Education Resources ### + + ### Delivering Personalized Care Instructions ### + {description: "starr_patient_instructions:model=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_none_none_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/starr-omop-personalized-care-instr/dataset_cases_qc.csv", priority: 1}, + {description: "starr_patient_instructions:model=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/starr-omop-personalized-care-instr/dataset_cases_qc.csv", priority: 1}, + {description: "starr_patient_instructions:model=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/starr-omop-personalized-care-instr/dataset_cases_qc.csv", priority: 1}, + {description: "starr_patient_instructions:model=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/starr-omop-personalized-care-instr/dataset_cases_qc.csv", priority: 1}, + {description: "starr_patient_instructions:model=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/starr-omop-personalized-care-instr/dataset_cases_qc.csv", priority: 1}, + {description: 
"starr_patient_instructions:model=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,model_deployment=proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20,data_path=/share/pi/nigam/suhana/medhelm/data/starr-omop-personalized-care-instr/dataset_cases_qc.csv", priority: 1}, + {description: "starr_patient_instructions:model=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,model_deployment=proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20,data_path=/share/pi/nigam/suhana/medhelm/data/starr-omop-personalized-care-instr/dataset_cases_qc.csv", priority: 1}, + + ### Patient-Provider Messaging ### + + + ### Enhancing Patient Understanding and Accessibility in Health Communication ### + + ### Facilitating Patient Engagement and Support ### + + ########## Medical Research Assistance ########## + + ### Conducting Literature Research ### + + ### Analyzing Clinical Research Data ### + + ### Recording Research Processes ### + + + ### Ensuring Clinical Research Quality ### + + ### Managing Research Enrollment ### + + ########## Administration and Workflow ########## + + ### Scheduling Resources and Staff ### + + + ### Overseeing Financial Activities ### + + ### Care Coordination and Planning ### + + ### Organizing Workflow Processes ### + +] diff --git a/src/helm/clients/proxy_tuning_client.py b/src/helm/clients/proxy_tuning_client.py new file mode 100644 index 00000000000..9b30456565c --- /dev/null +++ b/src/helm/clients/proxy_tuning_client.py @@ -0,0 +1,226 @@ +""" +Proxy-tuned HELM client +======================= + +This module implements a HELM Client that routes generation through +decoding-time strategies for domain-level adaptation. +It runs multiple models (base, expert, anti-expert). +This is experimental code to test different decoding-time strategies. + +Main classes +------------ +- AnyModel: Imported from any_model.py in proxy_tuning directory. 
Performs step-wise generation under three modes: + 1) **Base** (single base model, runs using HF generate() function), + 2) **Unite** (merge base + expert via vocabulary union arithmetic): + - adapted from [this codebase](https://github.com/starrYYxuan/UniTE/) + 3) **Proxy**: + - Original method adapted from [this codebase](https://github.com/alisawuffles/proxy-tuning/tree/main): + - base + alpha(expert − anti-expert) at the logit level with models of same vocabulary. + - Cross-Architecture Proxy Tuning (our novel method) + - same formula as above using log-probs with models of differing vocabulary + +- ProxyTuningClient: A HELM client that parses the deployment tag to + configure `AnyModel`, runs generation for a given `Request`, and logs + per-request outputs and token-wise outputs for the proxy tuning method. + +Deployment/tag format +--------------------- +The `model_name` (a.k.a. deployment tag) is expected to be of the form: + "proxy_tuning/{base}_{expert}_{antiexpert}_{alpha}_{score_type}_{k}" + +Examples: + proxy_tuning/mellama-13b-chat_none_none_1.0_logits_20 (base) + proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20 (unite) + proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_0.7_logits_10 (Original proxy, logits) + proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_0.7_logprobs_10 (CAPT proxy, logprobs) + +Each sub-tag meaning: +- base / expert / antiexpert: keys that must exist in `MODEL_PATHS` below + (use "none" to disable that role). + - if only base is not "none" --> base method + - if base and expert are not "none" --> unite method + - if base, expert, and antiexpert are not "none" --> proxy method +- alpha: float, strength of expert vs anti-expert adjustment. +- score_type: "logits" (original proxy tuning) or "logprobs" (CAPT). +- k: top-k token pool size when building the union vocabulary (for Unite + and CAPT). 
+ +Artifacts +--------- +A results directory is created under: + + LOCAL_RESULTS_DIR/_/ + +Files inside: +- `_.csv` : One row per HELM request with columns: + timestamp, request_id, model_name, prompt, output, logits_path +- `logits_analysis/` : Optional per-request tensors (when + `return_logits_for_analysis=True`) saved via `torch.save(...)` as: + logits__r####.pt + +""" + +from helm.clients.client import Client +from helm.common.request import Request, RequestResult, GeneratedOutput +import os +import sys +import torch +from datetime import datetime + +sys.path.insert(0, "/share/pi/ema2016/users/sronaghi/proxy_tuning") + +from any_model import load_any_model # noqa: E402 + +LOCAL_RESULTS_DIR = "/share/pi/ema2016/users/sronaghi/proxy_tuning/results/medhelm" + + +def ensure_dir(d): + if not os.path.exists(d): + os.makedirs(d, exist_ok=True) + + +# proxy tuning helpers +def _safe_tag(model_name: str) -> str: + # e.g. "proxy_tuning/llama70b_mellama13bchat" -> "proxy_tuning_llama70b_mellama13bchat" + return model_name.replace("/", "_").replace(" ", "").replace(".", "").replace("-", "") + + +def setup_run_dirs(model_name: str, root=LOCAL_RESULTS_DIR): + """ + Creates: + /_/ + ├─ _.csv + └─ logits_analysis/ + Returns: (run_dir, csv_path, logits_dir) + """ + ensure_dir(root) + tag = _safe_tag(model_name) + stamp = datetime.now().strftime("%Y%m%d_%H%M%S") + run_dir = os.path.join(root, f"{tag}_{stamp}") + ensure_dir(run_dir) + + csv_name = f"{tag}_{stamp}.csv" + csv_path = os.path.join(run_dir, csv_name) + with open(csv_path, "w") as f: + f.write("timestamp,request_id,model_name,prompt,output,logits_path\n") + + logits_dir = os.path.join(run_dir, "logits_analysis") + ensure_dir(logits_dir) + + print(f"[TokenLog] created run dir: {run_dir}") + print(f"[TokenLog] csv: {csv_path}") + print(f"[TokenLog] logits dir: {logits_dir}") + return run_dir, csv_path, logits_dir + + +def append_request_row( + csv_path: str, request_id: str, model_name: str, prompt: str, output: str, 
logits_path: str | None +): + ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + def esc(s: str) -> str: + if s is None: + return "" + return s.replace("\n", "\\n").replace(",", ",") + + with open(csv_path, "a") as f: + f.write(f"{ts},{request_id},{esc(model_name)},{esc(prompt)},{esc(output)},{esc(logits_path or '')}\n") + + +class ProxyTuningClient(Client): + """ + A HELM client that uses ProxyTuning for inference instead of directly calling the model. + """ + + def __init__( + self, + model_name: str, + ): + """ + Initializes the ProxyTuningClient. + + """ + self.run_dir, self.token_log_path, self.logits_dir = setup_run_dirs(model_name) + self.model_name = model_name + self.run_id = datetime.now().strftime("%Y%m%d_%H%M%S") + self.req_seq = 0 + tag = model_name.split("/")[-1] + self.return_logits_for_analysis = True + + parts = tag.split("_") + base_name, expert_name, antiexpert_name, self.alpha, self.score_type, k_str = ( + parts[0], + parts[1], + parts[2], + float(parts[3]), + parts[4], + parts[5], + ) + self.k = int(k_str) + self.is_unite = False + self.is_proxy = False + if expert_name != "none": + if antiexpert_name == "none": + self.is_unite = True + else: + self.is_proxy = True + + self.any_model = load_any_model( + base_name=base_name, + expert_name=expert_name, + antiexpert_name=antiexpert_name, + alpha=self.alpha, + proxy=self.is_proxy, + unite=self.is_unite, + ) + + def make_request(self, request: Request) -> RequestResult: + + prompt_text = request.prompt + max_new_tokens = 750 + + if request.max_tokens: + max_new_tokens = request.max_tokens + print("max_new_tokens: ", max_new_tokens) + + if request.messages: + print(request.messages) + prompt_text = " ".join(msg["content"] for msg in request.messages if msg.get("role") != "system") + + # progress = tqdm.tqdm(total=1, desc="Generating Completions") + print("doing a generation", flush=True) + + generation, logit_results = self.any_model.generate( + prompt=prompt_text, + max_new_tokens=max_new_tokens, 
+ alpha=self.alpha, + return_logits_for_analysis=self.return_logits_for_analysis, + score_type=self.score_type, + k=self.k, + unite=self.is_unite, + proxy=self.is_proxy, + ) + + print("generation: ", generation, flush=True) + + self.req_seq += 1 + request_id = f"{self.run_id}_r{self.req_seq:04d}" + + logits_path = None + if self.return_logits_for_analysis and logit_results: + logits_path = os.path.join(self.logits_dir, f"logits_{request_id}.pt") + torch.save(logit_results, logits_path) + print(f"[Logits] wrote {logits_path}") + + append_request_row( + csv_path=self.token_log_path, + request_id=request_id, + model_name=self.model_name, + prompt=prompt_text, + output=generation, + logits_path=logits_path, + ) + + # Return a HELM-compatible RequestResult + output = GeneratedOutput(text=generation, logprob=0.0, tokens=[]) + return RequestResult(success=True, cached=False, completions=[output], embedding=[]) diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml index cdbc0037a95..c4813d1267d 100644 --- a/src/helm/config/model_deployments.yaml +++ b/src/helm/config/model_deployments.yaml @@ -8,6 +8,8 @@ # # This file defines all the model deployments that you do not want to be public. # model_deployments: [] # Leave empty to disable private model deployments +# This file contains deployments for the Proxy Tuning class. 
+ model_deployments: - name: simple/model1 model_name: simple/model1 @@ -5107,3 +5109,66 @@ model_deployments: dspy_module: ChainOfThought dspy_api_model: openai/o3-mini-2025-01-31 dspy_api_base: null + + - name: proxy_tuning/llama-70b-chat_none_none_1.0_logits_20 + model_name: proxy_tuning/llama-70b-chat_none_none_1.0_logits_20 + tokenizer_name: meta-llama/Llama-2-7b-hf + max_sequence_length: 4096 + client_spec: + class_name: "helm.clients.proxy_tuning_client.ProxyTuningClient" + + - name: proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20 + model_name: proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20 + tokenizer_name: meta-llama/Llama-2-7b-hf + max_sequence_length: 4096 + client_spec: + class_name: "helm.clients.proxy_tuning_client.ProxyTuningClient" + + - name: proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20 + model_name: proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20 + tokenizer_name: meta-llama/Llama-2-7b-hf + max_sequence_length: 4096 + client_spec: + class_name: "helm.clients.proxy_tuning_client.ProxyTuningClient" + + - name: proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20 + model_name: proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20 + tokenizer_name: qwen/qwen2.5-7b-instruct + max_sequence_length: 4096 + client_spec: + class_name: "helm.clients.proxy_tuning_client.ProxyTuningClient" + + - name: proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20 + model_name: proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20 + tokenizer_name: qwen/qwen2.5-7b-instruct + max_sequence_length: 4096 + client_spec: + class_name: "helm.clients.proxy_tuning_client.ProxyTuningClient" + + - name: proxy_tuning/qwen3-30b_mellama-13b-base_llama-13b-base_1.0_logprobs_20 + model_name: proxy_tuning/qwen3-30b_mellama-13b-base_llama-13b-base_1.0_logprobs_20 + tokenizer_name: qwen/qwen2.5-7b-instruct + max_sequence_length: 4096 + client_spec: + class_name: "helm.clients.proxy_tuning_client.ProxyTuningClient" + + - name: 
proxy_tuning/qwen3-30b_mellama-13b-chat_mellama-13b-base_1.0_logprobs_20 + model_name: proxy_tuning/qwen3-30b_mellama-13b-chat_mellama-13b-base_1.0_logprobs_20 + tokenizer_name: qwen/qwen2.5-7b-instruct + max_sequence_length: 4096 + client_spec: + class_name: "helm.clients.proxy_tuning_client.ProxyTuningClient" + + - name: proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20 + model_name: proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20 + tokenizer_name: meta-llama/Llama-2-7b-hf + max_sequence_length: 4096 + client_spec: + class_name: "helm.clients.proxy_tuning_client.ProxyTuningClient" + + - name: proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20 + model_name: proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20 + tokenizer_name: qwen/qwen2.5-7b-instruct + max_sequence_length: 4096 + client_spec: + class_name: "helm.clients.proxy_tuning_client.ProxyTuningClient" diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml index 9f3fee61c65..8cd416d58c9 100644 --- a/src/helm/config/model_metadata.yaml +++ b/src/helm/config/model_metadata.yaml @@ -8,6 +8,8 @@ # # This file contains the metadata for private models # models: [] # Leave empty to disable private models +# This file contains model metadata for models using the ProxyTuning Class. + models: @@ -5229,3 +5231,88 @@ models: access: limited release_date: 2025-01-31 tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + - name: proxy_tuning/llama-70b-chat_none_none_1.0_logits_20 + display_name: proxy_tuning/llama-70b-chat_none_none_1.0_logits_20 + description: llama-70b-chat. 
+ creator_organization_name: Sasha Ronaghi + access: open + num_parameters: 70000000000 + release_date: 2025-10-15 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20 + display_name: proxy_tuning/mellama-70b-chat_none_none_1.0_logprobs_20 + description: mellama-70b-chat. + creator_organization_name: Sasha Ronaghi + access: open + num_parameters: 70000000000 + release_date: 2025-10-15 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20 + display_name: proxy_tuning/mellama-13b-chat_none_none_1.0_logprobs_20 + description: mellama-13b-chat. + creator_organization_name: Sasha Ronaghi + access: open + num_parameters: 13000000000 + release_date: 2025-10-15 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + + - name: proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20 + display_name: proxy_tuning/qwen3-30b_none_none_1.0_logprobs_20 + description: Qwen3-30b. + creator_organization_name: Sasha Ronaghi + access: open + num_parameters: 30000000000 + release_date: 2025-10-15 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + + - name: proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20 + display_name: proxy_tuning/qwen3-30b_mellama-13b-chat_llama-13b-base_1.0_logprobs_20 + description: Proxy tuned Qwen3-30b with mellama-13b-chat expert and llama-13b-base antiexpert. + creator_organization_name: Sasha Ronaghi + access: open + num_parameters: 30000000000 + release_date: 2025-10-15 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: proxy_tuning/qwen3-30b_mellama-13b-base_llama-13b-base_1.0_logprobs_20 + display_name: proxy_tuning/qwen3-30b_mellama-13b-base_llama-13b-base_1.0_logprobs_20 + description: Proxy tuned Qwen3-30b with mellama-13b-base expert and llama-13b-base antiexpert. 
+ creator_organization_name: Sasha Ronaghi + access: open + num_parameters: 30000000000 + release_date: 2025-10-15 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: proxy_tuning/qwen3-30b_mellama-13b-chat_mellama-13b-base_1.0_logprobs_20 + display_name: proxy_tuning/qwen3-30b_mellama-13b-chat_mellama-13b-base_1.0_logprobs_20 + description: Proxy tuned Qwen3-30b with mellama-13b-chat expert and mellama-13b-base antiexpert. + creator_organization_name: Sasha Ronaghi + access: open + num_parameters: 30000000000 + release_date: 2025-10-15 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20 + display_name: proxy_tuning/llama-70b-chat_mellama-13b-chat_llama-13b-base_1.0_logits_20 + description: Proxy tuned Llama2-70b-chat with mellama-13b-chat expert and llama-13b-base antiexpert. + creator_organization_name: Sasha Ronaghi + access: open + num_parameters: 70000000000 + release_date: 2025-10-15 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20 + display_name: proxy_tuning/qwen3-30b_mellama-13b-chat_none_1.0_logprobs_20 + description: Unite of Qwen3-30b with mellama-13b-chat expert. + creator_organization_name: Sasha Ronaghi + access: open + num_parameters: 30000000000 + release_date: 2025-10-15 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + +