-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
84538b1
commit ee35f7c
Showing
134 changed files
with
153,779 additions
and
155,205 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +0,0 @@ | ||
/ckpts | ||
/data | ||
/logs | ||
/wanda_logs | ||
/result | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,32 +1,26 @@ | ||
# download ApolloCorpus | ||
mkdir metadata | ||
|
||
cd metadata | ||
wget https://huggingface.co/datasets/FreedomIntelligence/ApolloCorpus/resolve/main/ApolloCorpus.zip | ||
unzip ApolloCorpus.zip | ||
cd train/pretrain | ||
|
||
qa_dir="qa" | ||
pretrain_sft_dir="pretrain_sft" | ||
# Prepare Data for Mix training | ||
mkdir mixTrain | ||
|
||
if [ ! -d "$qa_dir" ]; then | ||
mkdir -p "$qa_dir" | ||
fi | ||
|
||
if [ ! -d "$pretrain_sft_dir" ]; then | ||
mkdir -p "$pretrain_sft_dir" | ||
fi | ||
|
||
cd train/pretrain | ||
# Mix training only uses the QA pairs from pretrain | ||
for file in *; do | ||
if [[ $file == *_qa.json ]]; then | ||
mv "$file" "$qa_dir/" | ||
elif [[ $file == *_text.json ]]; then | ||
mv "$file" "$pretrain_sft_dir/" | ||
fi | ||
cp "$file" "../mixTrain/" | ||
done | ||
mv pretrain_sft/ ../ | ||
mv qa/ ../ | ||
cd ../ | ||
rm pretrain | ||
|
||
mv sft/ all_sft/ | ||
# Move all files from sft/ into mixTrain/ | ||
mv sft/* mixTrain/ | ||
|
||
# Merge all the files in the mixTrain directory into a JSON file | ||
python merge_json_train.py | ||
cd ../ | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,13 @@ | ||
python ./src/process/prepare/data_process_test_qwen.py \ | ||
# Gemma is used as the example here; the Python scripts for other models are at ./src/process/prepare/data_process_test_{model}.py | ||
mkdir -p ./data/gemma | ||
|
||
python ./src/process/prepare/data_process_test_gemma.py \ | ||
--data_path ./metadata/test.json \ | ||
--few_shot 3 \ | ||
--save_path ./data/Qwen/test.json | ||
--save_path ./data/gemma/test.json | ||
|
||
|
||
python ./src/process/prepare/data_process_test_qwen.py \ | ||
python ./src/process/prepare/data_process_test_gemma.py \ | ||
--data_path ./metadata/dev.json \ | ||
--few_shot 3 \ | ||
--save_path ./data/Qwen/dev.json | ||
--save_path ./data/gemma/dev.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,17 @@ | ||
model_name = Qwen | ||
model_path = /your_model_path/Qwen1.5-0.5B | ||
experiment_name=qwenallsftcom_data | ||
# Four places need to be changed | ||
# Please set the wandb key in the Python file (e.g. ./src/process/prepare/data_process_train_gemma.py) | ||
|
||
mkdir wandb_logs | ||
|
||
experiment_name=Gemma_MixTrain_Data | ||
log_folder="./logs/${experiment_name}" | ||
mkdir -p $log_folder | ||
log_name=$(date +"%m-%d_%H-%M").log | ||
|
||
|
||
python ./src/process/prepare/data_process_train_qwen.py \ | ||
--data_path ./metadata/train/sft.json \ | ||
--model_path ${model_path} \ | ||
python ./src/process/prepare/data_process_train_gemma.py \ | ||
--data_path ./metadata/train/mixTrain.json \ | ||
--model_path /your/path/to/gemma-2b \ | ||
--wandb_log ./wandb_logs \ | ||
--experiment_name ${experiment_name} \ | ||
--save_path ./data/${model_name}/allsftcom > ${log_folder}/$log_name 2>&1 & | ||
|
||
--save_path ./data/Gemma/mixTrain > ${log_folder}/$log_name 2>&1 & |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,12 @@ | ||
experiment_name=Qwen1.5-0.5B_test | ||
cd . | ||
experiment_name=Gemma2b_MixTrain_Test | ||
log_folder="./logs/${experiment_name}" | ||
result_folder="./results/${experiment_name}" | ||
mkdir -p $log_folder | ||
mkdir -p $result_folder | ||
log_name=$(date +"%m-%d_%H-%M").log | ||
|
||
python ./src/evaluate/eval_qwen.py \ | ||
--input_path=./data/qwen/test.json \ | ||
python ./src/evaluate/eval_gemma.py \ | ||
--input_path=./data/gemma/test.json \ | ||
--output_path=${result_folder}/model_ans.jsonl \ | ||
--score_path=${result_folder}/score.json \ | ||
--wrong_item_path=${result_folder}/wrong_item.json > ${log_folder}/$log_name 2>&1 & | ||
--wrong_item_path=${result_folder}/wrong_item.json > ${log_folder}/$log_name 2>&1 & |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.