-
Notifications
You must be signed in to change notification settings - Fork 269
Expand file tree
/
Copy pathtest_xgboost.py
More file actions
86 lines (66 loc) · 2.89 KB
/
test_xgboost.py
File metadata and controls
86 lines (66 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import colorama
import argparse
import pandas as pd
import xgboost as xgb
# Blog: https://blog.csdn.net/fengbingchun/article/details/146159691
def parse_args():
parser = argparse.ArgumentParser(description="test XGBoost")
parser.add_argument("--task", required=True, type=str, choices=["regress", "classify", "rank"], help="specify what kind of task")
parser.add_argument("--csv", required=True, type=str, help="source csv file")
parser.add_argument("--model", required=True, type=str, help="model file, save or load")
args = parser.parse_args()
return args
def split_train_test(X, y):
X = X.sample(frac=1, random_state=42).reset_index(drop=True) # random_state=42: make the results consistent each time
y = y.sample(frac=1, random_state=42).reset_index(drop=True)
index = int(len(X) * 0.8)
X_train, X_test = X[:index], X[index:]
y_train, y_test = y[:index], y[index:]
return X_train, X_test, y_train, y_test
def calculate_rmse(input, target): # Root Mean Squared Error
return (sum((input - target) ** 2) / len(input)) ** 0.5
def regress(csv_file, model_file):
# 1. load data
data = pd.read_csv(csv_file)
# 2. split into training set and test se
X = data.drop('MEDV', axis=1)
y = data['MEDV']
print(f"X: type: {type(X)}, shape: {X.shape}; y: type: {type(X)}, shape: {y.shape}")
X_train, X_test, y_train, y_test = split_train_test(X, y)
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
test_dmatrix = xgb.DMatrix(X_test, label=y_test)
print(f"train_dmatrix type: {type(train_dmatrix)}, shape(h,w): {train_dmatrix.num_row()}, {train_dmatrix.num_col()}")
# 3. set XGBoost params
params = {
'objective': 'reg:squarederror', # specify the learning task: classify: binary:logistic or multi:softmax or multi:softprob; rank: rank:ndcg
'max_depth': 5, # maximum tree depth
'eta': 0.1, # learning rate
'subsample': 0.8, # subsample ratio of the training instance
'colsample_bytree': 0.8, # subsample ratio of columns when constructing each tree
'seed': 42, # random number seed
'eval_metric': 'rmse' # metric used for monitoring the training result and early stopping
}
# 4. train model
best = xgb.train(params, train_dmatrix, num_boost_round=1000) # num_boost_round: epochs
# 5. predict
y_pred = best.predict(test_dmatrix)
# print(f"y_pred: {y_pred}")
# 6. evaluate the model
rmse = calculate_rmse(y_test, y_pred)
print(f"rmse: {rmse}")
# 7. save model
best.save_model(model_file)
# 8. load mode and predict
model = xgb.Booster()
model.load_model(model_file)
result = model.predict(test_dmatrix)
test_label = test_dmatrix.get_label()
for idx in range(len(result)):
print(f"ground truth: {test_label[idx]:.1f}, \tpredict: {result[idx]:.1f}")
if __name__ == "__main__":
print("xgboost version:", xgb.__version__)
colorama.init(autoreset=True)
args = parse_args()
if args.task == "regress":
regress(args.csv, args.model)
print(colorama.Fore.GREEN + "====== execution completed ======")