import numpy as np
import scipy.sparse
import pickle
import xgboost as xgb
从文件或者 xgboost 生成的二进制缓存中读取数据
# Build the training and test DMatrix objects directly from the agaricus
# demo text files shipped under demo/data.
dtrain = xgb.DMatrix(data='demo/data/agaricus.txt.train')
dtest = xgb.DMatrix(data='demo/data/agaricus.txt.test')
用 dict 保存参数
# Booster parameters: depth-2 trees, learning rate 1, quiet logging,
# binary classification with a logistic objective.
param = dict(
    max_depth=2,
    eta=1,
    silent=1,
    objective='binary:logistic',
)
设置 validation set
# Datasets whose error is reported after every boosting round, each paired
# with the name used as the prefix in the log line.
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=watchlist)
[0] eval-error:0.042831 train-error:0.046522
[1] eval-error:0.021726 train-error:0.022263
预测
# Ground-truth labels stored in the test DMatrix.
labels = dtest.get_label()
# Model output for every test row; with the logistic objective these are
# values between 0 and 1, not hard class labels.
preds = bst.predict(dtest)
labels
array([0., 1., 0., ..., 1., 0., 1.], dtype=float32)
preds
array([0.28583017, 0.9239239 , 0.28583017, ..., 0.9239239 , 0.05169873,
0.9239239 ], dtype=float32)
preds > 0.5
array([False, True, False, ..., True, False, True])
pred_labels = np.copy(preds)
选择阈值进行二分类
# Turn the predicted scores into hard 0/1 labels in pred_labels.
threshold = 0.5
is_positive = preds > threshold
is_negative = preds <= threshold
# Two explicit masks (rather than one mask and its negation) so that any
# entry matching neither comparison is left untouched.
pred_labels[is_positive] = 1
pred_labels[is_negative] = 0
pred_labels
array([0., 1., 0., ..., 1., 0., 1.], dtype=float32)
# Accuracy = number of matching labels / number of samples.
# labels.shape is a 1-tuple, so the division broadcasts and the result is
# printed as a one-element array rather than a bare scalar.
n_correct = sum(pred_labels == labels)
print(n_correct / labels.shape)
[0.97827436]
sum(np.array([True, False, True, False]))
2
bst.save_model('test.model')
dump model: dump_model 把训练好的每棵树以人类可读的文本形式导出,用于查看和解释模型;如果同时传入 featmap 文件,输出里的特征编号(如 f29)会被替换成对应的特征名称。它和下面的 save_model 不同:dump_model 并不是保存模型,导出的文件不能再用 Booster() 加载。
ls demo/data
agaricus.txt.test agaricus.txt.train
# Plain text dump: split conditions refer to features by index (f29, f56, ...).
bst.dump_model('dump.raw.txt')
# Same dump, but with a feature-map file so that splits are written with
# feature names instead of indices.
bst.dump_model('dump.nice.txt', fmap='binary_classification/featmap.txt')
cat dump.nice.txt
booster[0]:
0:[odor=pungent] yes=2,no=1
1:[stalk-root=cup] yes=4,no=3
3:leaf=1.71218
4:leaf=-1.70044
2:[spore-print-color=orange] yes=6,no=5
5:leaf=-1.94071
6:leaf=1.85965
booster[1]:
0:[stalk-root=missing] yes=2,no=1
1:[odor=pungent] yes=4,no=3
3:leaf=0.784718
4:leaf=-0.96853
2:leaf=-6.23624
cat dump.raw.txt
booster[0]:
0:[f29<-9.53674e-07] yes=1,no=2,missing=1
1:[f56<-9.53674e-07] yes=3,no=4,missing=3
3:leaf=1.71218
4:leaf=-1.70044
2:[f109<-9.53674e-07] yes=5,no=6,missing=5
5:leaf=-1.94071
6:leaf=1.85965
booster[1]:
0:[f60<-9.53674e-07] yes=1,no=2,missing=1
1:[f29<-9.53674e-07] yes=3,no=4,missing=3
3:leaf=0.784718
4:leaf=-0.96853
2:leaf=-6.23624
对比两个文件可以看出:传入 featmap 之后,dump.nice.txt 用特征名称(如 odor=pungent)代替了 dump.raw.txt 里的特征编号(如 f29),树的结构和叶子节点的值完全相同。
把 dmatrix 保存到二进制缓存中
dtest.save_binary('dtest.buffer')
save model
bst.save_model('xgb.boost')
load model
# Reload both artifacts written earlier: the binary DMatrix cache and the
# saved booster.
dtest2 = xgb.DMatrix('dtest.buffer')
bst2 = xgb.Booster(model_file='xgb.boost')
preds2 = bst2.predict(dtest2)
# The round trip must be lossless: the reloaded model on the reloaded data
# has to reproduce the original predictions exactly.
total_abs_diff = np.abs(preds2 - preds).sum()
assert total_abs_diff == 0
使用 scipy.sparse.csr_matrix 从稀疏矩阵中读取数据矩阵。
scipy.sparse.csr_matrix((data, (row, col))) 接收三个等长的 list:data[k] 是第 k 个非零值,row[k] 和 col[k] 分别是它所在的行号和列号,很好理解。 理解了这个参数,以及原始数据的格式(每行是 `label index:value index:value ...`)之后,就可以自己写数据读取的代码了。 可以在 https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html 里面看看例子。
print('\n\n Start running examples of build DMatrix from scipy.sparse CSR Matrix \n')
Start running examples of build DMatrix from scipy.sparse CSR Matrix
# Parse the training text file by hand into COO-style triplets.
# Each non-empty line has the form "<label> <col>:<value> <col>:<value> ...".
#   label      -- one integer label per data row
#   row / col  -- coordinates of every stored entry
#   dat        -- the value stored at (row[k], col[k])
label = []
row = []
col = []
dat = []
i = 0  # index of the matrix row currently being filled
# Use a context manager so the file handle is closed deterministically.
with open('./demo/data/agaricus.txt.train') as train_file:
    for line in train_file:
        arr = line.split()
        if not arr:
            # Blank line (e.g. trailing newline at end of file): there is no
            # label to read, so skip it without consuming a row index.
            continue
        label.append(int(arr[0]))
        for it in arr[1:]:
            k, v = it.split(':')
            row.append(i)
            col.append(int(k))
            dat.append(float(v))
        i += 1
len(label)
6513
type(label[0])
int
len(row)
143286
len(col)
143286
len(dat)
143286
csr = scipy.sparse.csr_matrix((dat, (row, col)))
type(label[0])
int
# Train again, this time feeding the hand-built CSR matrix plus the parsed
# labels instead of letting xgboost read the text file itself.
dtrain = xgb.DMatrix(data=csr, label=label)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=watchlist)
[0] eval-error:0.042831 train-error:0.046522
[1] eval-error:0.021726 train-error:0.022263
print('Start running examples of buid DMatrix from numpy array')
Start running examples of buid DMatrix from numpy array
# Densify the sparse matrix into a plain 2-D numpy.ndarray.
# toarray() is used instead of todense(): todense() on a csr_matrix yields
# a numpy.matrix, which numpy no longer recommends and which is not the
# "numpy array" this example is meant to demonstrate. The values are the same.
npymat = csr.toarray()
dtrain = xgb.DMatrix(npymat, label=label)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, watchlist)
[0] eval-error:0.042831 train-error:0.046522
[1] eval-error:0.021726 train-error:0.022263