import numpy as np
import scipy.sparse
import pickle
import xgboost as xgb

从文件或者 xgboost 生成的二进制缓存中读取数据

# Build the training and evaluation DMatrix objects directly from the
# agaricus text files (training file first, then the test file).
dtrain, dtest = (
    xgb.DMatrix(data_path)
    for data_path in (
        'demo/data/agaricus.txt.train',
        'demo/data/agaricus.txt.test',
    )
)

用 dict 保存参数

# Booster settings, collected in a plain dict that is later handed to
# xgb.train(). Keys are inserted in the same order as before.
param = {}
param['max_depth'] = 2
param['eta'] = 1
# NOTE(review): newer XGBoost releases appear to replace `silent` with
# `verbosity` -- confirm against the installed version before changing it.
param['silent'] = 1
param['objective'] = 'binary:logistic'

设置 validation set

# Datasets whose error metric is printed after every boosting round, each
# paired with the label used in the log line.
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
# Pass the round count and the evaluation list by keyword: recent XGBoost
# releases accept `evals` as a keyword argument only, and the keyword form
# works on older releases as well.
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=watchlist)
[0]    eval-error:0.042831    train-error:0.046522
[1]    eval-error:0.021726    train-error:0.022263

预测

preds = bst.predict(dtest)
labels = dtest.get_label()
labels
array([0., 1., 0., ..., 1., 0., 1.], dtype=float32)
preds
array([0.28583017, 0.9239239 , 0.28583017, ..., 0.9239239 , 0.05169873,
       0.9239239 ], dtype=float32)
preds > 0.5
array([False,  True, False, ...,  True, False,  True])
pred_labels = np.copy(preds)

选择阈值进行二分类

# Convert the scores in `preds` into hard 0/1 labels, writing in place into
# `pred_labels` (a copy of `preds` made earlier). The two masks are disjoint,
# so the order of the assignments does not matter.
threshold = 0.5
is_negative = preds <= threshold
is_positive = preds > threshold
pred_labels[is_negative] = 0
pred_labels[is_positive] = 1
pred_labels
array([0., 1., 0., ..., 1., 0., 1.], dtype=float32)
# Accuracy: number of predictions equal to the true label, divided by the
# sample count. `labels.shape` is a tuple rather than an int, so the division
# yields a one-element array instead of a scalar (see the printed result).
print(sum(pred_labels == labels) / labels.shape)
[0.97827436]
sum(np.array([True, False, True, False]))
2
bst.save_model('test.model')

dump model:dump_model 把训练好的每棵树以可读的文本形式导出;如果同时提供特征映射文件(featmap),输出中会用特征名称代替特征编号。注意 dump_model 并不是保存模型——导出的文本不能用 Booster() 重新加载,保存模型要用下面的 save_model。

ls demo/data
agaricus.txt.test   agaricus.txt.train
bst.dump_model('dump.raw.txt')
bst.dump_model('dump.nice.txt', 'binary_classification/featmap.txt')
cat dump.nice.txt
booster[0]:
0:[odor=pungent] yes=2,no=1
    1:[stalk-root=cup] yes=4,no=3
        3:leaf=1.71218
        4:leaf=-1.70044
    2:[spore-print-color=orange] yes=6,no=5
        5:leaf=-1.94071
        6:leaf=1.85965
booster[1]:
0:[stalk-root=missing] yes=2,no=1
    1:[odor=pungent] yes=4,no=3
        3:leaf=0.784718
        4:leaf=-0.96853
    2:leaf=-6.23624
cat dump.raw.txt
booster[0]:
0:[f29<-9.53674e-07] yes=1,no=2,missing=1
    1:[f56<-9.53674e-07] yes=3,no=4,missing=3
        3:leaf=1.71218
        4:leaf=-1.70044
    2:[f109<-9.53674e-07] yes=5,no=6,missing=5
        5:leaf=-1.94071
        6:leaf=1.85965
booster[1]:
0:[f60<-9.53674e-07] yes=1,no=2,missing=1
    1:[f29<-9.53674e-07] yes=3,no=4,missing=3
        3:leaf=0.784718
        4:leaf=-0.96853
    2:leaf=-6.23624

对比两份输出可以看到:提供 featmap 之后,dump.nice.txt 中用特征名称(例如 odor=pungent)代替了 dump.raw.txt 中的特征编号(例如 f29)。

把 dmatrix 保存到二进制缓存中

dtest.save_binary('dtest.buffer')

save model

bst.save_model('xgb.boost')

load model

# Round-trip check: reload the saved test matrix and the saved model, predict
# again, and require the new predictions to equal the earlier ones exactly.
dtest2 = xgb.DMatrix('dtest.buffer')
bst2 = xgb.Booster(model_file='xgb.boost')
preds2 = bst2.predict(dtest2)
assert np.abs(preds2 - preds).sum() == 0

使用 scipy.sparse.csr_matrix 构造稀疏矩阵,再用它创建 DMatrix。

scipy.sparse.csr_matrix((data, (row, col))),数据 list 和每个数据对应的行列,很好理解。 理解了这个参数,以及原始数据的格式之后,就可以自己写数据读取的代码了。 可以在 https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html 里面看看例子。

print('\n\n Start running examples of build DMatrix from scipy.sparse CSR Matrix \n')
 Start running examples of build DMatrix from scipy.sparse CSR Matrix 
# Parse the training file by hand into COO-style triplets.
# Each non-empty line has the form "<label> <col>:<value> <col>:<value> ...".
label = []  # one integer label per sample
row = []    # row index of every stored entry
col = []    # column (feature) index of every stored entry
dat = []    # value of every stored entry
# The context manager closes the file handle even if parsing fails; the
# original bare open() left it to the garbage collector.
with open('./demo/data/agaricus.txt.train') as train_file:
    for line in train_file:
        fields = line.split()
        if not fields:
            # Skip blank lines instead of failing on fields[0].
            continue
        # Samples seen so far == index of the current sample, which keeps
        # the row indices aligned with `label` even when lines are skipped.
        row_idx = len(label)
        label.append(int(fields[0]))
        for item in fields[1:]:
            key, value = item.split(':')
            row.append(row_idx)
            col.append(int(key))
            dat.append(float(value))
len(label)
6513
type(label[0])
int
len(row)
143286
len(col)
143286
len(dat)
143286
csr = scipy.sparse.csr_matrix((dat, (row, col)))
type(label[0])
int
# Rebuild the training matrix from the hand-parsed sparse matrix and labels,
# then train again while monitoring the same two datasets as before.
dtrain = xgb.DMatrix(csr, label=label)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
# Keyword arguments: recent XGBoost releases accept `evals` by keyword only,
# and the keyword form works on older releases as well.
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=watchlist)
[0]    eval-error:0.042831    train-error:0.046522
[1]    eval-error:0.021726    train-error:0.022263
print('Start running examples of buid DMatrix from numpy array')
Start running examples of buid DMatrix from numpy array
# Densify the sparse matrix. toarray() returns a plain numpy.ndarray, whereas
# todense() returns a numpy.matrix, a type NumPy discourages for new code and
# which does not match the "numpy array" wording of this example. The values
# are identical either way.
npymat = csr.toarray()
dtrain = xgb.DMatrix(npymat, label=label)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
# Keyword arguments: recent XGBoost releases accept `evals` by keyword only,
# and the keyword form works on older releases as well.
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=watchlist)
[0]    eval-error:0.042831    train-error:0.046522
[1]    eval-error:0.021726    train-error:0.022263

results matching ""

    No results matching ""