I am trying to train an XGBoost ranker model on Ray, reading the training data with BigQueryDatasource (the data can be large).
The cluster is set up correctly: other XGBoost models (such as logistic regression) train on it without problems. I use xgboost_ray.RayDMatrix as the input because the ranker model requires an extra qid column, and RayDMatrix is the only way I have found to pass that information in. However, the following code fails with an error that is hard to understand:
# Train an XGBoost ranking model on a Ray cluster, reading the training data
# from BigQuery via vertex_ray's BigQueryDatasource.
import ray
from vertex_ray import BigQueryDatasource
from xgboost_ray import RayDMatrix, RayParams, train

train_dataset = ray.data.read_datasource(
    BigQueryDatasource(),
    query="SELECT * FROM my_proj.my_dataset.my_table",
)
# NOTE(review): fully_executed() is deprecated in newer Ray releases in favor
# of materialize() -- confirm against the Ray version installed on the cluster.
train_dataset.fully_executed()

# The ranking objective needs a query id per row; it is read from the "qid"
# column, and the label from the "y" column.
# RayDMatrix is referenced through the name imported above (the package name
# `xgboost_ray` itself is never imported in this script).
ray_dmatrix = RayDMatrix(train_dataset, label="y", qid="qid")

# NOTE(review): with Ray Client the returned Booster is unpickled on the
# client side; presumably the client's xgboost version must match the one on
# the cluster for that to succeed -- confirm.
bst = train(
    {
        "objective": "rank:ndcg",
    },
    ray_dmatrix,
    ray_params=RayParams(
        num_actors=2,  # Number of remote actors
        cpus_per_actor=1,
    ),
)
bst.save_model("model.xgb")
(_wrapped pid=909) 2024-02-27 01:49:36,502 INFO main.py:1126 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.
(_wrapped pid=909) 2024-02-27 01:49:41,995 INFO main.py:1177 -- [RayXGBoost] Starting XGBoost training.
(_RemoteRayXGBoostActor pid=229, ip=10.16.0.164) [01:49:42] task [xgboost.ray]:137063672512960 got new rank 1
(_RemoteRayXGBoostActor pid=228, ip=10.16.0.164) [01:49:42] task [xgboost.ray]:135302736939760 got new rank 0
(_wrapped pid=909) 2024-02-27 01:49:44,679 INFO main.py:1694 -- [RayXGBoost] Finished XGBoost training on training data with total N=100 in 8.25 seconds (2.67 pure XGBoost training time).
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-22-2d4c36d612fd> in <cell line: 1>()
----> 1 bst = train(
2 {
3 "objective": "rank:ndcg",
4 },
5 ray_dmatrix,
7 frames
~/.local/lib/python3.10/site-packages/xgboost_ray/main.py in train(params, dtrain, num_boost_round, evals, evals_result, additional_results, ray_params, _remote, *args, **kwargs)
1408 _wrapped = force_on_current_node(_wrapped)
1409
-> 1410 bst, train_evals_result, train_additional_results = ray.get(
1411 _wrapped.remote(
1412 params,
~/.local/lib/python3.10/site-packages/ray/_private/client_mode_hook.py in wrapper(*args, **kwargs)
102 # we only convert init function if RAY_CLIENT_MODE=1
103 if func.__name__ != "init" or is_client_mode_enabled_by_default:
--> 104 return getattr(ray, func.__name__)(*args, **kwargs)
105 return func(*args, **kwargs)
106
~/.local/lib/python3.10/site-packages/ray/util/client/api.py in get(self, vals, timeout)
40 timeout: Optional timeout in milliseconds
41 """
---> 42 return self.worker.get(vals, timeout=timeout)
43
44 def put(self, *args, **kwargs):
~/.local/lib/python3.10/site-packages/ray/util/client/worker.py in get(self, vals, timeout)
432 op_timeout = max_blocking_operation_time
433 try:
--> 434 res = self._get(to_get, op_timeout)
435 break
436 except GetTimeoutError:
~/.local/lib/python3.10/site-packages/ray/util/client/worker.py in _get(self, ref, timeout)
476 except grpc.RpcError as e:
477 raise decode_exception(e)
--> 478 return loads_from_server(data)
479
480 def put(
~/.local/lib/python3.10/site-packages/ray/util/client/client_pickler.py in loads_from_server(data, fix_imports, encoding, errors)
176 return ServerUnpickler(
177 file, fix_imports=fix_imports, encoding=encoding, errors=errors
--> 178 ).load()
179
180
~/.local/lib/python3.10/site-packages/xgboost/core.py in __setstate__(self, state)
1679 length = c_bst_ulong(len(buf))
1680 ptr = (ctypes.c_char * len(buf)).from_buffer(buf)
-> 1681 _check_call(
1682 _LIB.XGBoosterUnserializeFromBuffer(handle, ptr, length))
1683 state['handle'] = handle
~/.local/lib/python3.10/site-packages/xgboost/core.py in _check_call(ret)
277 """
278 if ret != 0:
--> 279 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
280
281
XGBoostError: [01:49:45] ../include/xgboost/json.h:81: Invalid cast, from Null to Object
Stack trace:
[bt] (0) /root/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x139553) [0x7fed1bd39553]
[bt] (1) /root/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x1638f5) [0x7fed1bd638f5]
[bt] (2) /root/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x66ab91) [0x7fed1c26ab91]
[bt] (3) /root/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x66b665) [0x7fed1c26b665]
[bt] (4) /root/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x2e8ace) [0x7fed1bee8ace]
[bt] (5) /root/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x2f1fc1) [0x7fed1bef1fc1]
[bt] (6) /root/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGBoosterUnserializeFromBuffer+0x65) [0x7fed1bd3da55]
[bt] (7) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x7fedd78f4e2e]
[bt] (8) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x7fedd78f1493]
A similar piece of code, run on similar data, works with local xgboost and xgb.DMatrix:
# Local (non-distributed) equivalent: build a DMatrix with query ids and train
# with the same ranking objective.
from xgboost import train, DMatrix

# X, y and qid are expected to be defined earlier (features, labels, query ids).
dm = DMatrix(X, label=y, qid=qid)

params = {"objective": "rank:ndcg"}
bst = train(params, dtrain=dm)
Does anyone know why this happens, and what the right way to do this is?