I have a C++ client to benchmark a gRPC server.
4 poller threads are running concurrently on the client.
Each thread runs 100 asynchronous requests concurrently, using its own channel and stub, and the per-thread RPS figures are then aggregated.
When using this code, I achieve 90K RPS, but when using ghz, I achieve 150K RPS when using the same count of threads and connections (--connections=4 --cpus=4).
struct RPC
{
MessageEcho response;
grpc::ClientContext context;
std::unique_ptr<grpc::ClientAsyncResponseReader<MessageEcho>> reader;
grpc::Status status;
};
void run_client_thread(const Config& config, ThreadRes& res, int n_concurrents)
{
grpc::CompletionQueue cq;
auto enqueue_new_rpc = [&](RPC* rpc) {
rpc = new(rpc) RPC{};
rpc->reader = stub.AsyncEcho(&rpc->context, res.payload, &cq);
rpc->reader->Finish(&rpc->response, &rpc->status, rpc);
};
for(int i = 0; i < n_concurrents; i++)
{
enqueue_new_rpc(static_cast<RPC*>(::operator new(sizeof(RPC))));
}
const Timespec start = Timespec::now();
while(keeprunning)
{
void* got_tag;
bool ok = false;
auto deadline = std::chrono::system_clock::now() + std::chrono::milliseconds(300);
const grpc::CompletionQueue::NextStatus status = cq.AsyncNext(&got_tag, &ok, deadline);
if(status == grpc::CompletionQueue::GOT_EVENT && ok)
{
RPC* const rpc = static_cast<RPC*>(got_tag);
if(rpc->status.ok())
{
res.n_success++;
}
else
{
res.n_fails++;
}
rpc->~RPC();
enqueue_new_rpc(rpc);
}
else if(status == grpc::CompletionQueue::TIMEOUT)
{
res.n_timeouts++;
}
else
{
res.n_fails++;
}
}
const Timespec stop = Timespec::now();
const double elapsed = stop.difftime_s(start);
res.requests_per_sec = static_cast<double>(res.n_success) / elapsed;
cq.Shutdown();
}
Is there any obvious performance issue in this code?
Additional notes:
- When profiling, I see that the CPU is under-utilized compared to ghz
- How the channels are distributed (each thread having its own channel, or all threads sharing one channel/stub) does not affect performance.