// repo-core-src/src/backend_model.cc
Status
TritonModel::SetConfiguredScheduler()
{
  std::unique_ptr<Scheduler> scheduler;

  // Need to enforce equal shape batches (i.e. non-ragged batches) if
  // the model 1) allows one or more variable-size input tensors that
  // are not marked as 'allow_ragged_batch' or 2) has one or more
  // shape-tensor inputs. This is not needed if all input shapes are
  // non-variable and if there are no shape tensors... so we don't
  // enable it in that case for efficiency reasons.
  std::unordered_map<std::string, bool> enforce_equal_shape_tensors;
  for (const auto input : config_.input()) {
    // https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/model_configuration.html#shape-tensors
    // Currently, only TensorRT supports shape tensors.
    if (input.is_shape_tensor()) {
      enforce_equal_shape_tensors.insert({input.name(), true});
    } else if (
        !input.allow_ragged_batch() &&
        (triton::common::GetElementCount(input) == -1)) {
      enforce_equal_shape_tensors.insert({input.name(), false});
    }
  }

  // ......
  RETURN_IF_ERROR(DynamicBatchScheduler::Create(
      this, nullptr, 0 /*nice*/, true /* dynamic_batching_enabled */,
      config_.max_batch_size(), enforce_equal_shape_tensors,
      config_.dynamic_batching(),
      config_.response_cache().enable() /* response_cache_enable */,
      &scheduler));
}
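For concreteness, here is a minimal config.pbtxt sketch (the input names are hypothetical) showing how the loop above classifies inputs: a shape-tensor input is inserted with the value true, a variable-size input that is not marked allow_ragged_batch is inserted with the value false, and a ragged input is not added to enforce_equal_shape_tensors at all.

# Hypothetical config.pbtxt fragment illustrating the three cases handled above.
max_batch_size: 8
input [
  {
    name: "SHAPE_INPUT"       # is_shape_tensor -> enforce_equal_shape_tensors["SHAPE_INPUT"] = true
    data_type: TYPE_INT32
    dims: [ 1 ]
    is_shape_tensor: true
  },
  {
    name: "VAR_INPUT"         # variable dims, not ragged -> enforce_equal_shape_tensors["VAR_INPUT"] = false
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "RAGGED_INPUT"      # allow_ragged_batch -> not inserted into the map
    data_type: TYPE_FP32
    dims: [ -1 ]
    allow_ragged_batch: true
  }
]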
// _deps/repo-core-src/src/dynamic_batch_scheduler.cc
uint64_t
DynamicBatchScheduler::GetDynamicBatch()
{
  // When there is optional input or input shape must be enforced,
  // the inputs in the requests must be examined for forming a batch
  const bool check_input =
      !enforce_equal_shape_tensors_.empty() || has_optional_input_;

  // If there is no pending batch, then this request is starting a
  // new batch.
  if ((payload_batch_size + queue_.PendingBatchCount()) == 0) {
    // Get the shape of the new batch that is being started...
    if (check_input) {
      if (!curr_payload_->MutableRequiredEqualInputs()
               ->Initialize(
                   queue_.RequestAtCursor(), enforce_equal_shape_tensors_,
                   has_optional_input_)
               .IsOk()) {
        send_now = true;
        break;
      }
    }
  } else {
    // There is a pending batch and adding this request would make
    // the batch size larger than all of the preferred batch sizes,
    // so mark the cursor at this point. Not sending the pending batch so
    // that we can examine the queue delay of requests that fit in a batch.
    if (((payload_batch_size + pending_batch_size_ + batch_size) >
         max_preferred_batch_size_) &&
        (best_preferred_batch_size == 0)) {
      best_preferred_batch_size = pending_batch_size_;
      queue_.MarkCursor();
      payload_saturated_ = true;
    }
    if ((payload_batch_size + pending_batch_size_ + batch_size) >
        max_batch_size_) {
      send_now = true;
      break;
    }

    // There is a pending batch and it has a different shape than
    // this request, so send the pending batch as it is.
    if (check_input &&
        !curr_payload_->MutableRequiredEqualInputs()->HasEqualInputs(
            queue_.RequestAtCursor())) {
      curr_payload_->MarkSaturated();
      send_now = true;
      break;
    }
  }

  // Send immediately.
  // If the delay has been exceeded, or if the current batch can't grow
  // any larger then just immediately execute whatever is pending.
  if (send_now || ((payload_batch_size + pending_batch_size_) >=
                   max_preferred_batch_size_)) {
    payload_saturated_ = true;
    return 0;
  }
}
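The max_preferred_batch_size_ and queue-delay thresholds used above come from the model's dynamic_batching settings. A hedged sketch of such a configuration (the values are purely illustrative, not recommendations):

dynamic_batching {
  # The largest preferred size feeds max_preferred_batch_size_ in GetDynamicBatch().
  preferred_batch_size: [ 4, 8 ]
  # Upper bound on how long a pending batch may wait before being sent anyway.
  max_queue_delay_microseconds: 100
}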
bool
RequiredEqualInputs::HasEqualInputs(
    const std::unique_ptr<InferenceRequest>& request)
{
  // ......
  const auto& d1 = itr->second.first->Data();
  const auto& d2 = input->Data();

  // For now being conservative and assuming that content
  // comparison is for shape tensors which are likely to always
  // be in a single buffer.
  if ((d1->BufferCount() != 1) || (d2->BufferCount() != 1)) {
    return false;
  }

  size_t d1_byte_size, d2_byte_size;
  TRITONSERVER_MemoryType d1_memory_type, d2_memory_type;
  int64_t d1_memory_id, d2_memory_id;
  const char* d1_buffer = d1->BufferAt(
      0 /* idx */, &d1_byte_size, &d1_memory_type, &d1_memory_id);
  const char* d2_buffer = d2->BufferAt(
      0 /* idx */, &d2_byte_size, &d2_memory_type, &d2_memory_id);

  // Tensor must be same size and in CPU memory so that it
  // can be easily compared. If not, return false conservatively.
  if ((d1_byte_size != d2_byte_size) || (d1_buffer == nullptr) ||
      (d2_buffer == nullptr) || (d1_memory_type == TRITONSERVER_MEMORY_GPU) ||
      (d2_memory_type == TRITONSERVER_MEMORY_GPU)) {
    return false;
  }

  if (strncmp(d1_buffer, d2_buffer, d1_byte_size) != 0) {
    return false;
  }
}
Backends such as the ONNX Runtime, TensorFlow, PyTorch, and TensorRT backends require models to accept ragged inputs as 1-dimensional tensors; the backend concatenates the inputs of the batched requests into that single 1-dimensional tensor. Because the concatenated input does not track the start and end index of each request, the model usually needs additional input(s), called batch inputs, that describe various properties of the batch that was formed.
https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/ragged_batching.html
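As a hedged illustration of the guide above (tensor names are hypothetical), a ragged input is declared with allow_ragged_batch, and a batch_input such as BATCH_ACCUMULATED_ELEMENT_COUNT gives the model the per-request offsets it needs inside the concatenated tensor:

# Hypothetical config.pbtxt fragment: one ragged input plus a batch input.
max_batch_size: 16
input [
  {
    name: "RAGGED_INPUT"
    data_type: TYPE_FP32
    dims: [ -1 ]
    # Requests of different lengths are accepted and concatenated by the backend.
    allow_ragged_batch: true
  }
]
batch_input [
  {
    # Accumulated element count lets the model recover each request's start/end index.
    kind: BATCH_ACCUMULATED_ELEMENT_COUNT
    target_name: "INDEX"
    data_type: TYPE_FP32
    source_input: "RAGGED_INPUT"
  }
]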