void
ModelInferHandler::Execute(InferHandler::State* state)
{
  // Maintain shared pointers (read-only reference) to the shared memory
  // block's information for the shared memory regions used by the request.
  // These pointers will automatically increase the usage count, preventing
  // unregistration of the shared memory. This vector must be cleared in the
  // `InferResponseComplete` callback (after inference) to decrease the count
  // and permit unregistration. The vector will be included in
  // `response_release_payload` for the callback.
  std::vector<std::shared_ptr<const SharedMemoryManager::SharedMemoryInfo>>
      shm_regions_info;

  if (err == nullptr) {
    err = InferGRPCToInput(
        tritonserver_, shm_manager_, request, &serialized_data, irequest,
        &shm_regions_info);
  }
  if (err == nullptr) {
    err = InferAllocatorPayload<inference::ModelInferResponse>(
        tritonserver_, shm_manager_, request, std::move(serialized_data),
        response_queue, &state->alloc_payload_, &shm_regions_info);
  }
}
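// Illustrative sketch (not Triton code) of the pinning pattern used above:
// each in-flight request holds a shared_ptr to the region's info, and a
// hypothetical registry refuses to unregister a region while any such
// reference is still alive. Clearing the vector in the completion callback
// releases the reference and makes unregistration possible again.

#include <map>
#include <memory>
#include <string>

struct RegionInfo {
  std::string name;
};

class RegionRegistry {
 public:
  void Register(const std::string& name)
  {
    regions_[name] = std::make_shared<const RegionInfo>(RegionInfo{name});
  }

  // A request calls this and keeps the returned shared_ptr until it completes.
  std::shared_ptr<const RegionInfo> Acquire(const std::string& name)
  {
    auto it = regions_.find(name);
    return (it == regions_.end()) ? nullptr : it->second;
  }

  // Fails while any request still holds a reference to the region.
  bool Unregister(const std::string& name)
  {
    auto it = regions_.find(name);
    if ((it == regions_.end()) || (it->second.use_count() > 1)) {
      return false;
    }
    regions_.erase(it);
    return true;
  }

 private:
  std::map<std::string, std::shared_ptr<const RegionInfo>> regions_;
};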
void
TritonModelInstance::Execute(
    std::vector<TRITONBACKEND_Request*>& triton_requests)
{
  TRITONBACKEND_ModelInstance* triton_model_instance =
      reinterpret_cast<TRITONBACKEND_ModelInstance*>(this);
  TritonBackend::TritonModelInstanceExecFn_t inst_exec_fn =
      model_->Backend()->ModelInstanceExecFn();

  // If there is an error then we retain ownership of 'requests'
  // and must send error responses.
  TRITONSERVER_Error* err = inst_exec_fn(
      triton_model_instance, &triton_requests[0], triton_requests.size());
  if (err != nullptr) {
    Status status = Status(
        TritonCodeToStatusCode(TRITONSERVER_ErrorCode(err)),
        TRITONSERVER_ErrorMessage(err));
    for (TRITONBACKEND_Request* tr : triton_requests) {
      std::unique_ptr<InferenceRequest> ur(
          reinterpret_cast<InferenceRequest*>(tr));
      InferenceRequest::RespondIfError(
          ur, status, true /* release_requests */);
    }
    TRITONSERVER_ErrorDelete(err);
  }
}
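// Sketch of the backend side of this ownership contract (simplified and
// illustrative, not a complete backend): `inst_exec_fn` resolves to the
// backend's TRITONBACKEND_ModelInstanceExecute. If that function returns an
// error before taking ownership of the requests, the caller above keeps
// ownership and sends the error responses itself.

#include "triton/core/tritonbackend.h"

extern "C" TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceExecute(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
    const uint32_t request_count)
{
  // Hypothetical precondition check: fail without taking ownership of
  // 'requests', so TritonModelInstance::Execute responds with the error.
  if (request_count == 0) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG, "no requests provided");
  }

  // ... normal path: take ownership of the requests, create and send
  // responses, then release each request ...
  return nullptr;  // success
}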
// Create the allocator that will be used to allocate buffers for
// the result tensors.
FAIL_IF_ERR(
    TRITONSERVER_ResponseAllocatorNew(
        &allocator_, InferResponseAlloc, InferResponseFree,
        InferResponseStart),
    "creating inference response allocator");
FAIL_IF_ERR(
    TRITONSERVER_ResponseAllocatorSetQueryFunction(
        allocator_, OutputBufferQuery),
    "setting allocator's query function");
FAIL_IF_ERR(
    TRITONSERVER_ResponseAllocatorSetBufferAttributesFunction(
        allocator_, OutputBufferAttributes),
    "setting allocator's output buffer attributes function");
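// The callbacks registered above must match the response-allocator typedefs
// in tritonserver.h. A minimal, CPU-only skeleton of an allocation callback
// (roughly the role InferResponseAlloc plays; simplified and illustrative,
// not the actual gRPC frontend implementation):

#include <cstdlib>

#include "triton/core/tritonserver.h"

static TRITONSERVER_Error*
ExampleResponseAlloc(
    TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
    size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type,
    int64_t preferred_memory_type_id, void* userp, void** buffer,
    void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
    int64_t* actual_memory_type_id)
{
  // Always satisfy the request from CPU memory, regardless of the preferred
  // memory type reported by the server.
  *actual_memory_type = TRITONSERVER_MEMORY_CPU;
  *actual_memory_type_id = 0;
  *buffer_userp = nullptr;

  *buffer = (byte_size == 0) ? nullptr : std::malloc(byte_size);
  if ((byte_size != 0) && (*buffer == nullptr)) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INTERNAL, "failed to allocate output buffer");
  }
  return nullptr;  // success
}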
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputBufferProperties(
    TRITONBACKEND_Request* request, const char* name, size_t* byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  auto status =
      tr->OutputBufferProperties(name, byte_size, memory_type, memory_type_id);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferAttributes(
    TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
    TRITONSERVER_BufferAttributes** buffer_attributes);

/// Get the memory type field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type Returns the memory type associated with the buffer
/// attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesMemoryType(
    TRITONSERVER_BufferAttributes* buffer_attributes,
    TRITONSERVER_MemoryType* memory_type);

/// Get the CudaIpcHandle field of the buffer attributes object.
///
/// \param buffer_attributes The buffer attributes object.
/// \param cuda_ipc_handle Returns the CUDA IPC handle associated with the
/// buffer attributes object. If the cudaIpcHandle does not exist for the
/// buffer, nullptr will be returned.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesCudaIpcHandle(
    TRITONSERVER_BufferAttributes* buffer_attributes, void** cuda_ipc_handle);

/// Get the byte size field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param byte_size Returns the byte size associated with the buffer
/// attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesByteSize(
    TRITONSERVER_BufferAttributes* buffer_attributes, size_t* byte_size);
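// A minimal sketch of how a backend might consume the declarations above,
// assuming a TRITONBACKEND_Input* named `input` obtained elsewhere (e.g. via
// TRITONBACKEND_RequestInput) and the same RETURN_IF_ERROR macro used in the
// excerpts below; error handling and surrounding setup are abbreviated:

const void* buffer = nullptr;
TRITONSERVER_BufferAttributes* attrs = nullptr;
RETURN_IF_ERROR(TRITONBACKEND_InputBufferAttributes(
    input, 0 /* buffer index */, &buffer, &attrs));

TRITONSERVER_MemoryType memory_type;
size_t byte_size;
RETURN_IF_ERROR(TRITONSERVER_BufferAttributesMemoryType(attrs, &memory_type));
RETURN_IF_ERROR(TRITONSERVER_BufferAttributesByteSize(attrs, &byte_size));

// For GPU buffers, the CUDA IPC handle (if one was provided with the input)
// is retrieved the same way; GetInputTensor below is a real caller.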
TRITONSERVER_Error*
ModelInstanceState::GetInputTensor(
    const uint32_t input_idx, std::shared_ptr<PbTensor>& input_tensor,
    TRITONBACKEND_Request* request,
    std::shared_ptr<std::vector<TRITONBACKEND_Response*>>& responses)
{
  TRITONSERVER_BufferAttributes* buffer_attributes;

  // This value is not used.
  const void* buffer_p;
  RETURN_IF_ERROR(TRITONBACKEND_InputBufferAttributes(
      in, 0, &buffer_p, &buffer_attributes));

  input_tensor = std::make_shared<PbTensor>(
      std::string(input_name),
      std::vector<int64_t>(input_shape, input_shape + input_dims_count),
      input_dtype, src_memory_type, src_memory_type_id,
      const_cast<void*>(buffer), input_byte_size,
      nullptr /* DLManagedTensor */);

  cudaIpcMemHandle_t* cuda_ipc_handle;
  RETURN_IF_ERROR(TRITONSERVER_BufferAttributesCudaIpcHandle(
      buffer_attributes, reinterpret_cast<void**>(&cuda_ipc_handle)));
  if (cuda_ipc_handle != nullptr) {
    RETURN_IF_EXCEPTION(
        input_tensor->SaveToSharedMemory(
            Stub()->ShmPool(), false /* copy_gpu */));
    RETURN_IF_EXCEPTION(
        input_tensor->Memory()->SetCudaIpcHandle(cuda_ipc_handle));
  } else {
    RETURN_IF_EXCEPTION(
        input_tensor->SaveToSharedMemory(
            Stub()->ShmPool(), true /* copy_gpu */));
  }
}