本文的目标是把 PyTorch 版 RefineDet 转换到 TensorRT,并且不使用解析器,而是用 TensorRT API 一层一层手动搭建网络来实现。
pytorch refinedet https://github.com/luuuyi/RefineDet.PyTorch
tensorrt https://github.com/wang-xinyu/tensorrtx
tensorrtx这个仓库里面包含了很多流行网络的实现,都是用tensorrt api一步步搭的。
import torch
import torch.nn as nn
import struct
from models.refinedet import build_refinedet
# Export the RefineDet weights to the plain-text ".wts" format consumed by the
# C++ loadWeights() routine:
#   line 1:          <number of blobs>
#   following lines: <name> <element count>  <hex float32> <hex float32> ...
num_classes = 25
path_model = "/data_2/pytorch_refinedet/2021/20210308.pth"
path_save_wts = "./refinedet0312.wts"
input_size = 320

net = build_refinedet('test', input_size, num_classes)  # initialize net
# Load the trained checkpoint; without this the freshly initialised (random)
# weights would be exported.
net.load_state_dict(torch.load(path_model, map_location=torch.device('cpu')))

state = net.state_dict()
with open(path_save_wts, 'w') as f:
    # Header read first by the loader: how many blobs follow.
    f.write('{}\n'.format(len(state)))
    for k, v in state.items():
        vr = v.reshape(-1).cpu().numpy()
        # Each value is written as the big-endian hex of its float32 bit
        # pattern, which the loader reads back with std::hex into uint32_t.
        hex_values = ''.join(
            ' ' + struct.pack('>f', float(vv)).hex() for vv in vr
        )
        f.write('{} {} {}\n'.format(k, len(vr), hex_values))
print("success generate wts!")
// Load a text ".wts" weight file into a name -> Weights map.
//
// Expected layout: the first token is the number of blobs; each blob is
// "<name> <decimal element count> <hex word> <hex word> ...", where every hex
// word is the 32-bit pattern of one value.
//
// The value buffers are allocated with malloc() and are owned by the caller,
// who must free() each `values` pointer once the engine has been built.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. Size the allocation by the element type (sizeof(*val)),
        // not by the pointer type, so exactly `size` 32-bit words are reserved.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
        assert((size == 0 || val != nullptr) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x) {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        wt.count = size;
        weightMap[name] = wt;
    }
    return weightMap;
}
3.tensorrt 网络搭建
def build_refinedet(phase, size=320, num_classes=21):
    """Assemble a RefineDet network from its sub-module builders.

    Args:
        phase: Either "test" or "train".
        size: Input resolution; only 320 and 512 are supported.
        num_classes: Number of classes predicted by the ODM confidence head.

    Returns:
        A ``RefineDet`` instance, or ``None`` (after printing an error
        message) when ``phase`` or ``size`` is not supported.
    """
    if phase not in ("test", "train"):
        print("ERROR: Phase: " + phase + " not recognized")
        # Bail out instead of building a network for an unknown phase.
        return None
    if size not in (320, 512):
        print("ERROR: You specified size " + repr(size) + ". However, " +
              "currently only RefineDet320 and RefineDet512 is supported!")
        # Bail out before the per-size configuration tables are indexed.
        return None

    size_key = str(size)
    base_ = vgg(base[size_key], 3)
    extras_ = add_extras(extras[size_key], size, 1024)
    ARM_ = arm_multibox(base_, extras_, mbox[size_key])
    ODM_ = odm_multibox(base_, extras_, mbox[size_key], num_classes)
    TCB_ = add_tcb(tcb[size_key])
    return RefineDet(phase, size, base_, extras_, ARM_, ODM_, TCB_, num_classes)
class refinedet_my(nn.Module):
    """Flattened, layer-by-layer RefineDet definition.

    Every layer is a direct attribute of the module, so parameter names are
    flat (``conv0.weight``, ``extras0.weight``, ``arm_loc_0.weight``,
    ``tcb0_0.weight`` ...). Do not rename attributes: the names are the
    state-dict keys used when loading a renamed checkpoint.

    ``forward`` currently runs only the backbone up to ``conv33``/``relu34``;
    the extras, ARM, TCB and ODM layers are declared but not yet wired in.
    """

    def __init__(self):
        super(refinedet_my, self).__init__()
        self.num_classes = 25

        # ---- backbone: conv/ReLU pairs separated by 2x2 max-pooling ----
        # C64
        self.conv0 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.relu3 = nn.ReLU(inplace=True)
        self.maxpool4 = nn.MaxPool2d(kernel_size=2, stride=2)
        # C128
        self.conv5 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.relu6 = nn.ReLU(inplace=True)
        self.conv7 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.relu8 = nn.ReLU(inplace=True)
        self.maxpool9 = nn.MaxPool2d(kernel_size=2, stride=2)
        # C256
        self.conv10 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.relu11 = nn.ReLU(inplace=True)
        self.conv12 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.relu13 = nn.ReLU(inplace=True)
        self.conv14 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.relu15 = nn.ReLU(inplace=True)
        # The only pooling layer that rounds the output size up.
        self.maxpool16 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)
        # C512
        self.conv17 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.relu18 = nn.ReLU(inplace=True)
        self.conv19 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.relu20 = nn.ReLU(inplace=True)
        self.conv21 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.relu22 = nn.ReLU(inplace=True)
        self.maxpool23 = nn.MaxPool2d(kernel_size=2, stride=2)
        # C512
        self.conv24 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.relu25 = nn.ReLU(inplace=True)
        self.conv26 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.relu27 = nn.ReLU(inplace=True)
        self.conv28 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.relu29 = nn.ReLU(inplace=True)
        self.maxpool30 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        # Dilated 3x3 conv followed by a 1x1 conv, both 1024 channels wide.
        self.conv31 = nn.Conv2d(512, 1024, kernel_size=3, padding=3, dilation=3)
        self.relu32 = nn.ReLU(inplace=True)
        self.conv33 = nn.Conv2d(1024, 1024, kernel_size=1)
        self.relu34 = nn.ReLU(inplace=True)

        # ---- extra layers appended after the backbone ----
        self.extras0 = nn.Conv2d(1024, 256, kernel_size=1)
        self.relu_e0 = nn.ReLU(inplace=True)
        self.extras1 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)
        self.relu_e1 = nn.ReLU(inplace=True)

        # ---- L2 normalisation applied to the two 512-channel sources ----
        self.conv4_3_L2Norm = L2Norm(512, 10)
        self.conv5_3_L2Norm = L2Norm(512, 8)

        # ---- ARM heads: 12 loc channels / 6 conf channels per source ----
        # (presumably 3 anchors x 4 coords and 3 anchors x 2 classes)
        self.arm_loc_0 = nn.Conv2d(512, 12, kernel_size=3, padding=1)
        self.arm_loc_1 = nn.Conv2d(512, 12, kernel_size=3, padding=1)
        self.arm_loc_2 = nn.Conv2d(1024, 12, kernel_size=3, padding=1)
        self.arm_loc_3 = nn.Conv2d(512, 12, kernel_size=3, padding=1)
        self.arm_conf_0 = nn.Conv2d(512, 6, kernel_size=3, padding=1)
        self.arm_conf_1 = nn.Conv2d(512, 6, kernel_size=3, padding=1)
        self.arm_conf_2 = nn.Conv2d(1024, 6, kernel_size=3, padding=1)
        self.arm_conf_3 = nn.Conv2d(512, 6, kernel_size=3, padding=1)

        # ---- TCB layers (declaration order kept exactly as in the original,
        # since it determines the order of the module's parameters) ----
        self.tcb0_9 = nn.Conv2d(512, 256, 3, padding=1)
        self.tcb0_10 = nn.ReLU(inplace=True)
        self.tcb0_11 = nn.Conv2d(256, 256, 3, padding=1)
        self.tcb0_6 = nn.Conv2d(1024, 256, 3, padding=1)
        self.tcb0_7 = nn.ReLU(inplace=True)
        self.tcb0_8 = nn.Conv2d(256, 256, 3, padding=1)
        self.tcb0_3 = nn.Conv2d(512, 256, 3, padding=1)
        self.tcb0_4 = nn.ReLU(inplace=True)
        self.tcb0_5 = nn.Conv2d(256, 256, 3, padding=1)
        self.tcb0_0 = nn.Conv2d(512, 256, 3, padding=1)
        self.tcb0_1 = nn.ReLU(inplace=True)
        self.tcb0_2 = nn.Conv2d(256, 256, 3, padding=1)
        self.tcb2_0 = nn.ReLU(inplace=True)
        self.tcb2_1 = nn.Conv2d(256, 256, 3, padding=1)
        self.tcb2_2 = nn.ReLU(inplace=True)
        self.tcb2_3 = nn.ReLU(inplace=True)
        self.tcb2_4 = nn.Conv2d(256, 256, 3, padding=1)
        self.tcb2_5 = nn.ReLU(inplace=True)
        self.tcb2_6 = nn.ReLU(inplace=True)
        self.tcb2_7 = nn.Conv2d(256, 256, 3, padding=1)
        self.tcb2_8 = nn.ReLU(inplace=True)
        self.tcb2_9 = nn.ReLU(inplace=True)
        self.tcb2_10 = nn.Conv2d(256, 256, 3, padding=1)
        self.tcb2_11 = nn.ReLU(inplace=True)
        # 2x upsampling (kernel 2, stride 2) transposed convolutions.
        self.tcb1_2 = nn.ConvTranspose2d(256, 256, 2, 2)
        self.tcb1_1 = nn.ConvTranspose2d(256, 256, 2, 2)
        self.tcb1_0 = nn.ConvTranspose2d(256, 256, 2, 2)

        # ---- ODM heads: 12 loc channels / 3 * num_classes conf channels ----
        self.odm_loc_0 = nn.Conv2d(256, 12, kernel_size=3, padding=1)
        self.odm_loc_1 = nn.Conv2d(256, 12, kernel_size=3, padding=1)
        self.odm_loc_2 = nn.Conv2d(256, 12, kernel_size=3, padding=1)
        self.odm_loc_3 = nn.Conv2d(256, 12, kernel_size=3, padding=1)
        self.odm_conf_0 = nn.Conv2d(256, 3 * self.num_classes, kernel_size=3, padding=1)
        self.odm_conf_1 = nn.Conv2d(256, 3 * self.num_classes, kernel_size=3, padding=1)
        self.odm_conf_2 = nn.Conv2d(256, 3 * self.num_classes, kernel_size=3, padding=1)
        self.odm_conf_3 = nn.Conv2d(256, 3 * self.num_classes, kernel_size=3, padding=1)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, inputs):
        """Run the backbone and return the activation after ``relu34``.

        Args:
            inputs: Image batch accepted by ``conv0`` (3 input channels).

        Returns:
            The feature map produced by ``conv33`` + ``relu34``.
        """
        # Follow the device the weights live on instead of forcing CUDA, so
        # the module also works on CPU or on a non-default GPU.
        inputs = inputs.to(self.conv0.weight.device)

        x = self.relu1(self.conv0(inputs))
        x = self.relu3(self.conv2(x))
        x = self.maxpool4(x)

        x = self.relu6(self.conv5(x))
        x = self.relu8(self.conv7(x))
        x = self.maxpool9(x)

        x = self.relu11(self.conv10(x))
        x = self.relu13(self.conv12(x))
        x = self.relu15(self.conv14(x))
        x = self.maxpool16(x)

        x = self.relu18(self.conv17(x))
        x = self.relu20(self.conv19(x))
        x = self.relu22(self.conv21(x))
        # First detection source (s_0). Computed on a copy so the backbone
        # activation is left untouched; not consumed by the return value yet.
        out_conv4_3_L2Norm = self.conv4_3_L2Norm(x.clone())

        x = self.maxpool23(x)
        x = self.relu25(self.conv24(x))
        x = self.relu27(self.conv26(x))
        x = self.relu29(self.conv28(x))
        # Second detection source (s_1); same remark as above.
        out_conv5_3_L2Norm = self.conv5_3_L2Norm(x.clone())

        x = self.maxpool30(x)
        x = self.relu32(self.conv31(x))
        x = self.relu34(self.conv33(x))
        return x
# Read the test image; OpenCV returns an H x W x C array in BGR order, or
# None when the file cannot be read.
raw = cv2.imread(path_img)
if raw is None:
    raise FileNotFoundError("cv2.imread could not read image: {}".format(path_img))
img = raw.astype(np.float32)
img = img[:, :, (2, 1, 0)]  # BGR -> RGB
img = img / 255.0  # scale pixel values to [0, 1]
img_2 = torch.from_numpy(img).permute(2, 0, 1)  # HWC -> CHW
bb0 = img_2.unsqueeze(0)  # add the batch dimension once and reuse it
out = net(bb0)
float data[3 * INPUT_H * INPUT_W];
pr_img = cv::imread(path_img);
for (int i = 0; i < INPUT_H * INPUT_W; i++) {
data[i] = (float)(pr_img.at<cv::Vec3b>(i)[2]) * 1.0 / 255.0;
data[i + INPUT_H * INPUT_W] = (float)(pr_img.at<cv::Vec3b>(i)[1]) * 1.0 / 255.0;
data[i + 2 * INPUT_H * INPUT_W] = (float)(pr_img.at<cv::Vec3b>(i)[0]) * 1.0 / 255.0;
# Deserialize the checkpoint onto the CPU and copy it into `net`.
# strict=False makes load_state_dict skip keys that do not match the module
# instead of raising, so a key-name mismatch goes unnoticed here.
checkpoint = torch.load(path_model, map_location=torch.device('cpu'))
net.load_state_dict(checkpoint, strict=False)
# Instantiate the hand-written network and list its parameter names and
# shapes, to compare them against the key names stored in the checkpoint.
checkpoint = torch.load(path_model, map_location=torch.device('cpu'))
net = refinedet_my()

separator = "=" * 50
print(separator)
# Trainable parameters only, numbered from 0.
for index, (name, param) in enumerate(net.named_parameters()):
    print(str(index) + ':', name, param.size())
print(separator)
# Every state-dict entry (parameters plus any registered buffers).
for k, v in net.state_dict().items():
    print(k, " shape::", v.shape)
print("@" * 50)
import collections

# Load the trained checkpoint once (on CPU) and show the key names it uses.
checkpoint = torch.load(path_model, map_location=torch.device('cpu'))
print("--------load pth name----------------------")
for k, v in checkpoint.items():
    print(k, " shape==", v.shape)

# Rename checkpoint keys to the flat attribute names of the target module:
#   'vgg.<n>.'     -> 'conv<n>.'
#   'extras.<n>.'  -> 'extras<n>.'
#   arm/odm/tcb heads: first '.' -> '_'  (e.g. 'arm_loc.0.' -> 'arm_loc_0.')
new_state_dict = collections.OrderedDict()
for k, v in checkpoint.items():
    name = k.replace('vgg.', 'conv')
    name = name.replace('extras.', 'extras')
    if "arm" in name or "loc" in name or "tcb" in name or "odm" in name:
        name = name.replace(".", "_", 1)
    new_state_dict[name] = v

# strict=False tolerates mismatches, so report them explicitly; otherwise a
# wrong rename would silently leave layers at their random initialisation.
load_result = net.load_state_dict(new_state_dict, strict=False)
if load_result.missing_keys:
    print("WARNING: keys missing from checkpoint:", load_result.missing_keys)
if load_result.unexpected_keys:
    print("WARNING: unexpected keys in checkpoint:", load_result.unexpected_keys)
// Create the engine using only the network-definition API and not any parser.
//
// maxBatchSize: maximum batch size the engine is built for.
// builder/config: TensorRT builder and its configuration (owned by the caller).
// dt: data type of the input tensor.
//
// Returns the built engine, or nullptr when the build fails. The weight
// buffers allocated by loadWeights() are released before returning.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);
    std::map<std::string, Weights> weightMap = loadWeights(path_wts);

    // Backbone. The last convRelu argument is the layer index passed through
    // to convRelu (presumably used to build the weight key — confirm there).
    auto lr0 = convRelu(network, weightMap, *data, 64, 3, 1, 1, 0);
    auto lr1 = convRelu(network, weightMap, *lr0->getOutput(0), 64, 3, 1, 1, 2);
    IPoolingLayer* pool1 = network->addPoolingNd(*lr1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    pool1->setStrideNd(DimsHW{2, 2});

    auto lr2 = convRelu(network, weightMap, *pool1->getOutput(0), 128, 3, 1, 1, 5);
    auto lr3 = convRelu(network, weightMap, *lr2->getOutput(0), 128, 3, 1, 1, 7);
    IPoolingLayer* pool2 = network->addPoolingNd(*lr3->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    pool2->setStrideNd(DimsHW{2, 2});

    auto lr4 = convRelu(network, weightMap, *pool2->getOutput(0), 256, 3, 1, 1, 10);
    auto lr5 = convRelu(network, weightMap, *lr4->getOutput(0), 256, 3, 1, 1, 12);
    auto lr6 = convRelu(network, weightMap, *lr5->getOutput(0), 256, 3, 1, 1, 14);
    // NOTE(review): the PyTorch counterpart of this pooling layer uses
    // ceil_mode=True. The results agree while the incoming feature map has
    // even height/width; other input sizes would need a round-up padding mode.
    IPoolingLayer* pool3 = network->addPoolingNd(*lr6->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    pool3->setStrideNd(DimsHW{2, 2});

    auto lr7 = convRelu(network, weightMap, *pool3->getOutput(0), 512, 3, 1, 1, 17);
    auto lr8 = convRelu(network, weightMap, *lr7->getOutput(0), 512, 3, 1, 1, 19);
    auto lr9 = convRelu(network, weightMap, *lr8->getOutput(0), 512, 3, 1, 1, 21);
    IPoolingLayer* pool4 = network->addPoolingNd(*lr9->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
    pool4->setStrideNd(DimsHW{2, 2});

    auto lr24 = convRelu(network, weightMap, *pool4->getOutput(0), 512, 3, 1, 1, 24);
    auto lr26 = convRelu(network, weightMap, *lr24->getOutput(0), 512, 3, 1, 1, 26);
    auto lr28 = convRelu(network, weightMap, *lr26->getOutput(0), 512, 3, 1, 1, 28);

    // NOTE(review): no tensor is marked as a network output in this function.
    // TensorRT needs at least one (tensor->setName(...) followed by
    // network->markOutput(*tensor)); confirm which layer(s) should be exposed.
    (void) lr28;

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    if (engine != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*) (mem.second.values));
    }
    return engine;
}
// Run one synchronous inference on an engine with exactly one input and one
// output binding.
//
// context:   execution context of the built engine.
// input:     host buffer holding batchSize * 3 * H * W floats.
// output:    host buffer receiving batchSize * OUTPUT_SIZE_2 floats.
// batchSize: number of images in `input`.
//
// Device buffers and the CUDA stream are created per call and released
// before returning.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);  // getNbBindings() = number of inputs plus outputs
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // The output element count is hard-coded and must match the tensor that
    // was marked as the network output.
    const int OUTPUT_SIZE_2 = 1 * 64 * 160 * 160;
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H_refinedet * INPUT_W_refinedet * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE_2 * sizeof(float);

    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CUDA_CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream (it must be created before any async call uses it)
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);  // run inference
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));  // copy the result from GPU to host
    // Block until the copy has finished so `output` is valid on return.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(buffers[inputIndex]));
    CUDA_CHECK(cudaFree(buffers[outputIndex]));
}
[L2Norm 层的 TensorRT API 实现](https://www.cnblogs.com/yanghailin/p/14448829.html)
[指定维度 softmax 层的 TensorRT API 实现](https://www.cnblogs.com/yanghailin/p/14486077.html)
定义网络时可以直接标记多个输出。这样推理时 engine.getNbBindings() 就是 5(1 个输入加上 arm_loc、arm_conf、odm_loc、odm_conf 这 4 个输出),传给 context.enqueue(batchSize, buffers, stream, nullptr) 的 buffers 就是一个含 5 个指针的数组,每个绑定对应一个缓冲区。
// Multi-output variant of doInference: the caller supplies the binding
// buffers and the CUDA stream, and the engine is expected to expose five
// bindings (one input plus the arm_loc, arm_conf, odm_loc and odm_conf outputs).
// NOTE(review): only the first part of this function is visible here; in the
// visible part `start_infer` is never read and `detections` is never written.
void doInference(IExecutionContext& context, void* buffers[], cudaStream_t &stream, float* input, std::vector<std::vector<float>> &detections) {
// Timestamp taken before any inference work starts.
auto start_infer = std::chrono::system_clock::now();
// This variant always processes a single image.
int batchSize = 1;
const ICudaEngine& engine = context.getEngine();
// Pointers to input and output device buffers to pass to engine.
// Engine requires exactly IEngine::getNbBindings() number of buffers.
// std::cout<<"engine.getNbBindings()==="<<engine.getNbBindings()<<std::endl;
assert(engine.getNbBindings() == 5);
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex_arm_loc = engine.getBindingIndex(OUTPUT_BLOB_NAME_arm_loc);
const int outputIndex_arm_conf = engine.getBindingIndex(OUTPUT_BLOB_NAME_arm_conf);
const int outputIndex_odm_loc = engine.getBindingIndex(OUTPUT_BLOB_NAME_odm_loc);
const int outputIndex_odm_conf = engine.getBindingIndex(OUTPUT_BLOB_NAME_odm_conf);
// Debug helpers for inspecting the binding indices:
// const int outputIndex2 = engine.getBindingIndex("prob2");
// printf("inputIndex=%d\n", inputIndex);
// printf("outputIndex_arm_loc=%d\n", outputIndex_arm_loc);
// printf("outputIndex_arm_conf=%d\n", outputIndex_arm_conf);
// printf("outputIndex_odm_loc=%d\n", outputIndex_odm_loc);
// printf("outputIndex_odm_conf=%d\n", outputIndex_odm_conf);
// DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
// NOTE(review): enqueue is issued twice in a row with identical arguments,
// which runs the whole network twice per call — confirm the second call is
// not an accidental duplicate.
context.enqueue(batchSize, buffers, stream, nullptr);
context.enqueue(batchSize, buffers, stream, nullptr);
def forward(self, num_classes, size, bkg_label, top_k, conf_thresh, nms_thresh,
            objectness_thre, keep_top_k, arm_loc_data, arm_conf_data, odm_loc_data, odm_conf_data, prior_data):
    """Decode ARM/ODM predictions into per-class detections and apply NMS.

    Args:
        num_classes: Number of classes, background included.
        size: Input size; selects the variance entry ``cfg[str(size)]``.
        bkg_label: Index of the background class.
        top_k: Maximum number of detections kept per class by NMS.
        conf_thresh: Minimum class score for a box to be considered.
        nms_thresh: Overlap threshold passed to NMS; must be > 0.
        objectness_thre: ARM objectness at or below which a prior is dropped.
        keep_top_k: Rank cut-off used in the final filtering step.
        arm_loc_data: (tensor) ARM box regressions,
            e.g. torch.Size([1, 6375, 4]).
        arm_conf_data: (tensor) ARM objectness scores,
            e.g. torch.Size([1, 6375, 2]).
        odm_loc_data: (tensor) ODM box regressions,
            e.g. torch.Size([1, 6375, 4]).
        odm_conf_data: (tensor) ODM class scores,
            e.g. torch.Size([1, 6375, 25]). Modified in place: scores of
            priors rejected by the ARM are set to 0.
        prior_data: (tensor) prior boxes, e.g. torch.Size([6375, 4]).

    Returns:
        Tensor of shape [batch, num_classes, top_k, 5]; each row is
        (score, 4 box coordinates) and unused rows stay zero.

    Raises:
        ValueError: If ``nms_thresh`` is not positive.
    """
    self.num_classes = num_classes
    self.background_label = bkg_label
    self.top_k = top_k
    self.keep_top_k = keep_top_k
    # Parameters used in nms.
    self.nms_thresh = nms_thresh
    if nms_thresh <= 0:
        raise ValueError('nms_threshold must be non negative.')
    self.conf_thresh = conf_thresh  # e.g. 0.01
    self.objectness_thre = objectness_thre  # e.g. 0.01
    self.variance = cfg[str(size)]['variance']

    loc_data = odm_loc_data  # [1, 6375, 4]
    conf_data = odm_conf_data  # [1, 6375, 25]

    # Keep only the "object" column of the ARM scores:
    # [1, 6375, 2] -> [1, 6375, 1]
    arm_object_conf = arm_conf_data.data[:, :, 1:]
    # [1, 6375, 1] mask of priors the ARM considers background
    no_object_index = arm_object_conf <= self.objectness_thre
    # Zero every class score of those priors, [1, 6375, 25]
    conf_data[no_object_index.expand_as(conf_data)] = 0

    num = loc_data.size(0)  # batch size
    num_priors = prior_data.size(0)  # 6375
    output = torch.zeros(num, self.num_classes, self.top_k, 5)  # [1, 25, 1000, 5]
    conf_preds = conf_data.view(num, num_priors,
                                self.num_classes).transpose(2, 1)

    # Decode predictions into bboxes.
    for i in range(num):
        # Stage 1: refine the priors with the ARM regressions, then convert
        # the result with center_size() so it can serve as the priors of
        # stage 2. All tensors are [6375, 4].
        default = decode(arm_loc_data[i], prior_data, self.variance)
        default = center_size(default)
        # Stage 2: apply the ODM regressions to the refined priors.
        decoded_boxes = decode(loc_data[i], default, self.variance)
        # [25, 6375] For each class, perform nms
        conf_scores = conf_preds[i].clone()
        # Class 0 is skipped by the range below.
        for cl in range(1, self.num_classes):
            c_mask = conf_scores[cl].gt(self.conf_thresh)
            scores = conf_scores[cl][c_mask]
            if scores.size(0) == 0:
                # Nothing above the confidence threshold for this class.
                continue
            l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes)
            boxes = decoded_boxes[l_mask].view(-1, 4)
            # equivalently: boxes = decoded_boxes[c_mask].view(-1, 4)
            # idx of highest scoring and non-overlapping boxes per class
            ids, count = nms(boxes, scores, self.nms_thresh, self.top_k)
            # Row layout: score in column 0, box coordinates in columns 1-4.
            output[i, cl, :count] = \
                torch.cat((scores[ids[:count]].unsqueeze(1),
                           boxes[ids[:count]]), 1)
    flt = output.contiguous().view(num, -1, 5)
    _, idx = flt[:, :, 0].sort(1, descending=True)
    _, rank = idx.sort(1)
    # NOTE(review): indexing with a boolean mask returns a copy, so this
    # fill_ does not modify `output`; the statement is kept unchanged to
    # preserve existing results.
    flt[(rank < self.keep_top_k).unsqueeze(-1).expand_as(flt)].fill_(0)
    return output
assert(engine.getNbBindings() == 5);
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex_arm_loc = engine.getBindingIndex(OUTPUT_BLOB_NAME_arm_loc);
const int outputIndex_arm_conf = engine.getBindingIndex(OUTPUT_BLOB_NAME_arm_conf);
const int outputIndex_odm_loc = engine.getBindingIndex(OUTPUT_BLOB_NAME_odm_loc);
const int outputIndex_odm_conf = engine.getBindingIndex(OUTPUT_BLOB_NAME_odm_conf);
// const int outputIndex2 = engine.getBindingIndex("prob2");
// printf("inputIndex=%d
// printf("outputIndex_arm_loc=%d
// printf("outputIndex_arm_conf=%d
// printf("outputIndex_odm_loc=%d
// printf("outputIndex_odm_conf=%d
// DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
int m_prior_size = 6375;
torch::Tensor arm_loc = torch::from_blob(buffers[outputIndex_arm_loc],{m_prior_size,4}).cuda().toType(torch::kFloat64).unsqueeze(0);
torch::Tensor arm_conf = torch::from_blob(buffers[outputIndex_arm_conf],{m_prior_size,2}).cuda().toType(torch::kFloat64).unsqueeze(0);
torch::Tensor odm_loc = torch::from_blob(buffers[outputIndex_odm_loc],{m_prior_size,4}).cuda().toType(torch::kFloat64).unsqueeze(0);
torch::Tensor odm_conf = torch::from_blob(buffers[outputIndex_odm_conf],{m_prior_size,25}).cuda().toType(torch::kFloat64).unsqueeze(0);
厉害了,试了这里可以访问torch::Tensor arm_loc里面数据,并且是对的!