Comments (8)
我把解码部分用CUDA重写,并行解码检测框,现在在 Jetson NX 上解码时间可以缩短到 5ms
from yolo-tensorrt.
from yolo-tensorrt.
文件名:decodeTensorCUDA.cu
代码:
#include "decodeTensorCUDA.h"

#include <cuda_runtime.h>

#include <cstdint>
#include <cstdio>
#include <cstdlib>
// Decode one grid cell of a YOLO output tensor into candidate boxes.
// Launch layout: 2D grid of 2D blocks covering (grid_w, grid_h); each thread
// handles one grid cell and loops over its numBBoxes anchors.
// Output layout: boxes[(6 * numBBoxes) * cell + 6 * b + {0..5}] =
//   { bx, by, bw, bh, objectness * maxClassProb, classIndex }.
// NOTE(review): assumes the network already applied the output activations
// (as the upstream YOLOv5 export does) — confirm against the engine builder.
__global__ void decodeTensorKernel(
    const float* __restrict__ detections, const uint32_t* __restrict__ masks,
    const float* __restrict__ anchors, float* __restrict__ boxes,
    uint32_t grid_h, uint32_t grid_w, uint32_t numClasses, uint32_t numBBoxes)
{
    // Global 2D index of this thread's grid cell; guard the grid tail.
    const uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
    const uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
    if (y >= grid_h || x >= grid_w) return;

    const uint32_t numGridCells = grid_h * grid_w;
    const uint32_t bbindex = y * grid_w + x;
    const uint32_t bbDim = 5 + numClasses;    // x, y, w, h, objectness + classes
    const uint32_t outStride = 6 * numBBoxes; // was hard-coded 18 (only valid for numBBoxes == 3)

    for (uint32_t b = 0; b < numBBoxes; ++b)
    {
        // Anchor (prior) width/height selected by this scale's mask.
        const float pw = anchors[masks[b] * 2];
        const float ph = anchors[masks[b] * 2 + 1];

        // Channel-major layout: channel c of box b for this cell lives at
        // detections[bbindex + numGridCells * (b * bbDim + c)].
        const float* det = detections + bbindex + numGridCells * (b * bbDim);
        float* out = boxes + outStride * bbindex + 6 * b;

        out[0] = x + det[numGridCells * 0]; // box center x (grid units)
        out[1] = y + det[numGridCells * 1]; // box center y (grid units)
        out[2] = pw * det[numGridCells * 2]; // box width
        out[3] = ph * det[numGridCells * 3]; // box height

        const float objectness = det[numGridCells * 4];

        // Arg-max over the per-class probabilities.
        float maxProb = 0.0f;
        int maxIndex = -1;
        for (uint32_t i = 0; i < numClasses; ++i)
        {
            const float prob = det[numGridCells * (5 + i)];
            if (prob > maxProb)
            {
                maxProb = prob;
                maxIndex = (int) i;
            }
        }
        out[4] = objectness * maxProb; // combined confidence
        out[5] = (float) maxIndex;     // winning class id
    }
}
// Decode one image's YOLO output tensor on the GPU.
// Returns a malloc'ed buffer of 6 floats (x, y, w, h, maxProb, maxIndex) per
// box — 6 * grid_h * grid_w * numBBoxes floats total. The CALLER owns the
// buffer and must free() it. Returns NULL if the host allocation fails.
float* decodeTensorCUDA(const int imageIdx, const TensorInfo& tensor)
{
    // Report (but do not abort on) CUDA errors so callers still get a buffer.
    auto check = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess)
            fprintf(stderr, "decodeTensorCUDA: %s: %s\n", what, cudaGetErrorString(err));
    };

    const uint32_t grid_h = tensor.grid_h;
    const uint32_t grid_w = tensor.grid_w;
    const uint32_t numClasses = tensor.numClasses;
    const uint32_t numBBoxes = tensor.numBBoxes;

    // Host output buffer: 6 elements per box (x, y, w, h, maxProb, maxIndex).
    const size_t boxes_bytes = 6 * sizeof(float) * grid_h * grid_w * numBBoxes;
    float* boxes = (float*) malloc(boxes_bytes); // ownership passes to the caller
    if (boxes == NULL) return NULL;

    const float* detections = &tensor.hostBuffer[imageIdx * tensor.volume];

    // Device buffers. NOTE: sizeof(pointer) would only give the pointer's own
    // size — always compute element-count * sizeof(element).
    const size_t det_bytes = sizeof(float) * grid_h * grid_w * (5 + numClasses) * numBBoxes;
    float* d_detections = NULL;
    check(cudaMalloc((void**) &d_detections, det_bytes), "cudaMalloc(detections)");
    check(cudaMemcpy(d_detections, detections, det_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(detections)");

    uint32_t* d_masks = NULL;
    check(cudaMalloc((void**) &d_masks, sizeof(uint32_t) * numBBoxes), "cudaMalloc(masks)");
    check(cudaMemcpy(d_masks, &tensor.masks[0], sizeof(uint32_t) * numBBoxes, cudaMemcpyHostToDevice), "cudaMemcpy(masks)");

    float* d_anchors = NULL;
    check(cudaMalloc((void**) &d_anchors, sizeof(float) * tensor.anchors.size()), "cudaMalloc(anchors)");
    check(cudaMemcpy(d_anchors, &tensor.anchors[0], sizeof(float) * tensor.anchors.size(), cudaMemcpyHostToDevice), "cudaMemcpy(anchors)");

    float* d_boxes = NULL;
    check(cudaMalloc((void**) &d_boxes, boxes_bytes), "cudaMalloc(boxes)");

    // One thread per grid cell; ceil-divide so partial blocks cover the tail.
    const dim3 threads_per_block(20, 20);
    const dim3 number_of_blocks(
        (grid_w + threads_per_block.x - 1) / threads_per_block.x,
        (grid_h + threads_per_block.y - 1) / threads_per_block.y);

    decodeTensorKernel<<<number_of_blocks, threads_per_block>>>(
        d_detections, d_masks, d_anchors, d_boxes, grid_h, grid_w, numClasses, numBBoxes);
    check(cudaGetLastError(), "kernel launch");

    // Blocking copy — also synchronizes with the kernel on the default stream.
    check(cudaMemcpy(boxes, d_boxes, boxes_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(boxes)");

    cudaFree(d_detections);
    cudaFree(d_masks);
    cudaFree(d_anchors);
    cudaFree(d_boxes);
    return boxes;
}
from yolo-tensorrt.
文件名 decodeTensorCUDA.h
代码:
#ifndef DECODETENSORCUDA_H_
#define DECODETENSORCUDA_H_
#include "yolo.h"
// Decodes one image's YOLO output tensor on the GPU (parallel over grid cells).
// Returns a malloc'ed buffer holding 6 floats (x, y, w, h, maxProb, maxIndex)
// per predicted box; the caller is responsible for free()ing it.
float* decodeTensorCUDA(const int imageIdx, const TensorInfo& tensor);
#endif
from yolo-tensorrt.
文件名:yolov5.cpp
代码:
#include "yolov5.h"
#include "decodeTensorCUDA.h"
// YoloV5 layers a CUDA-accelerated decode step on top of the generic Yolo
// base; construction simply forwards both parameter structs to Yolo.
YoloV5::YoloV5(
const NetworkInfo &network_info_,
const InferParams &infer_params_) :
Yolo( network_info_, infer_params_) {}
std::vector YoloV5::decodeTensor(const int imageIdx, const int imageH, const int imageW, const TensorInfo& tensor)
{
float scale_h = 1.f;
float scale_w = 1.f;
int xOffset = 0;
int yOffset = 0;
calcuate_letterbox_message(m_InputH, m_InputW, imageH, imageW, scale_h, scale_w, xOffset, yOffset);
std::vector binfo;
// 把所有框的 坐标, maxProb 和 maxIndex 用cuda计算后放到 boxes 这个指针中
float* boxes = decodeTensorCUDA(imageIdx, tensor);
// 遍历 boxes, 没想到怎么把这几个循环给去掉,但也挺快了,就不管了
for (uint32_t y = 0; y < tensor.grid_h; ++y)
{
for (uint32_t x = 0; x < tensor.grid_w; ++x)
{
for (uint32_t b = 0; b < tensor.numBBoxes; ++b)
{
const int bbindex = y * tensor.grid_w+ x;
const float bx
= boxes[18 * bbindex + 6*b + 0];
const float by
= boxes[18 * bbindex + 6*b + 1];
const float bw
= boxes[18 * bbindex + 6*b + 2];
const float bh
= boxes[18 * bbindex + 6*b + 3];
const float maxProb
= boxes[18 * bbindex + 6*b + 4];
const int maxIndex
= (int) boxes[18 * bbindex + 6*b + 5];
// std::cout<< bx << " " << by << " " << maxProb << " " << m_ProbThresh << " " << maxIndex << std::endl;
if (maxProb > m_ProbThresh)
{
add_bbox_proposal(bx, by, bw, bh, tensor.stride_h, tensor.stride_w, scale_h, scale_w, xOffset, yOffset, maxIndex, maxProb, imageW, imageH, binfo);
}
}
}
}
return binfo;
}
from yolo-tensorrt.
@ccccwb could you create a pull request with your improvements?
from yolo-tensorrt.
@Nuzhny007
sure, I will open the pull request soon
from yolo-tensorrt.
hey, I have already created the pull request. @Nuzhny007
from yolo-tensorrt.
Related Issues (20)
- run engine error HOT 2
- Does this project have minimum CPU requirements? HOT 1
- 使用自己训练yolov5l模型,生成engine后检测锚框不准,在tensorrtx工程上可以正常运行,请问有可能是那部分的问题 HOT 1
- 关于dynamic input size
- where is attempt_download
- 关于yolov5s6减少类别至8的推理结果差异问题
- update yolov7 HOT 1
- 检测结果中id和真实目标的映射
- Are there any mirrors for the weight instead of "MEGA" host?
- 前处理和后处理的时间是不是太长了?
- how to find the corresponding version of yolov5?
- Explicit batch
- trt8 is not supply leaky? HOT 3
- trt8 maxpool的問題 HOT 3
- 有没有留一个专门函数,weights 转 engine 文件的
- 能够支持yolov3-tiny吗?
- yolo7 tiny? HOT 3
- YOLO v8
- Problems with yolov4 when loading engine second time
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from yolo-tensorrt.