Git Product home page Git Product logo

Comments (8)

ccccwb avatar ccccwb commented on May 25, 2024 1

我把解码部分用CUDA重写,并行解码检测框,现在在JetsonNX上解码时间可以缩短到5ms
image

from yolo-tensorrt.

ccccwb avatar ccccwb commented on May 25, 2024

image

from yolo-tensorrt.

ccccwb avatar ccccwb commented on May 25, 2024

文件名:decodeTensorCUDA.cu
代码:
#include "decodeTensorCUDA.h"

#include <cstdint>
#include <cstdio>
#include <cstdlib>
/**
 * Decodes one YOLO output tensor into a flat array of candidate boxes.
 *
 * Launch layout: a 2D grid of 2D blocks. Each thread owns one grid cell
 * (x, y) and loops over that cell's anchor boxes; threads that fall outside
 * the grid_w x grid_h extent return immediately. No shared memory is used.
 *
 * Input indexing (channel-major, as read below): channel c of anchor box b
 * at cell (x, y) is
 *     detections[(y * grid_w + x) + grid_h * grid_w * (b * (5 + numClasses) + c)]
 * with c = 0..3 -> x, y, w, h; c = 4 -> objectness; c = 5.. -> class scores.
 *
 * Output indexing: box b of cell (x, y) occupies 6 consecutive floats at
 *     boxes[6 * (numBBoxes * (y * grid_w + x) + b)]
 * holding x, y, w, h, objectness * best class score, best class index.
 * For numBBoxes == 3 this is the same layout as the former hard-coded
 * stride of 18; for any other anchor count the hard-coded stride would
 * write outside (or overlap inside) a buffer sized 6 * numBBoxes per cell.
 *
 * @param detections  device pointer to the raw network output (read-only)
 * @param masks       device pointer to numBBoxes anchor indices (read-only)
 * @param anchors     device pointer to (w, h) anchor pairs, indexed via masks
 * @param boxes       device pointer to 6 * grid_h * grid_w * numBBoxes floats
 * @param grid_h      number of grid rows
 * @param grid_w      number of grid columns
 * @param numClasses  number of class-score channels per anchor box
 * @param numBBoxes   number of anchor boxes per grid cell
 */
__global__ void decodeTensorKernel(
const float* __restrict__ detections, const uint32_t* __restrict__ masks, const float* __restrict__ anchors, float* __restrict__ boxes, uint32_t grid_h, uint32_t grid_w, uint32_t numClasses, uint32_t numBBoxes)
{
    // Floats emitted per box: x, y, w, h, score, class index.
    constexpr uint32_t kFloatsPerBox = 6;

    // Global cell coordinates handled by this thread.
    const uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
    const uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
    if (y >= grid_h || x >= grid_w) return;

    const uint32_t numGridCells = grid_h * grid_w;
    const uint32_t channelsPerBox = 5 + numClasses;
    const uint32_t bbindex = y * grid_w + x;

    for (uint32_t b = 0; b < numBBoxes; ++b)
    {
        const float pw = anchors[masks[b] * 2];
        const float ph = anchors[masks[b] * 2 + 1];

        // Channel 0 of anchor box b for this cell; consecutive channels are
        // numGridCells floats apart.
        const float* in = detections + bbindex + numGridCells * (b * channelsPerBox);
        // The six output slots of this (cell, box) pair.
        float* out = boxes + kFloatsPerBox * (numBBoxes * bbindex + b);

        out[0] = x + in[0 * numGridCells];
        out[1] = y + in[1 * numGridCells];
        out[2] = pw * in[2 * numGridCells];
        out[3] = ph * in[3 * numGridCells];

        const float objectness = in[4 * numGridCells];

        // Arg-max over the class scores. maxIndex stays -1 when no score is
        // strictly greater than zero.
        float maxProb = 0.0f;
        int maxIndex = -1;
        for (uint32_t i = 0; i < numClasses; ++i)
        {
            const float prob = in[(5 + i) * numGridCells];
            if (prob > maxProb)
            {
                maxProb = prob;
                maxIndex = i;
            }
        }

        out[4] = objectness * maxProb;
        out[5] = (float) maxIndex;
    }
}

/**
 * Decodes the output tensor of one image on the GPU and returns the decoded
 * boxes in a newly allocated host buffer.
 *
 * The returned buffer holds 6 floats (x, y, w, h, score, class index) for
 * every (grid cell, anchor box) pair, i.e.
 * 6 * grid_h * grid_w * numBBoxes floats in total. It is allocated with the
 * C allocator, so the caller owns it and must release it with free().
 *
 * Failure handling: nullptr is returned only when the host buffer cannot be
 * allocated. If any CUDA call fails, the error is reported on stderr and a
 * zero-filled buffer is returned, so callers that threshold on the score see
 * no detections instead of uninitialised memory.
 *
 * NOTE(review): device buffers are allocated and freed on every call, which
 * is expensive; caching them across calls would need state this function
 * does not currently have.
 *
 * @param imageIdx  index of the image inside the batch held by tensor.hostBuffer
 * @param tensor    output tensor description (grid size, anchors, masks, host data)
 * @return malloc-family allocated buffer of decoded boxes, or nullptr
 */
float* decodeTensorCUDA(const int imageIdx, const TensorInfo& tensor)
{
    const uint32_t grid_h = tensor.grid_h;
    const uint32_t grid_w = tensor.grid_w;
    const uint32_t numClasses = tensor.numClasses;
    const uint32_t numBBoxes = tensor.numBBoxes;

    // All sizes are computed in size_t so large grids cannot overflow an int.
    const size_t kFloatsPerBox = 6;  // x, y, w, h, score, class index
    const size_t numBoxes = static_cast<size_t>(grid_h) * grid_w * numBBoxes;
    const size_t numBoxFloats = kFloatsPerBox * numBoxes;
    const size_t boxes_bytes = sizeof(float) * numBoxFloats;
    const size_t detections_bytes = sizeof(float) * numBoxes * (5 + static_cast<size_t>(numClasses));
    const size_t masks_bytes = sizeof(uint32_t) * numBBoxes;
    const size_t anchors_bytes = sizeof(float) * tensor.anchors.size();

    // Host result buffer, zero-initialised so it is well defined on every path.
    float* boxes = static_cast<float*>(calloc(numBoxFloats > 0 ? numBoxFloats : 1, sizeof(float)));
    if (boxes == nullptr)
    {
        fprintf(stderr, "decodeTensorCUDA: failed to allocate %zu bytes of host memory\n", boxes_bytes);
        return nullptr;
    }
    // Nothing to decode; also avoids launching a kernel with an empty grid.
    if (numBoxes == 0) return boxes;

    const float* detections = &tensor.hostBuffer[imageIdx * tensor.volume];

    float* d_detections = nullptr;
    uint32_t* d_masks = nullptr;
    float* d_anchors = nullptr;
    float* d_boxes = nullptr;

    // Each step runs only if every previous step succeeded; the first error
    // is kept in `status` and reported once below.
    cudaError_t status = cudaMalloc((void**) &d_detections, detections_bytes);
    if (status == cudaSuccess)
        status = cudaMemcpy(d_detections, detections, detections_bytes, cudaMemcpyHostToDevice);

    if (status == cudaSuccess)
        status = cudaMalloc((void**) &d_masks, masks_bytes);
    if (status == cudaSuccess)
        status = cudaMemcpy(d_masks, &tensor.masks[0], masks_bytes, cudaMemcpyHostToDevice);

    if (status == cudaSuccess)
        status = cudaMalloc((void**) &d_anchors, anchors_bytes);
    if (status == cudaSuccess)
        status = cudaMemcpy(d_anchors, &tensor.anchors[0], anchors_bytes, cudaMemcpyHostToDevice);

    if (status == cudaSuccess)
        status = cudaMalloc((void**) &d_boxes, boxes_bytes);

    if (status == cudaSuccess)
    {
        // One thread per grid cell; 16 x 16 = 256 threads is a multiple of
        // the warp size, and ceil-division launches no surplus blocks.
        const dim3 threads_per_block(16, 16);
        const dim3 number_of_blocks(
            (grid_w + threads_per_block.x - 1) / threads_per_block.x,
            (grid_h + threads_per_block.y - 1) / threads_per_block.y);

        decodeTensorKernel<<<number_of_blocks, threads_per_block>>>(
            d_detections, d_masks, d_anchors, d_boxes, grid_h, grid_w, numClasses, numBBoxes);
        // Launch-configuration errors are only visible through cudaGetLastError().
        status = cudaGetLastError();
    }

    // Blocking copy on the same (default) stream: it runs after the kernel
    // and returns once the data is on the host, and it also surfaces any
    // error raised while the kernel was executing.
    if (status == cudaSuccess)
        status = cudaMemcpy(boxes, d_boxes, boxes_bytes, cudaMemcpyDeviceToHost);

    // Release device memory; cudaFree(nullptr) is a no-op.
    cudaFree(d_detections);
    cudaFree(d_masks);
    cudaFree(d_anchors);
    cudaFree(d_boxes);

    if (status != cudaSuccess)
    {
        fprintf(stderr, "decodeTensorCUDA: CUDA error: %s\n", cudaGetErrorString(status));
        // A failed copy may have written part of the buffer; reset it.
        for (size_t i = 0; i < numBoxFloats; ++i) boxes[i] = 0.0f;
    }

    return boxes;
}

from yolo-tensorrt.

ccccwb avatar ccccwb commented on May 25, 2024

文件名 decodeTensorCUDA.h
代码:
#ifndef DECODETENSORCUDA_H_
#define DECODETENSORCUDA_H_
#include "yolo.h"

// Decodes the output tensor of image `imageIdx` on the GPU.
// Returns a host buffer holding, for every grid cell and every anchor box of
// that cell, six consecutive floats: x, y, w, h, score, class index.
// The buffer is allocated with the C allocator inside the function and is not
// released there, so the caller is responsible for passing it to free().
float* decodeTensorCUDA(const int imageIdx, const TensorInfo& tensor);
#endif

from yolo-tensorrt.

ccccwb avatar ccccwb commented on May 25, 2024

文件名:yolov5.cpp
代码:
#include "yolov5.h"
#include "decodeTensorCUDA.h"

// Builds a YoloV5 object by handing both configuration objects, unchanged,
// to the Yolo base-class constructor. The constructor body itself does no
// further work.
YoloV5::YoloV5(const NetworkInfo &network_info_, const InferParams &infer_params_)
    : Yolo(network_info_, infer_params_)
{
}
std::vector YoloV5::decodeTensor(const int imageIdx, const int imageH, const int imageW, const TensorInfo& tensor)
{
float scale_h = 1.f;
float scale_w = 1.f;
int xOffset = 0;
int yOffset = 0;
calcuate_letterbox_message(m_InputH, m_InputW, imageH, imageW, scale_h, scale_w, xOffset, yOffset);
std::vector binfo;

// 把所有框的 坐标, maxProb 和 maxIndex 用cuda计算后放到 boxes 这个指针中
float* boxes = decodeTensorCUDA(imageIdx, tensor);

// 遍历 boxes, 没想到怎么把这几个循环给去掉,但也挺快了,就不管了
for (uint32_t y = 0; y < tensor.grid_h; ++y)
{
	for (uint32_t x = 0; x < tensor.grid_w; ++x)
	{
		for (uint32_t b = 0; b < tensor.numBBoxes; ++b)
		{	
			const int bbindex = y * tensor.grid_w+ x;
			const float bx
				= boxes[18 * bbindex + 6*b + 0];

			const float by
				= boxes[18 * bbindex + 6*b + 1];

			const float bw
				= boxes[18 * bbindex + 6*b + 2];

			const float bh
				= boxes[18 * bbindex + 6*b + 3];

			const float maxProb
				= boxes[18 * bbindex + 6*b + 4];

			const int maxIndex 
				= (int) boxes[18 * bbindex + 6*b + 5];
			// std::cout<< bx << "  " << by << "  " << maxProb << "  " << m_ProbThresh << "  " << maxIndex << std::endl;
			if (maxProb > m_ProbThresh)
			{
				add_bbox_proposal(bx, by, bw, bh, tensor.stride_h, tensor.stride_w, scale_h, scale_w, xOffset, yOffset, maxIndex, maxProb, imageW, imageH, binfo);
			}
			
		}
	}
}
return binfo;

}

from yolo-tensorrt.

Nuzhny007 avatar Nuzhny007 commented on May 25, 2024

@ccccwb can you create a pull request with your improvements?

from yolo-tensorrt.

ccccwb avatar ccccwb commented on May 25, 2024

@Nuzhny007
Sure, I will open a pull request soon.

from yolo-tensorrt.

ccccwb avatar ccccwb commented on May 25, 2024

Hey, I have already created this pull request. @Nuzhny007

from yolo-tensorrt.

Related Issues (20)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games, to make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.