Comments (8)
我把解码部分用CUDA重写,并行解码检测框,现在在 Jetson NX 上解码时间可以缩短到 5ms
from yolo-tensorrt.
from yolo-tensorrt.
文件名:decodeTensorCUDA.cu
代码:
#include "decodeTensorCUDA.h"

#include <cuda_runtime.h>

#include <cstdint>
#include <cstdio>
#include <cstdlib>
// Decode one grid cell of a YOLO output tensor into candidate boxes.
// Launch layout: 2D grid of 2D blocks covering (grid_w, grid_h); each thread
// handles one grid cell and loops over its numBBoxes anchors.
// Output layout: boxes[(6 * numBBoxes) * cell + 6 * b + {0..5}] =
//   { bx, by, bw, bh, objectness * maxClassProb, classIndex }.
// NOTE(review): assumes the network already applied the output activations
// (as the upstream YOLOv5 export does) — confirm against the engine builder.
__global__ void decodeTensorKernel(
    const float* __restrict__ detections, const uint32_t* __restrict__ masks,
    const float* __restrict__ anchors, float* __restrict__ boxes,
    uint32_t grid_h, uint32_t grid_w, uint32_t numClasses, uint32_t numBBoxes)
{
    // Global 2D index of this thread's grid cell; guard the grid tail.
    const uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
    const uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
    if (y >= grid_h || x >= grid_w) return;

    const uint32_t numGridCells = grid_h * grid_w;
    const uint32_t bbindex = y * grid_w + x;
    const uint32_t bbDim = 5 + numClasses;    // x, y, w, h, objectness + classes
    const uint32_t outStride = 6 * numBBoxes; // was hard-coded 18 (only valid for numBBoxes == 3)

    for (uint32_t b = 0; b < numBBoxes; ++b)
    {
        // Anchor (prior) width/height selected by this scale's mask.
        const float pw = anchors[masks[b] * 2];
        const float ph = anchors[masks[b] * 2 + 1];

        // Channel-major layout: channel c of box b for this cell lives at
        // detections[bbindex + numGridCells * (b * bbDim + c)].
        const float* det = detections + bbindex + numGridCells * (b * bbDim);
        float* out = boxes + outStride * bbindex + 6 * b;

        out[0] = x + det[numGridCells * 0]; // box center x (grid units)
        out[1] = y + det[numGridCells * 1]; // box center y (grid units)
        out[2] = pw * det[numGridCells * 2]; // box width
        out[3] = ph * det[numGridCells * 3]; // box height

        const float objectness = det[numGridCells * 4];

        // Arg-max over the per-class probabilities.
        float maxProb = 0.0f;
        int maxIndex = -1;
        for (uint32_t i = 0; i < numClasses; ++i)
        {
            const float prob = det[numGridCells * (5 + i)];
            if (prob > maxProb)
            {
                maxProb = prob;
                maxIndex = (int) i;
            }
        }
        out[4] = objectness * maxProb; // combined confidence
        out[5] = (float) maxIndex;     // winning class id
    }
}
// Decode one image's YOLO output tensor on the GPU.
// Returns a malloc'ed buffer of 6 floats (x, y, w, h, maxProb, maxIndex) per
// box — 6 * grid_h * grid_w * numBBoxes floats total. The CALLER owns the
// buffer and must free() it. Returns NULL if the host allocation fails.
float* decodeTensorCUDA(const int imageIdx, const TensorInfo& tensor)
{
    // Report (but do not abort on) CUDA errors so callers still get a buffer.
    auto check = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess)
            fprintf(stderr, "decodeTensorCUDA: %s: %s\n", what, cudaGetErrorString(err));
    };

    const uint32_t grid_h = tensor.grid_h;
    const uint32_t grid_w = tensor.grid_w;
    const uint32_t numClasses = tensor.numClasses;
    const uint32_t numBBoxes = tensor.numBBoxes;

    // Host output buffer: 6 elements per box (x, y, w, h, maxProb, maxIndex).
    const size_t boxes_bytes = 6 * sizeof(float) * grid_h * grid_w * numBBoxes;
    float* boxes = (float*) malloc(boxes_bytes); // ownership passes to the caller
    if (boxes == NULL) return NULL;

    const float* detections = &tensor.hostBuffer[imageIdx * tensor.volume];

    // Device buffers. NOTE: sizeof(pointer) would only give the pointer's own
    // size — always compute element-count * sizeof(element).
    const size_t det_bytes = sizeof(float) * grid_h * grid_w * (5 + numClasses) * numBBoxes;
    float* d_detections = NULL;
    check(cudaMalloc((void**) &d_detections, det_bytes), "cudaMalloc(detections)");
    check(cudaMemcpy(d_detections, detections, det_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(detections)");

    uint32_t* d_masks = NULL;
    check(cudaMalloc((void**) &d_masks, sizeof(uint32_t) * numBBoxes), "cudaMalloc(masks)");
    check(cudaMemcpy(d_masks, &tensor.masks[0], sizeof(uint32_t) * numBBoxes, cudaMemcpyHostToDevice), "cudaMemcpy(masks)");

    float* d_anchors = NULL;
    check(cudaMalloc((void**) &d_anchors, sizeof(float) * tensor.anchors.size()), "cudaMalloc(anchors)");
    check(cudaMemcpy(d_anchors, &tensor.anchors[0], sizeof(float) * tensor.anchors.size(), cudaMemcpyHostToDevice), "cudaMemcpy(anchors)");

    float* d_boxes = NULL;
    check(cudaMalloc((void**) &d_boxes, boxes_bytes), "cudaMalloc(boxes)");

    // One thread per grid cell; ceil-divide so partial blocks cover the tail.
    const dim3 threads_per_block(20, 20);
    const dim3 number_of_blocks(
        (grid_w + threads_per_block.x - 1) / threads_per_block.x,
        (grid_h + threads_per_block.y - 1) / threads_per_block.y);

    decodeTensorKernel<<<number_of_blocks, threads_per_block>>>(
        d_detections, d_masks, d_anchors, d_boxes, grid_h, grid_w, numClasses, numBBoxes);
    check(cudaGetLastError(), "kernel launch");

    // Blocking copy — also synchronizes with the kernel on the default stream.
    check(cudaMemcpy(boxes, d_boxes, boxes_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(boxes)");

    cudaFree(d_detections);
    cudaFree(d_masks);
    cudaFree(d_anchors);
    cudaFree(d_boxes);
    return boxes;
}
from yolo-tensorrt.
文件名 decodeTensorCUDA.h
代码:
#ifndef DECODETENSORCUDA_H_
#define DECODETENSORCUDA_H_
#include "yolo.h"
// Decodes one image's YOLO output tensor on the GPU (parallel over grid cells).
// Returns a malloc'ed buffer holding 6 floats (x, y, w, h, maxProb, maxIndex)
// per predicted box; the caller is responsible for free()ing it.
float* decodeTensorCUDA(const int imageIdx, const TensorInfo& tensor);
#endif
from yolo-tensorrt.
文件名:yolov5.cpp
代码:
#include "yolov5.h"
#include "decodeTensorCUDA.h"
// YoloV5 layers a CUDA-accelerated decode step on top of the generic Yolo
// base; construction simply forwards both parameter structs to Yolo.
YoloV5::YoloV5(
const NetworkInfo &network_info_,
const InferParams &infer_params_) :
Yolo( network_info_, infer_params_) {}
std::vector YoloV5::decodeTensor(const int imageIdx, const int imageH, const int imageW, const TensorInfo& tensor)
{
float scale_h = 1.f;
float scale_w = 1.f;
int xOffset = 0;
int yOffset = 0;
calcuate_letterbox_message(m_InputH, m_InputW, imageH, imageW, scale_h, scale_w, xOffset, yOffset);
std::vector binfo;
// 把所有框的 坐标, maxProb 和 maxIndex 用cuda计算后放到 boxes 这个指针中
float* boxes = decodeTensorCUDA(imageIdx, tensor);
// 遍历 boxes, 没想到怎么把这几个循环给去掉,但也挺快了,就不管了
for (uint32_t y = 0; y < tensor.grid_h; ++y)
{
for (uint32_t x = 0; x < tensor.grid_w; ++x)
{
for (uint32_t b = 0; b < tensor.numBBoxes; ++b)
{
const int bbindex = y * tensor.grid_w+ x;
const float bx
= boxes[18 * bbindex + 6*b + 0];
const float by
= boxes[18 * bbindex + 6*b + 1];
const float bw
= boxes[18 * bbindex + 6*b + 2];
const float bh
= boxes[18 * bbindex + 6*b + 3];
const float maxProb
= boxes[18 * bbindex + 6*b + 4];
const int maxIndex
= (int) boxes[18 * bbindex + 6*b + 5];
// std::cout<< bx << " " << by << " " << maxProb << " " << m_ProbThresh << " " << maxIndex << std::endl;
if (maxProb > m_ProbThresh)
{
add_bbox_proposal(bx, by, bw, bh, tensor.stride_h, tensor.stride_w, scale_h, scale_w, xOffset, yOffset, maxIndex, maxProb, imageW, imageH, binfo);
}
}
}
}
return binfo;
}
from yolo-tensorrt.
@ccccwb could you create a pull request with your improvements?
from yolo-tensorrt.
@Nuzhny007
sure, I will open the pull request soon
from yolo-tensorrt.
hey, I have already created the pull request. @Nuzhny007
from yolo-tensorrt.
Related Issues (20)
- run engine error HOT 2
- Does this project have minimum CPU requirements? HOT 1
- 使用自己训练yolov5l模型,生成engine后检测锚框不准,在tensorrtx工程上可以正常运行,请问有可能是那部分的问题 HOT 1
- 关于dynamic input size
- where is attempt_download
- 关于yolov5s6减少类别至8的推理结果差异问题
- update yolov7 HOT 1
- 检测结果中id和真实目标的映射
- Are there any mirrors for the weight instead of "MEGA" host?
- 前处理和后处理的时间是不是太长了?
- how to find the corresponding version of yolov5?
- Explicit batch
- trt8 is not supply leaky? HOT 3
- trt8 maxpool的問題 HOT 3
- 有没有留一个专门函数,weights 转 engine 文件的
- 能够支持yolov3-tiny吗?
- yolo7 tiny? HOT 3
- YOLO v8
- Problems with yolov4 when loading engine second time
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from yolo-tensorrt.