Commit 0b4e78cf authored by Sikhin VC

initial commit

parent 9e765965
*/build
*/*/build
*/*.wts
*/*.ppm
*idea*
ARG TENSORRT="7"
ARG CUDA="10"
FROM hakuyyf/tensorrtx:trt${TENSORRT}_cuda${CUDA}
# Get opencv 3.4 for bionic based images
RUN rm -f /etc/apt/sources.list.d/timsc-ubuntu-opencv-3_3-bionic.list \
    /etc/apt/sources.list.d/timsc-ubuntu-opencv-3_3-bionic.list.save
RUN add-apt-repository -y ppa:timsc/opencv-3.4
RUN apt-get update && apt-get install -y libopencv-dev libopencv-dnn-dev libopencv-shape3.4-dbg
# git clone tensorrtx
RUN git clone https://github.com/wang-xinyu/tensorrtx.git
MIT License
Copyright (c) 2019-2020 Wang Xinyu
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
import os
import subprocess

from loguru import logger


class ModelOptimization:
    def __init__(self, num_class, image_size=416):
        self.num_class = num_class
        self.image_size = image_size

    def change_configurations(self):
        logger.info(f"Provided number of classes and image size are : {self.num_class} and {self.image_size}")
        try:
            with open('yolov5/yololayer.h', 'r') as file:
                # read a list of lines into data
                data = file.readlines()
            # lines 20-22 of yololayer.h hold CLASS_NUM, INPUT_H and INPUT_W
            data[19] = f"    static constexpr int CLASS_NUM = {self.num_class};\n"
            data[20] = f"    static constexpr int INPUT_H = {self.image_size};\n"
            data[21] = f"    static constexpr int INPUT_W = {self.image_size};\n"
            # and write everything back
            with open('yolov5/yololayer.h', 'w') as file:
                file.writelines(data)
            logger.info("Successfully changed configurations")
        except Exception as e:
            logger.error(f"Failed to change configurations : {e}")

    def optimize_model(self):
        try:
            current_directory = os.getcwd()
            logger.info(f"Current directory is : {current_directory}")
            build_path = os.path.join(current_directory, "yolov5", "build")
            os.makedirs(build_path, exist_ok=True)
            logger.info("Created build folder")
            os.chdir(build_path)
            logger.info("Running CMake command")
            subprocess.run(['cmake', '..'], check=True)
            logger.info("Running Make command")
            subprocess.run(['make'], check=True)
            logger.info("Optimizing model")
            # the .wts/.engine names below are this project's custom model files
            subprocess.run(["sudo", "./yolov5", "-s", "jk_v5_cam_47.wts", "jk_v5_cam_47.engine", "c", "0.33", "0.50"], check=True)
        except Exception as e:
            logger.error(f"Failed to optimize model : {e}")


# obj = ModelOptimization(num_class=2, image_size=244)
# obj.change_configurations()
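# A minimal usage sketch (not in the original commit): runs both steps end to
# end. image_size=416 is the constructor default and, unlike the 244 in the
# commented example above, is divisible by 32 as yolov5 requires.
if __name__ == "__main__":
    opt = ModelOptimization(num_class=2, image_size=416)
    opt.change_configurations()  # patch CLASS_NUM / INPUT_H / INPUT_W in yololayer.h
    opt.optimize_model()         # cmake + make, then serialize the .wts into a TensorRT engine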
cmake_minimum_required(VERSION 2.6)
project(yolov5)
add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)
find_package(CUDA REQUIRED)
if(WIN32)
enable_language(CUDA)
endif(WIN32)
include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt; you may need to adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -g -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
cuda_add_library(myplugins SHARED yololayer.cu)
target_link_libraries(myplugins nvinfer cudart)
find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})
cuda_add_executable(yolov5 calibrator.cpp yolov5.cpp preprocess.cu)
target_link_libraries(yolov5 nvinfer)
target_link_libraries(yolov5 cudart)
target_link_libraries(yolov5 myplugins)
target_link_libraries(yolov5 ${OpenCV_LIBS})
if(UNIX)
add_definitions(-O2 -pthread)
endif(UNIX)
# yolov5
The Pytorch implementation is [ultralytics/yolov5](https://github.com/ultralytics/yolov5).
## Different versions of yolov5
Currently, we support yolov5 v1.0, v2.0, v3.0, v3.1, v4.0, v5.0 and v6.0.
- For yolov5 v6.0, download .pt from [yolov5 release v6.0](https://github.com/ultralytics/yolov5/releases/tag/v6.0), `git clone -b v6.0 https://github.com/ultralytics/yolov5.git` and `git clone https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in current page.
- For yolov5 v5.0, download .pt from [yolov5 release v5.0](https://github.com/ultralytics/yolov5/releases/tag/v5.0), `git clone -b v5.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v5.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v5.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v5.0/yolov5).
- For yolov5 v4.0, download .pt from [yolov5 release v4.0](https://github.com/ultralytics/yolov5/releases/tag/v4.0), `git clone -b v4.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v4.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v4.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v4.0/yolov5).
- For yolov5 v3.1, download .pt from [yolov5 release v3.1](https://github.com/ultralytics/yolov5/releases/tag/v3.1), `git clone -b v3.1 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v3.1 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v3.1](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v3.1/yolov5).
- For yolov5 v3.0, download .pt from [yolov5 release v3.0](https://github.com/ultralytics/yolov5/releases/tag/v3.0), `git clone -b v3.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v3.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v3.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v3.0/yolov5).
- For yolov5 v2.0, download .pt from [yolov5 release v2.0](https://github.com/ultralytics/yolov5/releases/tag/v2.0), `git clone -b v2.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v2.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v2.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v2.0/yolov5).
- For yolov5 v1.0, download .pt from [yolov5 release v1.0](https://github.com/ultralytics/yolov5/releases/tag/v1.0), `git clone -b v1.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v1.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v1.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v1.0/yolov5).
## Config
- Choose the model n/s/m/l/x/n6/s6/m6/l6/x6 from command line arguments.
- Input shape defined in yololayer.h
- Number of classes defined in yololayer.h, **DO NOT FORGET TO ADAPT THIS if using your own model**
- INT8/FP16/FP32 can be selected by the macro in yolov5.cpp, **INT8 needs more steps, please follow `How to Run` first and then go to `INT8 Quantization` below**
- GPU id can be selected by the macro in yolov5.cpp
- NMS thresh in yolov5.cpp
- BBox confidence thresh in yolov5.cpp
- Batch size in yolov5.cpp
## How to Run, yolov5s as example
1. generate .wts from pytorch with .pt, or download .wts from model zoo
```
// clone code according to above #Different versions of yolov5
// download https://github.com/ultralytics/yolov5/releases/download/v6.0/yolov5s.pt
cp {tensorrtx}/yolov5/gen_wts.py {ultralytics}/yolov5
cd {ultralytics}/yolov5
python gen_wts.py -w yolov5s.pt -o yolov5s.wts
// a file 'yolov5s.wts' will be generated.
```
2. build tensorrtx/yolov5 and run
```
cd {tensorrtx}/yolov5/
// update CLASS_NUM in yololayer.h if your model is trained on custom dataset
mkdir build
cd build
cp {ultralytics}/yolov5/yolov5s.wts {tensorrtx}/yolov5/build
cmake ..
make
sudo ./yolov5 -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6 or c/c6 gd gw] // serialize model to plan file
sudo ./yolov5 -d [.engine] [image folder] // deserialize and run inference, the images in [image folder] will be processed.
// For example yolov5s
sudo ./yolov5 -s yolov5s.wts yolov5s.engine s
sudo ./yolov5 -d yolov5s.engine ../samples
// For example Custom model with depth_multiple=0.17, width_multiple=0.25 in yolov5.yaml
sudo ./yolov5 -s yolov5_custom.wts yolov5.engine c 0.17 0.25
sudo ./yolov5 -d yolov5.engine ../samples
```
3. check the generated images, _zidane.jpg and _bus.jpg, shown further below
4. optionally, load and run the tensorrt model in python
```
// install python-tensorrt, pycuda, etc.
// ensure the yolov5s.engine and libmyplugins.so have been built
python yolov5_trt.py
// An alternative script that uses CUDA Python instead of pycuda.
python yolov5_trt_cuda_python.py
```
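For reference, deserializing the engine in Python boils down to a few TensorRT calls; a minimal sketch, assuming the standard TensorRT Python bindings and the file locations used above (see yolov5_trt.py for the full pipeline):
```
import ctypes
import tensorrt as trt

# the custom YOLO layer plugin must be loaded before deserializing the engine
ctypes.CDLL("build/libmyplugins.so")

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
trt.init_libnvinfer_plugins(TRT_LOGGER, "")

runtime = trt.Runtime(TRT_LOGGER)
with open("build/yolov5s.engine", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()
# allocate input/output buffers (e.g. with pycuda) and run inference
```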
## INT8 Quantization
1. Prepare calibration images; you can randomly select 1,000 or more images from your train set (a small selection sketch follows these steps). For coco, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh
2. unzip it in yolov5/build
3. set the macro `USE_INT8` in yolov5.cpp and make
4. serialize the model and test
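For step 1, a minimal image-selection sketch (the training path is an assumption, not part of this repo):
```
import random
import shutil
from pathlib import Path

train_dir = Path("/path/to/train/images")    # your train set (assumption)
calib_dir = Path("yolov5/build/coco_calib")  # directory read by the INT8 calibrator
calib_dir.mkdir(parents=True, exist_ok=True)

images = list(train_dir.glob("*.jpg"))
for p in random.sample(images, min(1000, len(images))):
    shutil.copy(p, calib_dir / p.name)
```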
<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/78247927-4d9fac00-751e-11ea-8b1b-704a0aeb3fcf.jpg">
</p>
<p align="center">
<img src="https://user-images.githubusercontent.com/15235574/78247970-60b27c00-751e-11ea-88df-41473fed4823.jpg">
</p>
## More Information
See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)
#include <iostream>
#include <iterator>
#include <fstream>
#include <cassert>
#include <cstring>
#include <opencv2/dnn/dnn.hpp>
#include "calibrator.h"
#include "cuda_utils.h"
#include "utils.h"
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache)
: batchsize_(batchsize)
, input_w_(input_w)
, input_h_(input_h)
, img_idx_(0)
, img_dir_(img_dir)
, calib_table_name_(calib_table_name)
, input_blob_name_(input_blob_name)
, read_cache_(read_cache)
{
input_count_ = 3 * input_w * input_h * batchsize;
CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
read_files_in_dir(img_dir, img_files_);
}
Int8EntropyCalibrator2::~Int8EntropyCalibrator2()
{
CUDA_CHECK(cudaFree(device_input_));
}
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT
{
return batchsize_;
}
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT
{
if (img_idx_ + batchsize_ > (int)img_files_.size()) {
return false;
}
std::vector<cv::Mat> input_imgs_;
for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
std::cout << img_files_[i] << " " << i << std::endl;
cv::Mat temp = cv::imread(img_dir_ + img_files_[i]);
if (temp.empty()){
std::cerr << "Fatal error: cannot open image!" << std::endl;
return false;
}
cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_);
input_imgs_.push_back(pr_img);
}
img_idx_ += batchsize_;
cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false);
CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
assert(!strcmp(names[0], input_blob_name_));
bindings[0] = device_input_;
return true;
}
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT
{
std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
calib_cache_.clear();
std::ifstream input(calib_table_name_, std::ios::binary);
input >> std::noskipws;
if (read_cache_ && input.good())
{
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(calib_cache_));
}
length = calib_cache_.size();
return length ? calib_cache_.data() : nullptr;
}
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT
{
std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;
std::ofstream output(calib_table_name_, std::ios::binary);
output.write(reinterpret_cast<const char*>(cache), length);
}
#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H
#include <NvInfer.h>
#include <string>
#include <vector>
#include "macros.h"
//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//! CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
{
public:
Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);
virtual ~Int8EntropyCalibrator2();
int getBatchSize() const TRT_NOEXCEPT override;
bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;
private:
int batchsize_;
int input_w_;
int input_h_;
int img_idx_;
std::string img_dir_;
std::vector<std::string> img_files_;
size_t input_count_;
std::string calib_table_name_;
const char* input_blob_name_;
bool read_cache_;
void* device_input_;
std::vector<char> calib_cache_;
};
#endif // ENTROPY_CALIBRATOR_H
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_
#include <cuda_runtime_api.h>
#include <cassert>
#include <iostream>
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)\
{\
cudaError_t error_code = callstr;\
if (error_code != cudaSuccess) {\
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
assert(0);\
}\
}
#endif // CUDA_CHECK
#endif // TRTX_CUDA_UTILS_H_
import argparse
import os
import struct
import torch
from utils.torch_utils import select_device


def parse_args():
    parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
    parser.add_argument('-w', '--weights', required=True, help='Input weights (.pt) file path (required)')
    parser.add_argument('-o', '--output', help='Output (.wts) file path (optional)')
    args = parser.parse_args()
    if not os.path.isfile(args.weights):
        raise SystemExit('Invalid input file')
    if not args.output:
        args.output = os.path.splitext(args.weights)[0] + '.wts'
    elif os.path.isdir(args.output):
        args.output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(args.weights))[0] + '.wts')
    return args.weights, args.output


pt_file, wts_file = parse_args()

# Initialize
device = select_device('cpu')
# Load model
model = torch.load(pt_file, map_location=device)  # load to FP32
model = model['ema' if model.get('ema') else 'model'].float()

# update anchor_grid info
anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None]
delattr(model.model[-1], 'anchor_grid')  # model.model[-1] is the detect layer
# saving the tensor via register_buffer puts it in the state dict, so it ends up in the weights
model.model[-1].register_buffer("anchor_grid", anchor_grid)
model.to(device).eval()

with open(wts_file, 'w') as f:
    f.write('{}\n'.format(len(model.state_dict().keys())))
    for k, v in model.state_dict().items():
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')
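# A quick sanity-check sketch (not part of the original script): read the .wts
# file back and confirm each tensor carries the advertised number of values.
# Per the writer above, the format is a count line followed by lines of
# "<name> <len>  <hex> <hex> ..." where each hex word is a big-endian float32.
def read_wts(path):
    weights = {}
    with open(path) as f:
        n_tensors = int(f.readline())
        for _ in range(n_tensors):
            parts = f.readline().split()
            name, length = parts[0], int(parts[1])
            values = [struct.unpack('>f', bytes.fromhex(h))[0] for h in parts[2:]]
            assert len(values) == length, f"{name}: expected {length} values"
            weights[name] = values
    return weights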
#ifndef __MACROS_H
#define __MACROS_H
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else
#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif // API_EXPORTS
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif
#endif // __MACROS_H
#include "preprocess.h"
#include <opencv2/opencv.hpp>
__global__ void warpaffine_kernel(
uint8_t* src, int src_line_size, int src_width,
int src_height, float* dst, int dst_width,
int dst_height, uint8_t const_value_st,
AffineMatrix d2s, int edge) {
int position = blockDim.x * blockIdx.x + threadIdx.x;
if (position >= edge) return;
float m_x1 = d2s.value[0];
float m_y1 = d2s.value[1];
float m_z1 = d2s.value[2];
float m_x2 = d2s.value[3];
float m_y2 = d2s.value[4];
float m_z2 = d2s.value[5];
int dx = position % dst_width;
int dy = position / dst_width;
float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
float c0, c1, c2;
if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
// out of range
c0 = const_value_st;
c1 = const_value_st;
c2 = const_value_st;
} else {
int y_low = floorf(src_y);
int x_low = floorf(src_x);
int y_high = y_low + 1;
int x_high = x_low + 1;
uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
float ly = src_y - y_low;
float lx = src_x - x_low;
float hy = 1 - ly;
float hx = 1 - lx;
float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
uint8_t* v1 = const_value;
uint8_t* v2 = const_value;
uint8_t* v3 = const_value;
uint8_t* v4 = const_value;
if (y_low >= 0) {
if (x_low >= 0)
v1 = src + y_low * src_line_size + x_low * 3;
if (x_high < src_width)
v2 = src + y_low * src_line_size + x_high * 3;
}
if (y_high < src_height) {
if (x_low >= 0)
v3 = src + y_high * src_line_size + x_low * 3;
if (x_high < src_width)
v4 = src + y_high * src_line_size + x_high * 3;
}
c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
}
//bgr to rgb
float t = c2;
c2 = c0;
c0 = t;
//normalization
c0 = c0 / 255.0f;
c1 = c1 / 255.0f;
c2 = c2 / 255.0f;
//rgbrgbrgb to rrrgggbbb
int area = dst_width * dst_height;
float* pdst_c0 = dst + dy * dst_width + dx;
float* pdst_c1 = pdst_c0 + area;
float* pdst_c2 = pdst_c1 + area;
*pdst_c0 = c0;
*pdst_c1 = c1;
*pdst_c2 = c2;
}
void preprocess_kernel_img(
uint8_t* src, int src_width, int src_height,
float* dst, int dst_width, int dst_height,
cudaStream_t stream) {
AffineMatrix s2d,d2s;
float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);
s2d.value[0] = scale;
s2d.value[1] = 0;
s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
s2d.value[3] = 0;
s2d.value[4] = scale;
s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;
cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));
int jobs = dst_height * dst_width;
int threads = 256;
int blocks = ceil(jobs / (float)threads);
warpaffine_kernel<<<blocks, threads, 0, stream>>>(
src, src_width*3, src_width,
src_height, dst, dst_width,
dst_height, 128, d2s, jobs);
}
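For reference, the affine letterbox computed by preprocess_kernel_img can be approximated on the CPU from Python with OpenCV (a sketch, not part of this commit, and not bit-exact with the kernel's bilinear sampling):
```
import cv2
import numpy as np

def letterbox_chw(img, dst_w, dst_h):
    # same scale and centering as the s2d matrix above, 128-gray padding
    scale = min(dst_h / img.shape[0], dst_w / img.shape[1])
    M = np.float32([[scale, 0, -scale * img.shape[1] * 0.5 + dst_w * 0.5],
                    [0, scale, -scale * img.shape[0] * 0.5 + dst_h * 0.5]])
    out = cv2.warpAffine(img, M, (dst_w, dst_h), flags=cv2.INTER_LINEAR,
                         borderMode=cv2.BORDER_CONSTANT, borderValue=(128, 128, 128))
    out = out[:, :, ::-1].astype(np.float32) / 255.0  # BGR -> RGB, normalize
    return out.transpose(2, 0, 1)                     # HWC -> CHW (rrrgggbbb)

chw = letterbox_chw(cv2.imread("samples/zidane.jpg"), 640, 640)
```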
#ifndef __PREPROCESS_H
#define __PREPROCESS_H
#include <cuda_runtime.h>
#include <cstdint>
struct AffineMatrix{
float value[6];
};
void preprocess_kernel_img(uint8_t* src, int src_width, int src_height,
float* dst, int dst_width, int dst_height,
cudaStream_t stream);
#endif // __PREPROCESS_H
../yolov3-spp/samples/
#ifndef TRTX_YOLOV5_UTILS_H_
#define TRTX_YOLOV5_UTILS_H_
#include <dirent.h>
#include <cstring>
#include <opencv2/opencv.hpp>
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
int w, h, x, y;
float r_w = input_w / (img.cols*1.0);
float r_h = input_h / (img.rows*1.0);
if (r_h > r_w) {
w = input_w;
h = r_w * img.rows;
x = 0;
y = (input_h - h) / 2;
} else {
w = r_h * img.cols;
h = input_h;
x = (input_w - w) / 2;
y = 0;
}
cv::Mat re(h, w, CV_8UC3);
cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
return out;
}
static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
DIR *p_dir = opendir(p_dir_name);
if (p_dir == nullptr) {
return -1;
}
struct dirent* p_file = nullptr;
while ((p_file = readdir(p_dir)) != nullptr) {
if (strcmp(p_file->d_name, ".") != 0 &&
strcmp(p_file->d_name, "..") != 0) {
//std::string cur_file_name(p_dir_name);
//cur_file_name += "/";
//cur_file_name += p_file->d_name;
std::string cur_file_name(p_file->d_name);
file_names.push_back(cur_file_name);
}
}
closedir(p_dir);
return 0;
}
#endif // TRTX_YOLOV5_UTILS_H_
#ifndef _YOLO_LAYER_H
#define _YOLO_LAYER_H
#include <vector>
#include <string>
#include <NvInfer.h>
#include "macros.h"
namespace Yolo
{
static constexpr int CHECK_COUNT = 3;
static constexpr float IGNORE_THRESH = 0.1f;
struct YoloKernel
{
int width;
int height;
float anchors[CHECK_COUNT * 2];
};
static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
static constexpr int CLASS_NUM = 80;
static constexpr int INPUT_H = 640; // yolov5's input height and width must be divisible by 32.
static constexpr int INPUT_W = 640;
static constexpr int LOCATIONS = 4;
struct alignas(float) Detection {
//center_x center_y w h
float bbox[LOCATIONS];
float conf; // bbox_conf * cls_conf
float class_id;
};
}
namespace nvinfer1
{
class API YoloLayerPlugin : public IPluginV2IOExt
{
public:
YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<Yolo::YoloKernel>& vYoloKernel);
YoloLayerPlugin(const void* data, size_t length);
~YoloLayerPlugin();
int getNbOutputs() const TRT_NOEXCEPT override
{
return 1;
}
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;
int initialize() TRT_NOEXCEPT override;
virtual void terminate() TRT_NOEXCEPT override {};
virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }
virtual int enqueue(int batchSize, const void* const* inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;
virtual size_t getSerializationSize() const TRT_NOEXCEPT override;
virtual void serialize(void* buffer) const TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override {
return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
}
const char* getPluginType() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
void destroy() TRT_NOEXCEPT override;
IPluginV2IOExt* clone() const TRT_NOEXCEPT override;
void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;
const char* getPluginNamespace() const TRT_NOEXCEPT override;
DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override;
bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override;
bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;
void attachToContext(
cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;
void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override;
void detachFromContext() TRT_NOEXCEPT override;
private:
void forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize = 1);
int mThreadCount = 256;
const char* mPluginNamespace;
int mKernelCount;
int mClassCount;
int mYoloV5NetWidth;
int mYoloV5NetHeight;
int mMaxOutObject;
std::vector<Yolo::YoloKernel> mYoloKernel;
void** mAnchor;
};
class API YoloPluginCreator : public IPluginCreator
{
public:
YoloPluginCreator();
~YoloPluginCreator() override = default;
const char* getPluginName() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;
IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override;
IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;
void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override
{
mNamespace = libNamespace;
}
const char* getPluginNamespace() const TRT_NOEXCEPT override
{
return mNamespace.c_str();
}
private:
std::string mNamespace;
static PluginFieldCollection mFC;
static std::vector<PluginField> mPluginAttributes;
};
REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
};
#endif // _YOLO_LAYER_H
#ifndef _YOLO_LAYER_H
#define _YOLO_LAYER_H
#include <vector>
#include <string>
#include <NvInfer.h>
#include "macros.h"
namespace Yolo
{
static constexpr int CHECK_COUNT = 3;
static constexpr float IGNORE_THRESH = 0.1f;
struct YoloKernel
{
int width;
int height;
float anchors[CHECK_COUNT * 2];
};
static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
static constexpr int CLASS_NUM = 2;
static constexpr int INPUT_H = 244;  // note: yolov5 input H/W must be divisible by 32; 244 is not (use e.g. 224 or 256)
static constexpr int INPUT_W = 244;
static constexpr int LOCATIONS = 4;
struct alignas(float) Detection {
//center_x center_y w h
float bbox[LOCATIONS];
float conf; // bbox_conf * cls_conf
float class_id;
};
}
namespace nvinfer1
{
class API YoloLayerPlugin : public IPluginV2IOExt
{
public:
YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<Yolo::YoloKernel>& vYoloKernel);
YoloLayerPlugin(const void* data, size_t length);
~YoloLayerPlugin();
int getNbOutputs() const TRT_NOEXCEPT override
{
return 1;
}
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;
int initialize() TRT_NOEXCEPT override;
virtual void terminate() TRT_NOEXCEPT override {};
virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }
virtual int enqueue(int batchSize, const void* const* inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;
virtual size_t getSerializationSize() const TRT_NOEXCEPT override;
virtual void serialize(void* buffer) const TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override {
return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
}
const char* getPluginType() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
void destroy() TRT_NOEXCEPT override;
IPluginV2IOExt* clone() const TRT_NOEXCEPT override;
void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;
const char* getPluginNamespace() const TRT_NOEXCEPT override;
DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override;
bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override;
bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;
void attachToContext(
cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;
void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override;
void detachFromContext() TRT_NOEXCEPT override;
private:
void forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize = 1);
int mThreadCount = 256;
const char* mPluginNamespace;
int mKernelCount;
int mClassCount;
int mYoloV5NetWidth;
int mYoloV5NetHeight;
int mMaxOutObject;
std::vector<Yolo::YoloKernel> mYoloKernel;
void** mAnchor;
};
class API YoloPluginCreator : public IPluginCreator
{
public:
YoloPluginCreator();
~YoloPluginCreator() override = default;
const char* getPluginName() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;
IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override;
IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;
void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override
{
mNamespace = libNamespace;
}
const char* getPluginNamespace() const TRT_NOEXCEPT override
{
return mNamespace.c_str();
}
private:
std::string mNamespace;
static PluginFieldCollection mFC;
static std::vector<PluginField> mPluginAttributes;
};
REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
};
#endif // _YOLO_LAYER_H