xuning

引入yolo8

  1 +// Tencent is pleased to support the open source community by making ncnn available.
  2 +//
  3 +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
  4 +//
  5 +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 +// in compliance with the License. You may obtain a copy of the License at
  7 +//
  8 +// https://opensource.org/licenses/BSD-3-Clause
  9 +//
  10 +// Unless required by applicable law or agreed to in writing, software distributed
  11 +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12 +// CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13 +// specific language governing permissions and limitations under the License.
  14 +
  15 +#include "yolov8.h"
  16 +
  17 +YOLOv8::~YOLOv8()
  18 +{
  19 + det_target_size = 320;
  20 +}
  21 +
  22 +int YOLOv8::load(const char* parampath, const char* modelpath, bool use_gpu)
  23 +{
  24 + yolov8.clear();
  25 +
  26 + yolov8.opt = ncnn::Option();
  27 +
  28 +#if NCNN_VULKAN
  29 + yolov8.opt.use_vulkan_compute = use_gpu;
  30 +#endif
  31 +
  32 + yolov8.load_param(parampath);
  33 + yolov8.load_model(modelpath);
  34 +
  35 + return 0;
  36 +}
  37 +
  38 +int YOLOv8::load(AAssetManager* mgr, const char* parampath, const char* modelpath, bool use_gpu)
  39 +{
  40 + yolov8.clear();
  41 +
  42 + yolov8.opt = ncnn::Option();
  43 +
  44 +#if NCNN_VULKAN
  45 + yolov8.opt.use_vulkan_compute = use_gpu;
  46 +#endif
  47 +
  48 + yolov8.load_param(mgr, parampath);
  49 + yolov8.load_model(mgr, modelpath);
  50 +
  51 + return 0;
  52 +}
  53 +
  54 +void YOLOv8::set_det_target_size(int target_size)
  55 +{
  56 + det_target_size = target_size;
  57 +}
  1 +// Tencent is pleased to support the open source community by making ncnn available.
  2 +//
  3 +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
  4 +//
  5 +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 +// in compliance with the License. You may obtain a copy of the License at
  7 +//
  8 +// https://opensource.org/licenses/BSD-3-Clause
  9 +//
  10 +// Unless required by applicable law or agreed to in writing, software distributed
  11 +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12 +// CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13 +// specific language governing permissions and limitations under the License.
  14 +
  15 +#ifndef YOLOV8_H
  16 +#define YOLOV8_H
  17 +
  18 +#include <opencv2/core/core.hpp>
  19 +
  20 +#include <net.h>
  21 +
  // A 2-D point paired with a score. Stored per Object in Object::keypoints.
  // NOTE(review): no code in view fills this in; presumably produced by the
  // pose variant - confirm.
  22 +struct KeyPoint
  23 +{
  24 + cv::Point2f p; // point position (coordinate space not shown here - TODO confirm)
  25 + float prob; // score attached to the point
  26 +};
  27 +
  // One result entry, shared by every YOLOv8 task wrapper declared below.
  // Each wrapper fills only the fields it needs: the classifier's top-k code
  // sets label/prob, the detector's proposal code sets rect/label/prob.
  // The scalar members have no default values.
  28 +struct Object
  29 +{
  30 + cv::Rect_<float> rect; // axis-aligned box (x, y, width, height)
  31 + cv::RotatedRect rrect; // rotated box; unused in the code in view - presumably for the obb variant
  32 + int label; // class index
  33 + float prob; // score of `label`
  34 + int gindex; // NOTE(review): not referenced in the code in view - meaning to be confirmed
  35 + cv::Mat mask; // presumably a per-object mask for the seg variant - confirm
  36 + std::vector<KeyPoint> keypoints; // presumably filled by the pose variant - confirm
  37 +};
  38 +
  39 +class YOLOv8
  40 +{
  41 +public:
  42 + virtual ~YOLOv8();
  43 +
  44 + int load(const char* parampath, const char* modelpath, bool use_gpu = false);
  45 + int load(AAssetManager* mgr, const char* parampath, const char* modelpath, bool use_gpu = false);
  46 +
  47 + void set_det_target_size(int target_size);
  48 +
  49 + virtual int detect(const cv::Mat& rgb, std::vector<Object>& objects) = 0;
  50 + virtual int draw(cv::Mat& rgb, const std::vector<Object>& objects) = 0;
  51 +
  52 +protected:
  53 + ncnn::Net yolov8;
  54 + int det_target_size;
  55 +};
  56 +
  // Object-detection variant: supplies detect() only. draw() is not declared
  // here, so it stays pure virtual and this class remains abstract; the
  // subclasses below provide it.
  57 +class YOLOv8_det : public YOLOv8
  58 +{
  59 +public:
  60 + virtual int detect(const cv::Mat& rgb, std::vector<Object>& objects);
  61 +};
  62 +
  // Detector with a concrete draw(); detect() is inherited from YOLOv8_det.
  // NOTE(review): presumably draws boxes labelled with COCO class names - the
  // implementation is not in view, confirm.
  63 +class YOLOv8_det_coco : public YOLOv8_det
  64 +{
  65 +public:
  66 + virtual int draw(cv::Mat& rgb, const std::vector<Object>& objects);
  67 +};
  68 +
  // Detector with a concrete draw(); detect() is inherited from YOLOv8_det.
  // NOTE(review): presumably draws boxes labelled with Open Images V7 class
  // names - the implementation is not in view, confirm.
  69 +class YOLOv8_det_oiv7 : public YOLOv8_det
  70 +{
  71 +public:
  72 + virtual int draw(cv::Mat& rgb, const std::vector<Object>& objects);
  73 +};
  74 +
  // Concrete variant declaring both detect() and draw().
  // NOTE(review): by its name the segmentation task - implementation not in
  // view, confirm.
  75 +class YOLOv8_seg : public YOLOv8
  76 +{
  77 +public:
  78 + virtual int detect(const cv::Mat& rgb, std::vector<Object>& objects);
  79 + virtual int draw(cv::Mat& rgb, const std::vector<Object>& objects);
  80 +};
  81 +
  // Concrete variant declaring both detect() and draw().
  // NOTE(review): by its name the pose / keypoint task - implementation not in
  // view, confirm.
  82 +class YOLOv8_pose : public YOLOv8
  83 +{
  84 +public:
  85 + virtual int detect(const cv::Mat& rgb, std::vector<Object>& objects);
  86 + virtual int draw(cv::Mat& rgb, const std::vector<Object>& objects);
  87 +};
  88 +
  // Image-classification variant.
  // detect() writes the top-scoring classes to `objects` as label/prob pairs;
  // draw() prints one "percent name" text row per entry from the top-left corner.
  89 +class YOLOv8_cls : public YOLOv8
  90 +{
  91 +public:
  92 + virtual int detect(const cv::Mat& rgb, std::vector<Object>& objects);
  93 + virtual int draw(cv::Mat& rgb, const std::vector<Object>& objects);
  94 +};
  95 +
  // Concrete variant declaring both detect() and draw().
  // NOTE(review): by its name the oriented-bounding-box task (likely the user
  // of Object::rrect) - implementation not in view, confirm.
  96 +class YOLOv8_obb : public YOLOv8
  97 +{
  98 +public:
  99 + virtual int detect(const cv::Mat& rgb, std::vector<Object>& objects);
  100 + virtual int draw(cv::Mat& rgb, const std::vector<Object>& objects);
  101 +};
  102 +
  103 +#endif // YOLOV8_H
  1 +// Tencent is pleased to support the open source community by making ncnn available.
  2 +//
  3 +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
  4 +//
  5 +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 +// in compliance with the License. You may obtain a copy of the License at
  7 +//
  8 +// https://opensource.org/licenses/BSD-3-Clause
  9 +//
  10 +// Unless required by applicable law or agreed to in writing, software distributed
  11 +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12 +// CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13 +// specific language governing permissions and limitations under the License.
  14 +
  15 +// 1. install
  16 +// pip3 install -U ultralytics pnnx ncnn
  17 +// 2. export yolov8-cls torchscript
  18 +// yolo export model=yolov8n-cls.pt format=torchscript
  19 +// 3. convert torchscript with static shape
  20 +// pnnx yolov8n-cls.torchscript
  21 +// 4. now you get ncnn model files
  22 +// yolov8n_cls.ncnn.param
  23 +// yolov8n_cls.ncnn.bin
  24 +
  25 +#include "yolov8.h"
  26 +
  27 +#include <opencv2/core/core.hpp>
  28 +#include <opencv2/imgproc/imgproc.hpp>
  29 +
  30 +#include <float.h>
  31 +#include <stdio.h>
  32 +#include <vector>
  33 +
  34 +static void get_topk(const ncnn::Mat& cls_scores, int topk, std::vector<Object>& objects)
  35 +{
  36 + // partial sort topk with index
  37 + int size = cls_scores.w;
  38 + std::vector<std::pair<float, int> > vec;
  39 + vec.resize(size);
  40 + for (int i = 0; i < size; i++)
  41 + {
  42 + vec[i] = std::make_pair(cls_scores[i], i);
  43 + }
  44 +
  45 + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
  46 + std::greater<std::pair<float, int> >());
  47 +
  48 + objects.resize(topk);
  49 + for (int i = 0; i < topk; i++)
  50 + {
  51 + objects[i].label = vec[i].second;
  52 + objects[i].prob = vec[i].first;
  53 + }
  54 +}
  55 +
  56 +int YOLOv8_cls::detect(const cv::Mat& rgb, std::vector<Object>& objects)
  57 +{
  58 + const int target_size = 224;
  59 + const int topk = 5;
  60 +
  61 + int img_w = rgb.cols;
  62 + int img_h = rgb.rows;
  63 +
  64 + // letterbox pad
  65 + int w = img_w;
  66 + int h = img_h;
  67 + float scale = 1.f;
  68 + if (w > h)
  69 + {
  70 + scale = (float)target_size / w;
  71 + w = target_size;
  72 + h = h * scale;
  73 + }
  74 + else
  75 + {
  76 + scale = (float)target_size / h;
  77 + h = target_size;
  78 + w = w * scale;
  79 + }
  80 +
  81 + ncnn::Mat in = ncnn::Mat::from_pixels_resize(rgb.data, ncnn::Mat::PIXEL_RGB, img_w, img_h, w, h);
  82 +
  83 + // letterbox pad to target_size rectangle
  84 + int wpad = target_size - w;
  85 + int hpad = target_size - h;
  86 + ncnn::Mat in_pad;
  87 + ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
  88 +
  89 + const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
  90 + in_pad.substract_mean_normalize(0, norm_vals);
  91 +
  92 + ncnn::Extractor ex = yolov8.create_extractor();
  93 +
  94 + ex.input("in0", in_pad);
  95 +
  96 + ncnn::Mat out;
  97 + ex.extract("out0", out);
  98 +
  99 + // return top-5
  100 + get_topk(out, topk, objects);
  101 +
  102 + return 0;
  103 +}
  104 +
  105 +int YOLOv8_cls::draw(cv::Mat& rgb, const std::vector<Object>& objects)
  106 +{
  107 + static const char* class_names[] = {
  108 + "tench", "goldfish", "great white shark", "tiger shark", "hammerhead", "electric ray", "stingray", "cock",
  109 + "hen", "ostrich", "brambling", "goldfinch", "house finch", "junco", "indigo bunting", "robin", "bulbul",
  110 + "jay", "magpie", "chickadee", "water ouzel", "kite", "bald eagle", "vulture", "great grey owl",
  111 + "European fire salamander", "common newt", "eft", "spotted salamander", "axolotl", "bullfrog", "tree frog",
  112 + "tailed frog", "loggerhead", "leatherback turtle", "mud turtle", "terrapin", "box turtle", "banded gecko",
  113 + "common iguana", "American chameleon", "whiptail", "agama", "frilled lizard", "alligator lizard",
  114 + "Gila monster", "green lizard", "African chameleon", "Komodo dragon", "African crocodile",
  115 + "American alligator", "triceratops", "thunder snake", "ringneck snake", "hognose snake", "green snake",
  116 + "king snake", "garter snake", "water snake", "vine snake", "night snake", "boa constrictor", "rock python",
  117 + "Indian cobra", "green mamba", "sea snake", "horned viper", "diamondback", "sidewinder", "trilobite",
  118 + "harvestman", "scorpion", "black and gold garden spider", "barn spider", "garden spider", "black widow",
  119 + "tarantula", "wolf spider", "tick", "centipede", "black grouse", "ptarmigan", "ruffed grouse",
  120 + "prairie chicken", "peacock", "quail", "partridge", "African grey", "macaw", "sulphur-crested cockatoo",
  121 + "lorikeet", "coucal", "bee eater", "hornbill", "hummingbird", "jacamar", "toucan", "drake",
  122 + "red-breasted merganser", "goose", "black swan", "tusker", "echidna", "platypus", "wallaby", "koala",
  123 + "wombat", "jellyfish", "sea anemone", "brain coral", "flatworm", "nematode", "conch", "snail", "slug",
  124 + "sea slug", "chiton", "chambered nautilus", "Dungeness crab", "rock crab", "fiddler crab", "king crab",
  125 + "American lobster", "spiny lobster", "crayfish", "hermit crab", "isopod", "white stork", "black stork",
  126 + "spoonbill", "flamingo", "little blue heron", "American egret", "bittern", "crane (bird)", "limpkin",
  127 + "European gallinule", "American coot", "bustard", "ruddy turnstone", "red-backed sandpiper", "redshank",
  128 + "dowitcher", "oystercatcher", "pelican", "king penguin", "albatross", "grey whale", "killer whale",
  129 + "dugong", "sea lion", "Chihuahua", "Japanese spaniel", "Maltese dog", "Pekinese", "Shih-Tzu",
  130 + "Blenheim spaniel", "papillon", "toy terrier", "Rhodesian ridgeback", "Afghan hound", "basset", "beagle",
  131 + "bloodhound", "bluetick", "black-and-tan coonhound", "Walker hound", "English foxhound", "redbone",
  132 + "borzoi", "Irish wolfhound", "Italian greyhound", "whippet", "Ibizan hound", "Norwegian elkhound",
  133 + "otterhound", "Saluki", "Scottish deerhound", "Weimaraner", "Staffordshire bullterrier",
  134 + "American Staffordshire terrier", "Bedlington terrier", "Border terrier", "Kerry blue terrier",
  135 + "Irish terrier", "Norfolk terrier", "Norwich terrier", "Yorkshire terrier", "wire-haired fox terrier",
  136 + "Lakeland terrier", "Sealyham terrier", "Airedale", "cairn", "Australian terrier", "Dandie Dinmont",
  137 + "Boston bull", "miniature schnauzer", "giant schnauzer", "standard schnauzer", "Scotch terrier",
  138 + "Tibetan terrier", "silky terrier", "soft-coated wheaten terrier", "West Highland white terrier",
  139 + "Lhasa", "flat-coated retriever", "curly-coated retriever", "golden retriever", "Labrador retriever",
  140 + "Chesapeake Bay retriever", "German short-haired pointer", "vizsla", "English setter", "Irish setter",
  141 + "Gordon setter", "Brittany spaniel", "clumber", "English springer", "Welsh springer spaniel",
  142 + "cocker spaniel", "Sussex spaniel", "Irish water spaniel", "kuvasz", "schipperke", "groenendael",
  143 + "malinois", "briard", "kelpie", "komondor", "Old English sheepdog", "Shetland sheepdog", "collie",
  144 + "Border collie", "Bouvier des Flandres", "Rottweiler", "German shepherd", "Doberman",
  145 + "miniature pinscher", "Greater Swiss Mountain dog", "Bernese mountain dog", "Appenzeller", "EntleBucher",
  146 + "boxer", "bull mastiff", "Tibetan mastiff", "French bulldog", "Great Dane", "Saint Bernard",
  147 + "Eskimo dog", "malamute", "Siberian husky", "dalmatian", "affenpinscher", "basenji", "pug", "Leonberg",
  148 + "Newfoundland", "Great Pyrenees", "Samoyed", "Pomeranian", "chow", "keeshond", "Brabancon griffon",
  149 + "Pembroke", "Cardigan", "toy poodle", "miniature poodle", "standard poodle", "Mexican hairless",
  150 + "timber wolf", "white wolf", "red wolf", "coyote", "dingo", "dhole", "African hunting dog", "hyena",
  151 + "red fox", "kit fox", "Arctic fox", "grey fox", "tabby", "tiger cat", "Persian cat", "Siamese cat",
  152 + "Egyptian cat", "cougar", "lynx", "leopard", "snow leopard", "jaguar", "lion", "tiger", "cheetah",
  153 + "brown bear", "American black bear", "ice bear", "sloth bear", "mongoose", "meerkat", "tiger beetle",
  154 + "ladybug", "ground beetle", "long-horned beetle", "leaf beetle", "dung beetle", "rhinoceros beetle",
  155 + "weevil", "fly", "bee", "ant", "grasshopper", "cricket", "walking stick", "cockroach", "mantis",
  156 + "cicada", "leafhopper", "lacewing", "dragonfly", "damselfly", "admiral", "ringlet", "monarch",
  157 + "cabbage butterfly", "sulphur butterfly", "lycaenid", "starfish", "sea urchin", "sea cucumber",
  158 + "wood rabbit", "hare", "Angora", "hamster", "porcupine", "fox squirrel", "marmot", "beaver",
  159 + "guinea pig", "sorrel", "zebra", "hog", "wild boar", "warthog", "hippopotamus", "ox", "water buffalo",
  160 + "bison", "ram", "bighorn", "ibex", "hartebeest", "impala", "gazelle", "Arabian camel", "llama",
  161 + "weasel", "mink", "polecat", "black-footed ferret", "otter", "skunk", "badger", "armadillo",
  162 + "three-toed sloth", "orangutan", "gorilla", "chimpanzee", "gibbon", "siamang", "guenon", "patas",
  163 + "baboon", "macaque", "langur", "colobus", "proboscis monkey", "marmoset", "capuchin", "howler monkey",
  164 + "titi", "spider monkey", "squirrel monkey", "Madagascar cat", "indri", "Indian elephant",
  165 + "African elephant", "lesser panda", "giant panda", "barracouta", "eel", "coho", "rock beauty",
  166 + "anemone fish", "sturgeon", "gar", "lionfish", "puffer", "abacus", "abaya", "academic gown",
  167 + "accordion", "acoustic guitar", "aircraft carrier", "airliner", "airship", "altar", "ambulance",
  168 + "amphibian", "analog clock", "apiary", "apron", "ashcan", "assault rifle", "backpack", "bakery",
  169 + "balance beam", "balloon", "ballpoint", "Band Aid", "banjo", "bannister", "barbell", "barber chair",
  170 + "barbershop", "barn", "barometer", "barrel", "barrow", "baseball", "basketball", "bassinet", "bassoon",
  171 + "bathing cap", "bath towel", "bathtub", "beach wagon", "beacon", "beaker", "bearskin", "beer bottle",
  172 + "beer glass", "bell cote", "bib", "bicycle-built-for-two", "bikini", "binder", "binoculars",
  173 + "birdhouse", "boathouse", "bobsled", "bolo tie", "bonnet", "bookcase", "bookshop", "bottlecap", "bow",
  174 + "bow tie", "brass", "brassiere", "breakwater", "breastplate", "broom", "bucket", "buckle",
  175 + "bulletproof vest", "bullet train", "butcher shop", "cab", "caldron", "candle", "cannon", "canoe",
  176 + "can opener", "cardigan", "car mirror", "carousel", "carpenter's kit", "carton", "car wheel",
  177 + "cash machine", "cassette", "cassette player", "castle", "catamaran", "CD player", "cello",
  178 + "cellular telephone", "chain", "chainlink fence", "chain mail", "chain saw", "chest", "chiffonier",
  179 + "chime", "china cabinet", "Christmas stocking", "church", "cinema", "cleaver", "cliff dwelling",
  180 + "cloak", "clog", "cocktail shaker", "coffee mug", "coffeepot", "coil", "combination lock",
  181 + "computer keyboard", "confectionery", "container ship", "convertible", "corkscrew", "cornet",
  182 + "cowboy boot", "cowboy hat", "cradle", "crane (machine)", "crash helmet", "crate", "crib",
  183 + "Crock Pot", "croquet ball", "crutch", "cuirass", "dam", "desk", "desktop computer", "dial telephone",
  184 + "diaper", "digital clock", "digital watch", "dining table", "dishrag", "dishwasher", "disk brake",
  185 + "dock", "dogsled", "dome", "doormat", "drilling platform", "drum", "drumstick", "dumbbell",
  186 + "Dutch oven", "electric fan", "electric guitar", "electric locomotive", "entertainment center",
  187 + "envelope", "espresso maker", "face powder", "feather boa", "file", "fireboat", "fire engine",
  188 + "fire screen", "flagpole", "flute", "folding chair", "football helmet", "forklift", "fountain",
  189 + "fountain pen", "four-poster", "freight car", "French horn", "frying pan", "fur coat", "garbage truck",
  190 + "gasmask", "gas pump", "goblet", "go-kart", "golf ball", "golfcart", "gondola", "gong", "gown",
  191 + "grand piano", "greenhouse", "grille", "grocery store", "guillotine", "hair slide", "hair spray",
  192 + "half track", "hammer", "hamper", "hand blower", "hand-held computer", "handkerchief", "hard disc",
  193 + "harmonica", "harp", "harvester", "hatchet", "holster", "home theater", "honeycomb", "hook",
  194 + "hoopskirt", "horizontal bar", "horse cart", "hourglass", "iPod", "iron", "jack-o'-lantern", "jean",
  195 + "jeep", "jersey", "jigsaw puzzle", "jinrikisha", "joystick", "kimono", "knee pad", "knot", "lab coat",
  196 + "ladle", "lampshade", "laptop", "lawn mower", "lens cap", "letter opener", "library", "lifeboat",
  197 + "lighter", "limousine", "liner", "lipstick", "Loafer", "lotion", "loudspeaker", "loupe", "lumbermill",
  198 + "magnetic compass", "mailbag", "mailbox", "maillot (tights)", "maillot (tank suit)", "manhole cover",
  199 + "maraca", "marimba", "mask", "matchstick", "maypole", "maze", "measuring cup", "medicine chest",
  200 + "megalith", "microphone", "microwave", "military uniform", "milk can", "minibus", "miniskirt",
  201 + "minivan", "missile", "mitten", "mixing bowl", "mobile home", "Model T", "modem", "monastery",
  202 + "monitor", "moped", "mortar", "mortarboard", "mosque", "mosquito net", "motor scooter", "mountain bike",
  203 + "mountain tent", "mouse", "mousetrap", "moving van", "muzzle", "nail", "neck brace", "necklace",
  204 + "nipple", "notebook", "obelisk", "oboe", "ocarina", "odometer", "oil filter", "organ", "oscilloscope",
  205 + "overskirt", "oxcart", "oxygen mask", "packet", "paddle", "paddlewheel", "padlock", "paintbrush",
  206 + "pajama", "palace", "panpipe", "paper towel", "parachute", "parallel bars", "park bench",
  207 + "parking meter", "passenger car", "patio", "pay-phone", "pedestal", "pencil box", "pencil sharpener",
  208 + "perfume", "Petri dish", "photocopier", "pick", "pickelhaube", "picket fence", "pickup", "pier",
  209 + "piggy bank", "pill bottle", "pillow", "ping-pong ball", "pinwheel", "pirate", "pitcher", "plane",
  210 + "planetarium", "plastic bag", "plate rack", "plow", "plunger", "Polaroid camera", "pole",
  211 + "police van", "poncho", "pool table", "pop bottle", "pot", "potter's wheel", "power drill",
  212 + "prayer rug", "printer", "prison", "projectile", "projector", "puck", "punching bag", "purse",
  213 + "quill", "quilt", "racer", "racket", "radiator", "radio", "radio telescope", "rain barrel",
  214 + "recreational vehicle", "reel", "reflex camera", "refrigerator", "remote control", "restaurant",
  215 + "revolver", "rifle", "rocking chair", "rotisserie", "rubber eraser", "rugby ball", "rule",
  216 + "running shoe", "safe", "safety pin", "saltshaker", "sandal", "sarong", "sax", "scabbard", "scale",
  217 + "school bus", "schooner", "scoreboard", "screen", "screw", "screwdriver", "seat belt", "sewing machine",
  218 + "shield", "shoe shop", "shoji", "shopping basket", "shopping cart", "shovel", "shower cap",
  219 + "shower curtain", "ski", "ski mask", "sleeping bag", "slide rule", "sliding door", "slot", "snorkel",
  220 + "snowmobile", "snowplow", "soap dispenser", "soccer ball", "sock", "solar dish", "sombrero",
  221 + "soup bowl", "space bar", "space heater", "space shuttle", "spatula", "speedboat", "spider web",
  222 + "spindle", "sports car", "spotlight", "stage", "steam locomotive", "steel arch bridge", "steel drum",
  223 + "stethoscope", "stole", "stone wall", "stopwatch", "stove", "strainer", "streetcar", "stretcher",
  224 + "studio couch", "stupa", "submarine", "suit", "sundial", "sunglass", "sunglasses", "sunscreen",
  225 + "suspension bridge", "swab", "sweatshirt", "swimming trunks", "swing", "switch", "syringe",
  226 + "table lamp", "tank", "tape player", "teapot", "teddy", "television", "tennis ball", "thatch",
  227 + "theater curtain", "thimble", "thresher", "throne", "tile roof", "toaster", "tobacco shop",
  228 + "toilet seat", "torch", "totem pole", "tow truck", "toyshop", "tractor", "trailer truck", "tray",
  229 + "trench coat", "tricycle", "trimaran", "tripod", "triumphal arch", "trolleybus", "trombone", "tub",
  230 + "turnstile", "typewriter keyboard", "umbrella", "unicycle", "upright", "vacuum", "vase", "vault",
  231 + "velvet", "vending machine", "vestment", "viaduct", "violin", "volleyball", "waffle iron", "wall clock",
  232 + "wallet", "wardrobe", "warplane", "washbasin", "washer", "water bottle", "water jug", "water tower",
  233 + "whiskey jug", "whistle", "wig", "window screen", "window shade", "Windsor tie", "wine bottle", "wing",
  234 + "wok", "wooden spoon", "wool", "worm fence", "wreck", "yawl", "yurt", "web site", "comic book",
  235 + "crossword puzzle", "street sign", "traffic light", "book jacket", "menu", "plate", "guacamole",
  236 + "consomme", "hot pot", "trifle", "ice cream", "ice lolly", "French loaf", "bagel", "pretzel",
  237 + "cheeseburger", "hotdog", "mashed potato", "head cabbage", "broccoli", "cauliflower", "zucchini",
  238 + "spaghetti squash", "acorn squash", "butternut squash", "cucumber", "artichoke", "bell pepper",
  239 + "cardoon", "mushroom", "Granny Smith", "strawberry", "orange", "lemon", "fig", "pineapple", "banana",
  240 + "jackfruit", "custard apple", "pomegranate", "hay", "carbonara", "chocolate sauce", "dough",
  241 + "meat loaf", "pizza", "potpie", "burrito", "red wine", "espresso", "cup", "eggnog", "alp", "bubble",
  242 + "cliff", "coral reef", "geyser", "lakeside", "promontory", "sandbar", "seashore", "valley", "volcano",
  243 + "ballplayer", "groom", "scuba diver", "rapeseed", "daisy", "yellow lady's slipper", "corn", "acorn",
  244 + "hip", "buckeye", "coral fungus", "agaric", "gyromitra", "stinkhorn", "earthstar", "hen-of-the-woods",
  245 + "bolete", "ear", "toilet tissue"
  246 + };
  247 +
  248 + int y_offset = 0;
  249 + for (size_t i = 0; i < objects.size(); i++)
  250 + {
  251 + const Object& obj = objects[i];
  252 +
  253 + // fprintf(stderr, "%d = %.5f\n", obj.label, obj.prob);
  254 +
  255 + char text[256];
  256 + sprintf(text, "%4.1f%% %s", obj.prob * 100, class_names[obj.label]);
  257 +
  258 + int baseLine = 0;
  259 + cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
  260 +
  261 + int x = 0;
  262 + int y = y_offset;
  263 +
  264 + cv::rectangle(rgb, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
  265 + cv::Scalar(255, 255, 255), -1);
  266 +
  267 + cv::putText(rgb, text, cv::Point(x, y + label_size.height),
  268 + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
  269 +
  270 + y_offset += label_size.height;
  271 + }
  272 +
  273 + return 0;
  274 +}
  1 +// Tencent is pleased to support the open source community by making ncnn available.
  2 +//
  3 +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
  4 +//
  5 +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 +// in compliance with the License. You may obtain a copy of the License at
  7 +//
  8 +// https://opensource.org/licenses/BSD-3-Clause
  9 +//
  10 +// Unless required by applicable law or agreed to in writing, software distributed
  11 +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12 +// CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13 +// specific language governing permissions and limitations under the License.
  14 +
  15 +// 1. install
  16 +// pip3 install -U ultralytics pnnx ncnn
  17 +// 2. export yolov8 torchscript
  18 +// yolo export model=yolov8n.pt format=torchscript
  19 +// 3. convert torchscript with static shape
  20 +// pnnx yolov8n.torchscript
  21 +// 4. modify yolov8n_pnnx.py for dynamic shape inference
  22 +// A. modify reshape to support dynamic image sizes
  23 +// B. permute tensor before concat and adjust concat axis
  24 +// C. drop post-process part
  25 +// before:
  26 +// v_165 = v_142.view(1, 144, 6400)
  27 +// v_166 = v_153.view(1, 144, 1600)
  28 +// v_167 = v_164.view(1, 144, 400)
  29 +// v_168 = torch.cat((v_165, v_166, v_167), dim=2)
  30 +// ...
  31 +// after:
  32 +// v_165 = v_142.view(1, 144, -1).transpose(1, 2)
  33 +// v_166 = v_153.view(1, 144, -1).transpose(1, 2)
  34 +// v_167 = v_164.view(1, 144, -1).transpose(1, 2)
  35 +// v_168 = torch.cat((v_165, v_166, v_167), dim=1)
  36 +// return v_168
  37 +// 5. re-export yolov8 torchscript
  38 +// python3 -c 'import yolov8n_pnnx; yolov8n_pnnx.export_torchscript()'
  39 +// 6. convert new torchscript with dynamic shape
  40 +// pnnx yolov8n_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320]
  41 +// 7. now you get ncnn model files
  42 +// mv yolov8n_pnnx.py.ncnn.param yolov8n.ncnn.param
  43 +// mv yolov8n_pnnx.py.ncnn.bin yolov8n.ncnn.bin
  44 +
  45 +// the out blob would be a 2-dim tensor with w=144 h=8400
  46 +//
  //        | bbox-reg 16 x 4       | per-class scores(80) |
  //        +-----+-----+-----+-----+----------------------+
  //        | dx0 | dy0 | dx1 | dy1 |0.1 0.0 0.0 0.5 ......|
  //   all /|     |     |     |     |           .          |
  //  boxes |  .. |  .. |  .. |  .. |0.0 0.9 0.0 0.0 ......|
  //  (8400)|     |     |     |     |           .          |
  //       \|     |     |     |     |           .          |
  //        +-----+-----+-----+-----+----------------------+
  55 +//
  56 +
  57 +#include "yolov8.h"
  58 +
  59 +#include <opencv2/core/core.hpp>
  60 +#include <opencv2/imgproc/imgproc.hpp>
  61 +
  62 +static inline float intersection_area(const Object& a, const Object& b)
  63 +{
  64 + cv::Rect_<float> inter = a.rect & b.rect;
  65 + return inter.area();
  66 +}
  67 +
  68 +static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
  69 +{
  70 + int i = left;
  71 + int j = right;
  72 + float p = objects[(left + right) / 2].prob;
  73 +
  74 + while (i <= j)
  75 + {
  76 + while (objects[i].prob > p)
  77 + i++;
  78 +
  79 + while (objects[j].prob < p)
  80 + j--;
  81 +
  82 + if (i <= j)
  83 + {
  84 + // swap
  85 + std::swap(objects[i], objects[j]);
  86 +
  87 + i++;
  88 + j--;
  89 + }
  90 + }
  91 +
  92 + // #pragma omp parallel sections
  93 + {
  94 + // #pragma omp section
  95 + {
  96 + if (left < j) qsort_descent_inplace(objects, left, j);
  97 + }
  98 + // #pragma omp section
  99 + {
  100 + if (i < right) qsort_descent_inplace(objects, i, right);
  101 + }
  102 + }
  103 +}
  104 +
  105 +static void qsort_descent_inplace(std::vector<Object>& objects)
  106 +{
  107 + if (objects.empty())
  108 + return;
  109 +
  110 + qsort_descent_inplace(objects, 0, objects.size() - 1);
  111 +}
  112 +
  113 +static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
  114 +{
  115 + picked.clear();
  116 +
  117 + const int n = objects.size();
  118 +
  119 + std::vector<float> areas(n);
  120 + for (int i = 0; i < n; i++)
  121 + {
  122 + areas[i] = objects[i].rect.area();
  123 + }
  124 +
  125 + for (int i = 0; i < n; i++)
  126 + {
  127 + const Object& a = objects[i];
  128 +
  129 + int keep = 1;
  130 + for (int j = 0; j < (int)picked.size(); j++)
  131 + {
  132 + const Object& b = objects[picked[j]];
  133 +
  134 + if (!agnostic && a.label != b.label)
  135 + continue;
  136 +
  137 + // intersection over union
  138 + float inter_area = intersection_area(a, b);
  139 + float union_area = areas[i] + areas[picked[j]] - inter_area;
  140 + // float IoU = inter_area / union_area
  141 + if (inter_area / union_area > nms_threshold)
  142 + keep = 0;
  143 + }
  144 +
  145 + if (keep)
  146 + picked.push_back(i);
  147 + }
  148 +}
  149 +
  150 +static inline float sigmoid(float x)
  151 +{
  152 + return 1.0f / (1.0f + expf(-x));
  153 +}
  154 +
  155 +static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
  156 +{
  157 + const int w = in_pad.w;
  158 + const int h = in_pad.h;
  159 +
  160 + const int num_grid_x = w / stride;
  161 + const int num_grid_y = h / stride;
  162 +
  163 + const int reg_max_1 = 16;
  164 + const int num_class = pred.w - reg_max_1 * 4; // number of classes. 80 for COCO
  165 +
  166 + for (int y = 0; y < num_grid_y; y++)
  167 + {
  168 + for (int x = 0; x < num_grid_x; x++)
  169 + {
  170 + const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1);
  171 +
  172 + // find label with max score
  173 + int label = -1;
  174 + float score = -FLT_MAX;
  175 + {
  176 + const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class);
  177 +
  178 + for (int k = 0; k < num_class; k++)
  179 + {
  180 + float s = pred_score[k];
  181 + if (s > score)
  182 + {
  183 + label = k;
  184 + score = s;
  185 + }
  186 + }
  187 +
  188 + score = sigmoid(score);
  189 + }
  190 +
  191 + if (score >= prob_threshold)
  192 + {
  193 + ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4);
  194 +
  195 + {
  196 + ncnn::Layer* softmax = ncnn::create_layer("Softmax");
  197 +
  198 + ncnn::ParamDict pd;
  199 + pd.set(0, 1); // axis
  200 + pd.set(1, 1);
  201 + softmax->load_param(pd);
  202 +
  203 + ncnn::Option opt;
  204 + opt.num_threads = 1;
  205 + opt.use_packing_layout = false;
  206 +
  207 + softmax->create_pipeline(opt);
  208 +
  209 + softmax->forward_inplace(pred_bbox, opt);
  210 +
  211 + softmax->destroy_pipeline(opt);
  212 +
  213 + delete softmax;
  214 + }
  215 +
  216 + float pred_ltrb[4];
  217 + for (int k = 0; k < 4; k++)
  218 + {
  219 + float dis = 0.f;
  220 + const float* dis_after_sm = pred_bbox.row(k);
  221 + for (int l = 0; l < reg_max_1; l++)
  222 + {
  223 + dis += l * dis_after_sm[l];
  224 + }
  225 +
  226 + pred_ltrb[k] = dis * stride;
  227 + }
  228 +
  229 + float pb_cx = (x + 0.5f) * stride;
  230 + float pb_cy = (y + 0.5f) * stride;
  231 +
  232 + float x0 = pb_cx - pred_ltrb[0];
  233 + float y0 = pb_cy - pred_ltrb[1];
  234 + float x1 = pb_cx + pred_ltrb[2];
  235 + float y1 = pb_cy + pred_ltrb[3];
  236 +
  237 + Object obj;
  238 + obj.rect.x = x0;
  239 + obj.rect.y = y0;
  240 + obj.rect.width = x1 - x0;
  241 + obj.rect.height = y1 - y0;
  242 + obj.label = label;
  243 + obj.prob = score;
  244 +
  245 + objects.push_back(obj);
  246 + }
  247 + }
  248 + }
  249 +}
  250 +
  251 +static void generate_proposals(const ncnn::Mat& pred, const std::vector<int>& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
  252 +{
  253 + const int w = in_pad.w;
  254 + const int h = in_pad.h;
  255 +
  256 + int pred_row_offset = 0;
  257 + for (size_t i = 0; i < strides.size(); i++)
  258 + {
  259 + const int stride = strides[i];
  260 +
  261 + const int num_grid_x = w / stride;
  262 + const int num_grid_y = h / stride;
  263 + const int num_grid = num_grid_x * num_grid_y;
  264 +
  265 + generate_proposals(pred.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects);
  266 + pred_row_offset += num_grid;
  267 + }
  268 +}
  269 +
  270 +int YOLOv8_det::detect(const cv::Mat& rgb, std::vector<Object>& objects)
  271 +{
  272 + const int target_size = det_target_size;//640;
  273 + const float prob_threshold = 0.25f;
  274 + const float nms_threshold = 0.45f;
  275 +
  276 + int img_w = rgb.cols;
  277 + int img_h = rgb.rows;
  278 +
  279 + // ultralytics/cfg/models/v8/yolov8.yaml
  280 + std::vector<int> strides(3);
  281 + strides[0] = 8;
  282 + strides[1] = 16;
  283 + strides[2] = 32;
  284 + const int max_stride = 32;
  285 +
  286 + // letterbox pad to multiple of max_stride
  287 + int w = img_w;
  288 + int h = img_h;
  289 + float scale = 1.f;
  290 + if (w > h)
  291 + {
  292 + scale = (float)target_size / w;
  293 + w = target_size;
  294 + h = h * scale;
  295 + }
  296 + else
  297 + {
  298 + scale = (float)target_size / h;
  299 + h = target_size;
  300 + w = w * scale;
  301 + }
  302 +
  303 + ncnn::Mat in = ncnn::Mat::from_pixels_resize(rgb.data, ncnn::Mat::PIXEL_RGB, img_w, img_h, w, h);
  304 +
  305 + // letterbox pad to target_size rectangle
  306 + int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
  307 + int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
  308 + ncnn::Mat in_pad;
  309 + ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
  310 +
  311 + const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
  312 + in_pad.substract_mean_normalize(0, norm_vals);
  313 +
  314 + ncnn::Extractor ex = yolov8.create_extractor();
  315 +
  316 + ex.input("in0", in_pad);
  317 +
  318 + ncnn::Mat out;
  319 + ex.extract("out0", out);
  320 +
  321 + std::vector<Object> proposals;
  322 + generate_proposals(out, strides, in_pad, prob_threshold, proposals);
  323 +
  324 + // sort all proposals by score from highest to lowest
  325 + qsort_descent_inplace(proposals);
  326 +
  327 + // apply nms with nms_threshold
  328 + std::vector<int> picked;
  329 + nms_sorted_bboxes(proposals, picked, nms_threshold);
  330 +
  331 + int count = picked.size();
  332 +
  333 + objects.resize(count);
  334 + for (int i = 0; i < count; i++)
  335 + {
  336 + objects[i] = proposals[picked[i]];
  337 +
  338 + // adjust offset to original unpadded
  339 + float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
  340 + float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
  341 + float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
  342 + float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
  343 +
  344 + // clip
  345 + x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
  346 + y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
  347 + x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
  348 + y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
  349 +
  350 + objects[i].rect.x = x0;
  351 + objects[i].rect.y = y0;
  352 + objects[i].rect.width = x1 - x0;
  353 + objects[i].rect.height = y1 - y0;
  354 + }
  355 +
  356 + // sort objects by area
  357 + struct
  358 + {
  359 + bool operator()(const Object& a, const Object& b) const
  360 + {
  361 + return a.rect.area() > b.rect.area();
  362 + }
  363 + } objects_area_greater;
  364 + std::sort(objects.begin(), objects.end(), objects_area_greater);
  365 +
  366 + return 0;
  367 +}
  368 +
  369 +int YOLOv8_det_coco::draw(cv::Mat& rgb, const std::vector<Object>& objects)
  370 +{
  371 + static const char* class_names[] = {
  372 + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
  373 + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
  374 + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
  375 + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
  376 + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
  377 + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
  378 + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
  379 + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
  380 + "hair drier", "toothbrush"
  381 + };
  382 +
  383 + static cv::Scalar colors[] = {
  384 + cv::Scalar( 67, 54, 244),
  385 + cv::Scalar( 30, 99, 233),
  386 + cv::Scalar( 39, 176, 156),
  387 + cv::Scalar( 58, 183, 103),
  388 + cv::Scalar( 81, 181, 63),
  389 + cv::Scalar(150, 243, 33),
  390 + cv::Scalar(169, 244, 3),
  391 + cv::Scalar(188, 212, 0),
  392 + cv::Scalar(150, 136, 0),
  393 + cv::Scalar(175, 80, 76),
  394 + cv::Scalar(195, 74, 139),
  395 + cv::Scalar(220, 57, 205),
  396 + cv::Scalar(235, 59, 255),
  397 + cv::Scalar(193, 7, 255),
  398 + cv::Scalar(152, 0, 255),
  399 + cv::Scalar( 87, 34, 255),
  400 + cv::Scalar( 85, 72, 121),
  401 + cv::Scalar(158, 158, 158),
  402 + cv::Scalar(125, 139, 96)
  403 + };
  404 +
  405 + for (size_t i = 0; i < objects.size(); i++)
  406 + {
  407 + const Object& obj = objects[i];
  408 +
  409 + const cv::Scalar& color = colors[i % 19];
  410 +
  411 + // fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
  412 + // obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
  413 +
  414 + cv::rectangle(rgb, obj.rect, color);
  415 +
  416 + char text[256];
  417 + sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
  418 +
  419 + int baseLine = 0;
  420 + cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
  421 +
  422 + int x = obj.rect.x;
  423 + int y = obj.rect.y - label_size.height - baseLine;
  424 + if (y < 0)
  425 + y = 0;
  426 + if (x + label_size.width > rgb.cols)
  427 + x = rgb.cols - label_size.width;
  428 +
  429 + cv::rectangle(rgb, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
  430 + cv::Scalar(255, 255, 255), -1);
  431 +
  432 + cv::putText(rgb, text, cv::Point(x, y + label_size.height),
  433 + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
  434 + }
  435 +
  436 + return 0;
  437 +}
  438 +
  439 +int YOLOv8_det_oiv7::draw(cv::Mat& rgb, const std::vector<Object>& objects)
  440 +{
  441 + static const char* class_names[] = {
  442 + "Accordion", "Adhesive tape", "Aircraft", "Airplane", "Alarm clock", "Alpaca", "Ambulance", "Animal",
  443 + "Ant", "Antelope", "Apple", "Armadillo", "Artichoke", "Auto part", "Axe", "Backpack", "Bagel",
  444 + "Baked goods", "Balance beam", "Ball", "Balloon", "Banana", "Band-aid", "Banjo", "Barge", "Barrel",
  445 + "Baseball bat", "Baseball glove", "Bat (Animal)", "Bathroom accessory", "Bathroom cabinet", "Bathtub",
  446 + "Beaker", "Bear", "Bed", "Bee", "Beehive", "Beer", "Beetle", "Bell pepper", "Belt", "Bench", "Bicycle",
  447 + "Bicycle helmet", "Bicycle wheel", "Bidet", "Billboard", "Billiard table", "Binoculars", "Bird",
  448 + "Blender", "Blue jay", "Boat", "Bomb", "Book", "Bookcase", "Boot", "Bottle", "Bottle opener",
  449 + "Bow and arrow", "Bowl", "Bowling equipment", "Box", "Boy", "Brassiere", "Bread", "Briefcase",
  450 + "Broccoli", "Bronze sculpture", "Brown bear", "Building", "Bull", "Burrito", "Bus", "Bust", "Butterfly",
  451 + "Cabbage", "Cabinetry", "Cake", "Cake stand", "Calculator", "Camel", "Camera", "Can opener", "Canary",
  452 + "Candle", "Candy", "Cannon", "Canoe", "Cantaloupe", "Car", "Carnivore", "Carrot", "Cart", "Cassette deck",
  453 + "Castle", "Cat", "Cat furniture", "Caterpillar", "Cattle", "Ceiling fan", "Cello", "Centipede",
  454 + "Chainsaw", "Chair", "Cheese", "Cheetah", "Chest of drawers", "Chicken", "Chime", "Chisel", "Chopsticks",
  455 + "Christmas tree", "Clock", "Closet", "Clothing", "Coat", "Cocktail", "Cocktail shaker", "Coconut",
  456 + "Coffee", "Coffee cup", "Coffee table", "Coffeemaker", "Coin", "Common fig", "Common sunflower",
  457 + "Computer keyboard", "Computer monitor", "Computer mouse", "Container", "Convenience store", "Cookie",
  458 + "Cooking spray", "Corded phone", "Cosmetics", "Couch", "Countertop", "Cowboy hat", "Crab", "Cream",
  459 + "Cricket ball", "Crocodile", "Croissant", "Crown", "Crutch", "Cucumber", "Cupboard", "Curtain",
  460 + "Cutting board", "Dagger", "Dairy Product", "Deer", "Desk", "Dessert", "Diaper", "Dice", "Digital clock",
  461 + "Dinosaur", "Dishwasher", "Dog", "Dog bed", "Doll", "Dolphin", "Door", "Door handle", "Doughnut",
  462 + "Dragonfly", "Drawer", "Dress", "Drill (Tool)", "Drink", "Drinking straw", "Drum", "Duck", "Dumbbell",
  463 + "Eagle", "Earrings", "Egg (Food)", "Elephant", "Envelope", "Eraser", "Face powder", "Facial tissue holder",
  464 + "Falcon", "Fashion accessory", "Fast food", "Fax", "Fedora", "Filing cabinet", "Fire hydrant",
  465 + "Fireplace", "Fish", "Flag", "Flashlight", "Flower", "Flowerpot", "Flute", "Flying disc", "Food",
  466 + "Food processor", "Football", "Football helmet", "Footwear", "Fork", "Fountain", "Fox", "French fries",
  467 + "French horn", "Frog", "Fruit", "Frying pan", "Furniture", "Garden Asparagus", "Gas stove", "Giraffe",
  468 + "Girl", "Glasses", "Glove", "Goat", "Goggles", "Goldfish", "Golf ball", "Golf cart", "Gondola",
  469 + "Goose", "Grape", "Grapefruit", "Grinder", "Guacamole", "Guitar", "Hair dryer", "Hair spray", "Hamburger",
  470 + "Hammer", "Hamster", "Hand dryer", "Handbag", "Handgun", "Harbor seal", "Harmonica", "Harp",
  471 + "Harpsichord", "Hat", "Headphones", "Heater", "Hedgehog", "Helicopter", "Helmet", "High heels",
  472 + "Hiking equipment", "Hippopotamus", "Home appliance", "Honeycomb", "Horizontal bar", "Horse", "Hot dog",
  473 + "House", "Houseplant", "Human arm", "Human beard", "Human body", "Human ear", "Human eye", "Human face",
  474 + "Human foot", "Human hair", "Human hand", "Human head", "Human leg", "Human mouth", "Human nose",
  475 + "Humidifier", "Ice cream", "Indoor rower", "Infant bed", "Insect", "Invertebrate", "Ipod", "Isopod",
  476 + "Jacket", "Jacuzzi", "Jaguar (Animal)", "Jeans", "Jellyfish", "Jet ski", "Jug", "Juice", "Kangaroo",
  477 + "Kettle", "Kitchen & dining room table", "Kitchen appliance", "Kitchen knife", "Kitchen utensil",
  478 + "Kitchenware", "Kite", "Knife", "Koala", "Ladder", "Ladle", "Ladybug", "Lamp", "Land vehicle",
  479 + "Lantern", "Laptop", "Lavender (Plant)", "Lemon", "Leopard", "Light bulb", "Light switch", "Lighthouse",
  480 + "Lily", "Limousine", "Lion", "Lipstick", "Lizard", "Lobster", "Loveseat", "Luggage and bags", "Lynx",
  481 + "Magpie", "Mammal", "Man", "Mango", "Maple", "Maracas", "Marine invertebrates", "Marine mammal",
  482 + "Measuring cup", "Mechanical fan", "Medical equipment", "Microphone", "Microwave oven", "Milk",
  483 + "Miniskirt", "Mirror", "Missile", "Mixer", "Mixing bowl", "Mobile phone", "Monkey", "Moths and butterflies",
  484 + "Motorcycle", "Mouse", "Muffin", "Mug", "Mule", "Mushroom", "Musical instrument", "Musical keyboard",
  485 + "Nail (Construction)", "Necklace", "Nightstand", "Oboe", "Office building", "Office supplies", "Orange",
  486 + "Organ (Musical Instrument)", "Ostrich", "Otter", "Oven", "Owl", "Oyster", "Paddle", "Palm tree",
  487 + "Pancake", "Panda", "Paper cutter", "Paper towel", "Parachute", "Parking meter", "Parrot", "Pasta",
  488 + "Pastry", "Peach", "Pear", "Pen", "Pencil case", "Pencil sharpener", "Penguin", "Perfume", "Person",
  489 + "Personal care", "Personal flotation device", "Piano", "Picnic basket", "Picture frame", "Pig",
  490 + "Pillow", "Pineapple", "Pitcher (Container)", "Pizza", "Pizza cutter", "Plant", "Plastic bag", "Plate",
  491 + "Platter", "Plumbing fixture", "Polar bear", "Pomegranate", "Popcorn", "Porch", "Porcupine", "Poster",
  492 + "Potato", "Power plugs and sockets", "Pressure cooker", "Pretzel", "Printer", "Pumpkin", "Punching bag",
  493 + "Rabbit", "Raccoon", "Racket", "Radish", "Ratchet (Device)", "Raven", "Rays and skates", "Red panda",
  494 + "Refrigerator", "Remote control", "Reptile", "Rhinoceros", "Rifle", "Ring binder", "Rocket",
  495 + "Roller skates", "Rose", "Rugby ball", "Ruler", "Salad", "Salt and pepper shakers", "Sandal",
  496 + "Sandwich", "Saucer", "Saxophone", "Scale", "Scarf", "Scissors", "Scoreboard", "Scorpion",
  497 + "Screwdriver", "Sculpture", "Sea lion", "Sea turtle", "Seafood", "Seahorse", "Seat belt", "Segway",
  498 + "Serving tray", "Sewing machine", "Shark", "Sheep", "Shelf", "Shellfish", "Shirt", "Shorts",
  499 + "Shotgun", "Shower", "Shrimp", "Sink", "Skateboard", "Ski", "Skirt", "Skull", "Skunk", "Skyscraper",
  500 + "Slow cooker", "Snack", "Snail", "Snake", "Snowboard", "Snowman", "Snowmobile", "Snowplow",
  501 + "Soap dispenser", "Sock", "Sofa bed", "Sombrero", "Sparrow", "Spatula", "Spice rack", "Spider",
  502 + "Spoon", "Sports equipment", "Sports uniform", "Squash (Plant)", "Squid", "Squirrel", "Stairs",
  503 + "Stapler", "Starfish", "Stationary bicycle", "Stethoscope", "Stool", "Stop sign", "Strawberry",
  504 + "Street light", "Stretcher", "Studio couch", "Submarine", "Submarine sandwich", "Suit", "Suitcase",
  505 + "Sun hat", "Sunglasses", "Surfboard", "Sushi", "Swan", "Swim cap", "Swimming pool", "Swimwear",
  506 + "Sword", "Syringe", "Table", "Table tennis racket", "Tablet computer", "Tableware", "Taco", "Tank",
  507 + "Tap", "Tart", "Taxi", "Tea", "Teapot", "Teddy bear", "Telephone", "Television", "Tennis ball",
  508 + "Tennis racket", "Tent", "Tiara", "Tick", "Tie", "Tiger", "Tin can", "Tire", "Toaster", "Toilet",
  509 + "Toilet paper", "Tomato", "Tool", "Toothbrush", "Torch", "Tortoise", "Towel", "Tower", "Toy",
  510 + "Traffic light", "Traffic sign", "Train", "Training bench", "Treadmill", "Tree", "Tree house",
  511 + "Tripod", "Trombone", "Trousers", "Truck", "Trumpet", "Turkey", "Turtle", "Umbrella", "Unicycle",
  512 + "Van", "Vase", "Vegetable", "Vehicle", "Vehicle registration plate", "Violin", "Volleyball (Ball)",
  513 + "Waffle", "Waffle iron", "Wall clock", "Wardrobe", "Washing machine", "Waste container", "Watch",
  514 + "Watercraft", "Watermelon", "Weapon", "Whale", "Wheel", "Wheelchair", "Whisk", "Whiteboard", "Willow",
  515 + "Window", "Window blind", "Wine", "Wine glass", "Wine rack", "Winter melon", "Wok", "Woman",
  516 + "Wood-burning stove", "Woodpecker", "Worm", "Wrench", "Zebra", "Zucchini"
  517 + };
  518 +
  519 + static cv::Scalar colors[] = {
  520 + cv::Scalar( 67, 54, 244),
  521 + cv::Scalar( 30, 99, 233),
  522 + cv::Scalar( 39, 176, 156),
  523 + cv::Scalar( 58, 183, 103),
  524 + cv::Scalar( 81, 181, 63),
  525 + cv::Scalar(150, 243, 33),
  526 + cv::Scalar(169, 244, 3),
  527 + cv::Scalar(188, 212, 0),
  528 + cv::Scalar(150, 136, 0),
  529 + cv::Scalar(175, 80, 76),
  530 + cv::Scalar(195, 74, 139),
  531 + cv::Scalar(220, 57, 205),
  532 + cv::Scalar(235, 59, 255),
  533 + cv::Scalar(193, 7, 255),
  534 + cv::Scalar(152, 0, 255),
  535 + cv::Scalar( 87, 34, 255),
  536 + cv::Scalar( 85, 72, 121),
  537 + cv::Scalar(158, 158, 158),
  538 + cv::Scalar(125, 139, 96)
  539 + };
  540 +
  541 + for (size_t i = 0; i < objects.size(); i++)
  542 + {
  543 + const Object& obj = objects[i];
  544 +
  545 + const cv::Scalar& color = colors[i % 19];
  546 +
  547 + // fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
  548 + // obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
  549 +
  550 + cv::rectangle(rgb, obj.rect, color);
  551 +
  552 + char text[256];
  553 + sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
  554 +
  555 + int baseLine = 0;
  556 + cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
  557 +
  558 + int x = obj.rect.x;
  559 + int y = obj.rect.y - label_size.height - baseLine;
  560 + if (y < 0)
  561 + y = 0;
  562 + if (x + label_size.width > rgb.cols)
  563 + x = rgb.cols - label_size.width;
  564 +
  565 + cv::rectangle(rgb, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
  566 + cv::Scalar(255, 255, 255), -1);
  567 +
  568 + cv::putText(rgb, text, cv::Point(x, y + label_size.height),
  569 + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
  570 + }
  571 +
  572 + return 0;
  573 +}
  1 +// Tencent is pleased to support the open source community by making ncnn available.
  2 +//
  3 +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
  4 +//
  5 +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 +// in compliance with the License. You may obtain a copy of the License at
  7 +//
  8 +// https://opensource.org/licenses/BSD-3-Clause
  9 +//
  10 +// Unless required by applicable law or agreed to in writing, software distributed
  11 +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12 +// CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13 +// specific language governing permissions and limitations under the License.
  14 +
  15 +// 1. install
  16 +// pip3 install -U ultralytics pnnx ncnn
  17 +// 2. export yolov8-obb torchscript
  18 +// yolo export model=yolov8n-obb.pt format=torchscript
  19 +// 3. convert torchscript with static shape
  20 +// pnnx yolov8n-obb.torchscript
  21 +// 4. modify yolov8n_obb_pnnx.py for dynamic shape inference
  22 +// A. modify reshape to support dynamic image sizes
  23 +// B. permute tensor before concat and adjust concat axis
  24 +// C. drop post-process part
  25 +// before:
  26 +// v_137 = v_136.view(1, 1, 16384)
  27 +// v_143 = v_142.view(1, 1, 4096)
  28 +// v_149 = v_148.view(1, 1, 1024)
  29 +// v_150 = torch.cat((v_137, v_143, v_149), dim=2)
  30 +// ...
  31 +// v_186 = v_163.view(1, 79, 16384)
  32 +// v_187 = v_174.view(1, 79, 4096)
  33 +// v_188 = v_185.view(1, 79, 1024)
  34 +// v_189 = torch.cat((v_186, v_187, v_188), dim=2)
  35 +// ...
  36 +// after:
  37 +// v_137 = v_136.view(1, 1, -1).transpose(1, 2)
  38 +// v_143 = v_142.view(1, 1, -1).transpose(1, 2)
  39 +// v_149 = v_148.view(1, 1, -1).transpose(1, 2)
  40 +// v_150 = torch.cat((v_137, v_143, v_149), dim=1)
  41 +// ...
  42 +// v_186 = v_163.view(1, 79, -1).transpose(1, 2)
  43 +// v_187 = v_174.view(1, 79, -1).transpose(1, 2)
  44 +// v_188 = v_185.view(1, 79, -1).transpose(1, 2)
  45 +// v_189 = torch.cat((v_186, v_187, v_188), dim=1)
  46 +// return v_189, v_150
  47 +// 5. re-export yolov8-obb torchscript
  48 +// python3 -c 'import yolov8n_obb_pnnx; yolov8n_obb_pnnx.export_torchscript()'
  49 +// 6. convert new torchscript with dynamic shape
  50 +// pnnx yolov8n_obb_pnnx.py.pt inputshape=[1,3,1024,1024] inputshape2=[1,3,512,512]
  51 +// 7. now you get ncnn model files
  52 +// mv yolov8n_obb_pnnx.py.ncnn.param yolov8n_obb.ncnn.param
  53 +// mv yolov8n_obb_pnnx.py.ncnn.bin yolov8n_obb.ncnn.bin
  54 +
  55 +// the out blob would be a 2-dim tensor with w=79 h=21504
  56 +//
  57 +// | bbox-reg 16 x 4 |score(15)|
  58 +// +-----+-----+-----+-----+---------+
  59 +// | dx0 | dy0 | dx1 | dy1 | 0.1 ... |
  60 +// all /| | | | | ... |
  61 +// boxes | .. | .. | .. | .. | 0.0 ... |
  62 +// (21504)| | | | | . ... |
  63 +// \| | | | | . ... |
  64 +// +-----+-----+-----+-----+---------+
  65 +//
  66 +
  67 +// the out blob would be a 2-dim tensor with w=1 h=21504
  68 +//
  69 +// | degree(1)|
  70 +// +----------+
  71 +// | 0.1 |
  72 +// all /| |
  73 +// boxes | 0.0 |
  74 +// (21504)| . |
  75 +// \| . |
  76 +// +----------+
  77 +//
  78 +
  79 +#include "yolov8.h"
  80 +
  81 +#include "layer.h"
  82 +
  83 +#include <opencv2/core/core.hpp>
  84 +#include <opencv2/imgproc/imgproc.hpp>
  85 +
  86 +#include <float.h>
  87 +#include <stdio.h>
  88 +#include <vector>
  89 +
  90 +static inline float intersection_area(const Object& a, const Object& b)
  91 +{
  92 + std::vector<cv::Point2f> intersection;
  93 + cv::rotatedRectangleIntersection(a.rrect, b.rrect, intersection);
  94 + if (intersection.empty())
  95 + return 0.f;
  96 +
  97 + return cv::contourArea(intersection);
  98 +}
  99 +
  100 +static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
  101 +{
  102 + int i = left;
  103 + int j = right;
  104 + float p = objects[(left + right) / 2].prob;
  105 +
  106 + while (i <= j)
  107 + {
  108 + while (objects[i].prob > p)
  109 + i++;
  110 +
  111 + while (objects[j].prob < p)
  112 + j--;
  113 +
  114 + if (i <= j)
  115 + {
  116 + // swap
  117 + std::swap(objects[i], objects[j]);
  118 +
  119 + i++;
  120 + j--;
  121 + }
  122 + }
  123 +
  124 + // #pragma omp parallel sections
  125 + {
  126 + // #pragma omp section
  127 + {
  128 + if (left < j) qsort_descent_inplace(objects, left, j);
  129 + }
  130 + // #pragma omp section
  131 + {
  132 + if (i < right) qsort_descent_inplace(objects, i, right);
  133 + }
  134 + }
  135 +}
  136 +
  137 +static void qsort_descent_inplace(std::vector<Object>& objects)
  138 +{
  139 + if (objects.empty())
  140 + return;
  141 +
  142 + qsort_descent_inplace(objects, 0, objects.size() - 1);
  143 +}
  144 +
  145 +static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
  146 +{
  147 + picked.clear();
  148 +
  149 + const int n = objects.size();
  150 +
  151 + std::vector<float> areas(n);
  152 + for (int i = 0; i < n; i++)
  153 + {
  154 + areas[i] = objects[i].rrect.size.area();
  155 + }
  156 +
  157 + for (int i = 0; i < n; i++)
  158 + {
  159 + const Object& a = objects[i];
  160 +
  161 + int keep = 1;
  162 + for (int j = 0; j < (int)picked.size(); j++)
  163 + {
  164 + const Object& b = objects[picked[j]];
  165 +
  166 + if (!agnostic && a.label != b.label)
  167 + continue;
  168 +
  169 + // intersection over union
  170 + float inter_area = intersection_area(a, b);
  171 + float union_area = areas[i] + areas[picked[j]] - inter_area;
  172 + // float IoU = inter_area / union_area;
  173 + if (inter_area / union_area > nms_threshold)
  174 + keep = 0;
  175 + }
  176 +
  177 + if (keep)
  178 + picked.push_back(i);
  179 + }
  180 +}
  181 +
// Logistic function 1 / (1 + e^-x), evaluated in single precision.
static inline float sigmoid(float x)
{
    const float exp_neg_x = expf(-x);
    return 1.0f / (1.0f + exp_neg_x);
}
  186 +
  187 +static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_angle, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
  188 +{
  189 + const int w = in_pad.w;
  190 + const int h = in_pad.h;
  191 +
  192 + const int num_grid_x = w / stride;
  193 + const int num_grid_y = h / stride;
  194 +
  195 + const int reg_max_1 = 16;
  196 + const int num_class = pred.w - reg_max_1 * 4; // number of classes. 15 for DOTAv1
  197 +
  198 + for (int y = 0; y < num_grid_y; y++)
  199 + {
  200 + for (int x = 0; x < num_grid_x; x++)
  201 + {
  202 + const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1);
  203 +
  204 + // find label with max score
  205 + int label = -1;
  206 + float score = -FLT_MAX;
  207 + {
  208 + const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class);
  209 +
  210 + for (int k = 0; k < num_class; k++)
  211 + {
  212 + float s = pred_score[k];
  213 + if (s > score)
  214 + {
  215 + label = k;
  216 + score = s;
  217 + }
  218 + }
  219 +
  220 + score = sigmoid(score);
  221 + }
  222 +
  223 + if (score >= prob_threshold)
  224 + {
  225 + ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone();
  226 +
  227 + {
  228 + ncnn::Layer* softmax = ncnn::create_layer("Softmax");
  229 +
  230 + ncnn::ParamDict pd;
  231 + pd.set(0, 1); // axis
  232 + pd.set(1, 1);
  233 + softmax->load_param(pd);
  234 +
  235 + ncnn::Option opt;
  236 + opt.num_threads = 1;
  237 + opt.use_packing_layout = false;
  238 +
  239 + softmax->create_pipeline(opt);
  240 +
  241 + softmax->forward_inplace(pred_bbox, opt);
  242 +
  243 + softmax->destroy_pipeline(opt);
  244 +
  245 + delete softmax;
  246 + }
  247 +
  248 + float pred_ltrb[4];
  249 + for (int k = 0; k < 4; k++)
  250 + {
  251 + float dis = 0.f;
  252 + const float* dis_after_sm = pred_bbox.row(k);
  253 + for (int l = 0; l < reg_max_1; l++)
  254 + {
  255 + dis += l * dis_after_sm[l];
  256 + }
  257 +
  258 + pred_ltrb[k] = dis * stride;
  259 + }
  260 +
  261 + float pb_cx = (x + 0.5f) * stride;
  262 + float pb_cy = (y + 0.5f) * stride;
  263 +
  264 + const float angle = sigmoid(pred_angle.row(y * num_grid_x + x)[0]) - 0.25f;
  265 +
  266 + const float angle_rad = angle * 3.14159265358979323846f;
  267 + const float angle_degree = angle * 180.f;
  268 +
  269 + float cos = cosf(angle_rad);
  270 + float sin = sinf(angle_rad);
  271 +
  272 + float xx = (pred_ltrb[2] - pred_ltrb[0]) * 0.5f;
  273 + float yy = (pred_ltrb[3] - pred_ltrb[1]) * 0.5f;
  274 + float xr = xx * cos - yy * sin;
  275 + float yr = xx * sin + yy * cos;
  276 + const float cx = pb_cx + xr;
  277 + const float cy = pb_cy + yr;
  278 + const float ww = pred_ltrb[2] + pred_ltrb[0];
  279 + const float hh = pred_ltrb[3] + pred_ltrb[1];
  280 +
  281 + Object obj;
  282 + obj.rrect = cv::RotatedRect(cv::Point2f(cx, cy), cv::Size_<float>(ww, hh), angle_degree);
  283 + obj.label = label;
  284 + obj.prob = score;
  285 +
  286 + objects.push_back(obj);
  287 + }
  288 + }
  289 + }
  290 +}
  291 +
  292 +static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_angle, const std::vector<int>& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
  293 +{
  294 + const int w = in_pad.w;
  295 + const int h = in_pad.h;
  296 +
  297 + int pred_row_offset = 0;
  298 + for (size_t i = 0; i < strides.size(); i++)
  299 + {
  300 + const int stride = strides[i];
  301 +
  302 + const int num_grid_x = w / stride;
  303 + const int num_grid_y = h / stride;
  304 + const int num_grid = num_grid_x * num_grid_y;
  305 +
  306 + generate_proposals(pred.row_range(pred_row_offset, num_grid), pred_angle.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects);
  307 +
  308 + pred_row_offset += num_grid;
  309 + }
  310 +}
  311 +
  312 +int YOLOv8_obb::detect(const cv::Mat& rgb, std::vector<Object>& objects)
  313 +{
  314 + const int target_size = det_target_size;//1024;
  315 + const float prob_threshold = 0.25f;
  316 + const float nms_threshold = 0.45f;
  317 +
  318 + int img_w = rgb.cols;
  319 + int img_h = rgb.rows;
  320 +
  321 + // ultralytics/cfg/models/v8/yolov8.yaml
  322 + std::vector<int> strides(3);
  323 + strides[0] = 8;
  324 + strides[1] = 16;
  325 + strides[2] = 32;
  326 + const int max_stride = 32;
  327 +
  328 + // letterbox pad to multiple of max_stride
  329 + int w = img_w;
  330 + int h = img_h;
  331 + float scale = 1.f;
  332 + if (w > h)
  333 + {
  334 + scale = (float)target_size / w;
  335 + w = target_size;
  336 + h = h * scale;
  337 + }
  338 + else
  339 + {
  340 + scale = (float)target_size / h;
  341 + h = target_size;
  342 + w = w * scale;
  343 + }
  344 +
  345 + ncnn::Mat in = ncnn::Mat::from_pixels_resize(rgb.data, ncnn::Mat::PIXEL_RGB, img_w, img_h, w, h);
  346 +
  347 + // letterbox pad to target_size rectangle
  348 + int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
  349 + int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
  350 + ncnn::Mat in_pad;
  351 + ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
  352 +
  353 + const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
  354 + in_pad.substract_mean_normalize(0, norm_vals);
  355 +
  356 + ncnn::Extractor ex = yolov8.create_extractor();
  357 +
  358 + ex.input("in0", in_pad);
  359 +
  360 + ncnn::Mat out;
  361 + ex.extract("out0", out);
  362 +
  363 + ncnn::Mat out_angle;
  364 + ex.extract("out1", out_angle);
  365 +
  366 + std::vector<Object> proposals;
  367 + generate_proposals(out, out_angle, strides, in_pad, prob_threshold, proposals);
  368 +
  369 + // sort all proposals by score from highest to lowest
  370 + qsort_descent_inplace(proposals);
  371 +
  372 + // apply nms with nms_threshold
  373 + std::vector<int> picked;
  374 + nms_sorted_bboxes(proposals, picked, nms_threshold);
  375 +
  376 + int count = picked.size();
  377 + if (count == 0)
  378 + return 0;
  379 +
  380 + objects.resize(count);
  381 + for (int i = 0; i < count; i++)
  382 + {
  383 + Object obj = proposals[picked[i]];
  384 +
  385 + // adjust offset to original unpadded
  386 + obj.rrect.center.x = (obj.rrect.center.x - (wpad / 2)) / scale;
  387 + obj.rrect.center.y = (obj.rrect.center.y - (hpad / 2)) / scale;
  388 + obj.rrect.size.width = (obj.rrect.size.width) / scale;
  389 + obj.rrect.size.height = (obj.rrect.size.height) / scale;
  390 +
  391 + objects[i] = obj;
  392 + }
  393 +
  394 + return 0;
  395 +}
  396 +
  397 +int YOLOv8_obb::draw(cv::Mat& rgb, const std::vector<Object>& objects)
  398 +{
  399 + static const char* class_names[] = {
  400 + "plane", "ship", "storage tank", "baseball diamond", "tennis court",
  401 + "basketball court", "ground track field", "harbor", "bridge", "large vehicle",
  402 + "small vehicle", "helicopter", "roundabout", "soccer ball field", "swimming pool"
  403 + };
  404 +
  405 + static const cv::Scalar colors[] = {
  406 + cv::Scalar( 39, 176, 156),
  407 + cv::Scalar( 58, 183, 103),
  408 + cv::Scalar( 81, 181, 63),
  409 + cv::Scalar(150, 243, 33),
  410 + cv::Scalar(169, 244, 3),
  411 + cv::Scalar(188, 212, 0),
  412 + cv::Scalar(150, 136, 0),
  413 + cv::Scalar(175, 80, 76),
  414 + cv::Scalar(195, 74, 139),
  415 + cv::Scalar(220, 57, 205),
  416 + cv::Scalar(235, 59, 255),
  417 + cv::Scalar(193, 7, 255),
  418 + cv::Scalar(152, 0, 255),
  419 + cv::Scalar( 87, 34, 255),
  420 + cv::Scalar( 85, 72, 121),
  421 + cv::Scalar(158, 158, 158),
  422 + cv::Scalar(125, 139, 96)
  423 + };
  424 +
  425 + for (size_t i = 0; i < objects.size(); i++)
  426 + {
  427 + const Object& obj = objects[i];
  428 +
  429 + const cv::Scalar& color = colors[obj.label];
  430 +
  431 + // fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f @ %.2f\n", obj.label, obj.prob,
  432 + // obj.rrect.center.x, obj.rrect.center.y, obj.rrect.size.width, obj.rrect.size.height, obj.rrect.angle);
  433 +
  434 + cv::Point2f corners[4];
  435 + obj.rrect.points(corners);
  436 + cv::line(rgb, corners[0], corners[1], color);
  437 + cv::line(rgb, corners[1], corners[2], color);
  438 + cv::line(rgb, corners[2], corners[3], color);
  439 + cv::line(rgb, corners[3], corners[0], color);
  440 + }
  441 +
  442 + for (size_t i = 0; i < objects.size(); i++)
  443 + {
  444 + const Object& obj = objects[i];
  445 +
  446 + const cv::Scalar& color = colors[obj.label];
  447 +
  448 + char text[256];
  449 + sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
  450 +
  451 + int baseLine = 0;
  452 + cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
  453 +
  454 + int x = obj.rrect.center.x - label_size.width / 2;
  455 + int y = obj.rrect.center.y - label_size.height / 2 - baseLine;
  456 + if (y < 0)
  457 + y = 0;
  458 + if (y + label_size.height > rgb.rows)
  459 + y = rgb.rows - label_size.height;
  460 + if (x < 0)
  461 + x = 0;
  462 + if (x + label_size.width > rgb.cols)
  463 + x = rgb.cols - label_size.width;
  464 +
  465 + cv::rectangle(rgb, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
  466 + cv::Scalar(255, 255, 255), -1);
  467 +
  468 + cv::putText(rgb, text, cv::Point(x, y + label_size.height),
  469 + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
  470 + }
  471 +
  472 + return 0;
  473 +}
  1 +// Tencent is pleased to support the open source community by making ncnn available.
  2 +//
  3 +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
  4 +//
  5 +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 +// in compliance with the License. You may obtain a copy of the License at
  7 +//
  8 +// https://opensource.org/licenses/BSD-3-Clause
  9 +//
  10 +// Unless required by applicable law or agreed to in writing, software distributed
  11 +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12 +// CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13 +// specific language governing permissions and limitations under the License.
  14 +
  15 +// 1. install
  16 +// pip3 install -U ultralytics pnnx ncnn
  17 +// 2. export yolov8-pose torchscript
  18 +// yolo export model=yolov8n-pose.pt format=torchscript
  19 +// 3. convert torchscript with static shape
  20 +// pnnx yolov8n-pose.torchscript
  21 +// 4. modify yolov8n_pose_pnnx.py for dynamic shape inference
  22 +// A. modify reshape to support dynamic image sizes
  23 +// B. permute tensor before concat and adjust concat axis
  24 +// C. drop post-process part
  25 +// before:
  26 +// v_137 = v_136.view(1, 51, 6400)
  27 +// v_143 = v_142.view(1, 51, 1600)
  28 +// v_149 = v_148.view(1, 51, 400)
  29 +// v_150 = torch.cat((v_137, v_143, v_149), dim=-1)
  30 +// ...
  31 +// v_184 = v_161.view(1, 65, 6400)
  32 +// v_185 = v_172.view(1, 65, 1600)
  33 +// v_186 = v_183.view(1, 65, 400)
  34 +// v_187 = torch.cat((v_184, v_185, v_186), dim=2)
  35 +// ...
  36 +// after:
  37 +// v_137 = v_136.view(1, 51, -1).transpose(1, 2)
  38 +// v_143 = v_142.view(1, 51, -1).transpose(1, 2)
  39 +// v_149 = v_148.view(1, 51, -1).transpose(1, 2)
  40 +// v_150 = torch.cat((v_137, v_143, v_149), dim=1)
  41 +// ...
  42 +// v_184 = v_161.view(1, 65, -1).transpose(1, 2)
  43 +// v_185 = v_172.view(1, 65, -1).transpose(1, 2)
  44 +// v_186 = v_183.view(1, 65, -1).transpose(1, 2)
  45 +// v_187 = torch.cat((v_184, v_185, v_186), dim=1)
  46 +// return v_187, v_150
  47 +// 5. re-export yolov8-pose torchscript
  48 +// python3 -c 'import yolov8n_pose_pnnx; yolov8n_pose_pnnx.export_torchscript()'
  49 +// 6. convert new torchscript with dynamic shape
  50 +// pnnx yolov8n_pose_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320]
  51 +// 7. now you get ncnn model files
  52 +// mv yolov8n_pose_pnnx.py.ncnn.param yolov8n_pose.ncnn.param
  53 +// mv yolov8n_pose_pnnx.py.ncnn.bin yolov8n_pose.ncnn.bin
  54 +
  55 +// the out blob would be a 2-dim tensor with w=65 h=8400
  56 +//
  57 +//        | bbox-reg 16 x 4       |score(1)|
  58 +//        +-----+-----+-----+-----+--------+
  59 +//        | dx0 | dy0 | dx1 | dy1 |   0.1  |
  60 +//   all /|     |     |     |     |        |
  61 +//  boxes |  .. |  .. |  .. |  .. |   0.0  |
  62 +//  (8400)|     |     |     |     |    .   |
  63 +//       \|     |     |     |     |    .   |
  64 +//        +-----+-----+-----+-----+--------+
  65 +//
  66 +
  67 +//
  68 +//        | pose (51) |
  69 +//        +-----------+
  70 +//        |0.1........|
  71 +//   all /|           |
  72 +//  boxes |0.0........|
  73 +//  (8400)|     .     |
  74 +//       \|     .     |
  75 +//        +-----------+
  76 +//
  77 +
  78 +#include "yolov8.h"
  79 +
  80 +#include "layer.h"
  81 +
  82 +#include <opencv2/core/core.hpp>
  83 +#include <opencv2/imgproc/imgproc.hpp>
  84 +
  85 +#include <float.h>
  86 +#include <stdio.h>
  87 +#include <vector>
  88 +
  89 +static inline float intersection_area(const Object& a, const Object& b)
  90 +{
  91 + cv::Rect_<float> inter = a.rect & b.rect;
  92 + return inter.area();
  93 +}
  94 +
  95 +static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
  96 +{
  97 + int i = left;
  98 + int j = right;
  99 + float p = objects[(left + right) / 2].prob;
  100 +
  101 + while (i <= j)
  102 + {
  103 + while (objects[i].prob > p)
  104 + i++;
  105 +
  106 + while (objects[j].prob < p)
  107 + j--;
  108 +
  109 + if (i <= j)
  110 + {
  111 + // swap
  112 + std::swap(objects[i], objects[j]);
  113 +
  114 + i++;
  115 + j--;
  116 + }
  117 + }
  118 +
  119 + // #pragma omp parallel sections
  120 + {
  121 + // #pragma omp section
  122 + {
  123 + if (left < j) qsort_descent_inplace(objects, left, j);
  124 + }
  125 + // #pragma omp section
  126 + {
  127 + if (i < right) qsort_descent_inplace(objects, i, right);
  128 + }
  129 + }
  130 +}
  131 +
  132 +static void qsort_descent_inplace(std::vector<Object>& objects)
  133 +{
  134 + if (objects.empty())
  135 + return;
  136 +
  137 + qsort_descent_inplace(objects, 0, objects.size() - 1);
  138 +}
  139 +
  140 +static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
  141 +{
  142 + picked.clear();
  143 +
  144 + const int n = objects.size();
  145 +
  146 + std::vector<float> areas(n);
  147 + for (int i = 0; i < n; i++)
  148 + {
  149 + areas[i] = objects[i].rect.area();
  150 + }
  151 +
  152 + for (int i = 0; i < n; i++)
  153 + {
  154 + const Object& a = objects[i];
  155 +
  156 + int keep = 1;
  157 + for (int j = 0; j < (int)picked.size(); j++)
  158 + {
  159 + const Object& b = objects[picked[j]];
  160 +
  161 + if (!agnostic && a.label != b.label)
  162 + continue;
  163 +
  164 + // intersection over union
  165 + float inter_area = intersection_area(a, b);
  166 + float union_area = areas[i] + areas[picked[j]] - inter_area;
  167 + // float IoU = inter_area / union_area
  168 + if (inter_area / union_area > nms_threshold)
  169 + keep = 0;
  170 + }
  171 +
  172 + if (keep)
  173 + picked.push_back(i);
  174 + }
  175 +}
  176 +
  177 +static inline float sigmoid(float x)
  178 +{
  179 + return 1.0f / (1.0f + expf(-x));
  180 +}
  181 +
  182 +static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_points, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
  183 +{
  184 + const int w = in_pad.w;
  185 + const int h = in_pad.h;
  186 +
  187 + const int num_grid_x = w / stride;
  188 + const int num_grid_y = h / stride;
  189 +
  190 + const int reg_max_1 = 16;
  191 + const int num_points = pred_points.w / 3;
  192 +
  193 + for (int y = 0; y < num_grid_y; y++)
  194 + {
  195 + for (int x = 0; x < num_grid_x; x++)
  196 + {
  197 + const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1);
  198 + const ncnn::Mat pred_points_grid = pred_points.row_range(y * num_grid_x + x, 1).reshape(3, num_points);
  199 +
  200 + // find label with max score
  201 + int label = 0;
  202 + float score = sigmoid(pred_grid[reg_max_1 * 4]);
  203 +
  204 + if (score >= prob_threshold)
  205 + {
  206 + ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone();
  207 +
  208 + {
  209 + ncnn::Layer* softmax = ncnn::create_layer("Softmax");
  210 +
  211 + ncnn::ParamDict pd;
  212 + pd.set(0, 1); // axis
  213 + pd.set(1, 1);
  214 + softmax->load_param(pd);
  215 +
  216 + ncnn::Option opt;
  217 + opt.num_threads = 1;
  218 + opt.use_packing_layout = false;
  219 +
  220 + softmax->create_pipeline(opt);
  221 +
  222 + softmax->forward_inplace(pred_bbox, opt);
  223 +
  224 + softmax->destroy_pipeline(opt);
  225 +
  226 + delete softmax;
  227 + }
  228 +
  229 + float pred_ltrb[4];
  230 + for (int k = 0; k < 4; k++)
  231 + {
  232 + float dis = 0.f;
  233 + const float* dis_after_sm = pred_bbox.row(k);
  234 + for (int l = 0; l < reg_max_1; l++)
  235 + {
  236 + dis += l * dis_after_sm[l];
  237 + }
  238 +
  239 + pred_ltrb[k] = dis * stride;
  240 + }
  241 +
  242 + float pb_cx = (x + 0.5f) * stride;
  243 + float pb_cy = (y + 0.5f) * stride;
  244 +
  245 + float x0 = pb_cx - pred_ltrb[0];
  246 + float y0 = pb_cy - pred_ltrb[1];
  247 + float x1 = pb_cx + pred_ltrb[2];
  248 + float y1 = pb_cy + pred_ltrb[3];
  249 +
  250 + std::vector<KeyPoint> keypoints;
  251 + for (int k = 0; k < num_points; k++)
  252 + {
  253 + KeyPoint keypoint;
  254 + keypoint.p.x = (x + pred_points_grid.row(k)[0] * 2) * stride;
  255 + keypoint.p.y = (y + pred_points_grid.row(k)[1] * 2) * stride;
  256 + keypoint.prob = sigmoid(pred_points_grid.row(k)[2]);
  257 + keypoints.push_back(keypoint);
  258 + }
  259 +
  260 + Object obj;
  261 + obj.rect.x = x0;
  262 + obj.rect.y = y0;
  263 + obj.rect.width = x1 - x0;
  264 + obj.rect.height = y1 - y0;
  265 + obj.label = label;
  266 + obj.prob = score;
  267 + obj.keypoints = keypoints;
  268 +
  269 + objects.push_back(obj);
  270 + }
  271 + }
  272 + }
  273 +}
  274 +
  275 +static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_points, const std::vector<int>& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
  276 +{
  277 + const int w = in_pad.w;
  278 + const int h = in_pad.h;
  279 +
  280 + int pred_row_offset = 0;
  281 + for (size_t i = 0; i < strides.size(); i++)
  282 + {
  283 + const int stride = strides[i];
  284 +
  285 + const int num_grid_x = w / stride;
  286 + const int num_grid_y = h / stride;
  287 + const int num_grid = num_grid_x * num_grid_y;
  288 +
  289 + generate_proposals(pred.row_range(pred_row_offset, num_grid), pred_points.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects);
  290 +
  291 + pred_row_offset += num_grid;
  292 + }
  293 +}
  294 +
  295 +int YOLOv8_pose::detect(const cv::Mat& rgb, std::vector<Object>& objects)
  296 +{
  297 + const int target_size = det_target_size;//640;
  298 + const float prob_threshold = 0.25f;
  299 + const float nms_threshold = 0.45f;
  300 + const float mask_threshold = 0.5f;
  301 +
  302 + int img_w = rgb.cols;
  303 + int img_h = rgb.rows;
  304 +
  305 + // ultralytics/cfg/models/v8/yolov8.yaml
  306 + std::vector<int> strides(3);
  307 + strides[0] = 8;
  308 + strides[1] = 16;
  309 + strides[2] = 32;
  310 + const int max_stride = 32;
  311 +
  312 + // letterbox pad to multiple of max_stride
  313 + int w = img_w;
  314 + int h = img_h;
  315 + float scale = 1.f;
  316 + if (w > h)
  317 + {
  318 + scale = (float)target_size / w;
  319 + w = target_size;
  320 + h = h * scale;
  321 + }
  322 + else
  323 + {
  324 + scale = (float)target_size / h;
  325 + h = target_size;
  326 + w = w * scale;
  327 + }
  328 +
  329 + ncnn::Mat in = ncnn::Mat::from_pixels_resize(rgb.data, ncnn::Mat::PIXEL_RGB, img_w, img_h, w, h);
  330 +
  331 + // letterbox pad to target_size rectangle
  332 + int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
  333 + int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
  334 + ncnn::Mat in_pad;
  335 + ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
  336 +
  337 + const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
  338 + in_pad.substract_mean_normalize(0, norm_vals);
  339 +
  340 + ncnn::Extractor ex = yolov8.create_extractor();
  341 +
  342 + ex.input("in0", in_pad);
  343 +
  344 + ncnn::Mat out;
  345 + ex.extract("out0", out);
  346 +
  347 + ncnn::Mat out_points;
  348 + ex.extract("out1", out_points);
  349 +
  350 + std::vector<Object> proposals;
  351 + generate_proposals(out, out_points, strides, in_pad, prob_threshold, proposals);
  352 +
  353 + // sort all proposals by score from highest to lowest
  354 + qsort_descent_inplace(proposals);
  355 +
  356 + // apply nms with nms_threshold
  357 + std::vector<int> picked;
  358 + nms_sorted_bboxes(proposals, picked, nms_threshold);
  359 +
  360 + int count = picked.size();
  361 + if (count == 0)
  362 + return 0;
  363 +
  364 + const int num_points = out_points.w / 3;
  365 +
  366 + objects.resize(count);
  367 + for (int i = 0; i < count; i++)
  368 + {
  369 + objects[i] = proposals[picked[i]];
  370 +
  371 + // adjust offset to original unpadded
  372 + float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
  373 + float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
  374 + float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
  375 + float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
  376 +
  377 + for (int j = 0; j < num_points; j++)
  378 + {
  379 + objects[i].keypoints[j].p.x = (objects[i].keypoints[j].p.x - (wpad / 2)) / scale;
  380 + objects[i].keypoints[j].p.y = (objects[i].keypoints[j].p.y - (hpad / 2)) / scale;
  381 + }
  382 +
  383 + // clip
  384 + x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
  385 + y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
  386 + x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
  387 + y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
  388 +
  389 + objects[i].rect.x = x0;
  390 + objects[i].rect.y = y0;
  391 + objects[i].rect.width = x1 - x0;
  392 + objects[i].rect.height = y1 - y0;
  393 + }
  394 +
  395 + // sort objects by area
  396 + struct
  397 + {
  398 + bool operator()(const Object& a, const Object& b) const
  399 + {
  400 + return a.rect.area() > b.rect.area();
  401 + }
  402 + } objects_area_greater;
  403 + std::sort(objects.begin(), objects.end(), objects_area_greater);
  404 +
  405 + return 0;
  406 +}
  407 +
  408 +int YOLOv8_pose::draw(cv::Mat& rgb, const std::vector<Object>& objects)
  409 +{
  410 + static const char* class_names[] = {"person"};
  411 +
  412 + static const cv::Scalar colors[] = {
  413 + cv::Scalar( 67, 54, 244),
  414 + cv::Scalar( 30, 99, 233),
  415 + cv::Scalar( 39, 176, 156),
  416 + cv::Scalar( 58, 183, 103),
  417 + cv::Scalar( 81, 181, 63),
  418 + cv::Scalar(150, 243, 33),
  419 + cv::Scalar(169, 244, 3),
  420 + cv::Scalar(188, 212, 0),
  421 + cv::Scalar(150, 136, 0),
  422 + cv::Scalar(175, 80, 76),
  423 + cv::Scalar(195, 74, 139),
  424 + cv::Scalar(220, 57, 205),
  425 + cv::Scalar(235, 59, 255),
  426 + cv::Scalar(193, 7, 255),
  427 + cv::Scalar(152, 0, 255),
  428 + cv::Scalar( 87, 34, 255),
  429 + cv::Scalar( 85, 72, 121),
  430 + cv::Scalar(158, 158, 158),
  431 + cv::Scalar(125, 139, 96)
  432 + };
  433 +
  434 + for (size_t i = 0; i < objects.size(); i++)
  435 + {
  436 + const Object& obj = objects[i];
  437 +
  438 + const cv::Scalar& color = colors[i % 19];
  439 +
  440 + // fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
  441 + // obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
  442 +
  443 + // draw bone
  444 + static const int joint_pairs[16][2] = {
  445 + {0, 1}, {1, 3}, {0, 2}, {2, 4}, {5, 6}, {5, 7}, {7, 9}, {6, 8}, {8, 10}, {5, 11}, {6, 12}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}
  446 + };
  447 + static const cv::Scalar bone_colors[] = {
  448 + cv::Scalar( 0, 0, 255),
  449 + cv::Scalar( 0, 0, 255),
  450 + cv::Scalar( 0, 0, 255),
  451 + cv::Scalar( 0, 0, 255),
  452 + cv::Scalar( 0, 255, 128),
  453 + cv::Scalar( 0, 255, 128),
  454 + cv::Scalar( 0, 255, 128),
  455 + cv::Scalar( 0, 255, 128),
  456 + cv::Scalar( 0, 255, 128),
  457 + cv::Scalar(255, 255, 51),
  458 + cv::Scalar(255, 255, 51),
  459 + cv::Scalar(255, 255, 51),
  460 + cv::Scalar(255, 51, 153),
  461 + cv::Scalar(255, 51, 153),
  462 + cv::Scalar(255, 51, 153),
  463 + cv::Scalar(255, 51, 153),
  464 + };
  465 +
  466 + for (int j = 0; j < 16; j++)
  467 + {
  468 + const KeyPoint& p1 = obj.keypoints[joint_pairs[j][0]];
  469 + const KeyPoint& p2 = obj.keypoints[joint_pairs[j][1]];
  470 +
  471 + if (p1.prob < 0.2f || p2.prob < 0.2f)
  472 + continue;
  473 +
  474 + cv::line(rgb, p1.p, p2.p, bone_colors[j], 2);
  475 + }
  476 +
  477 + // draw joint
  478 + for (size_t j = 0; j < obj.keypoints.size(); j++)
  479 + {
  480 + const KeyPoint& keypoint = obj.keypoints[j];
  481 +
  482 + // fprintf(stderr, "%.2f %.2f = %.5f\n", keypoint.p.x, keypoint.p.y, keypoint.prob);
  483 +
  484 + if (keypoint.prob < 0.2f)
  485 + continue;
  486 +
  487 + cv::circle(rgb, keypoint.p, 3, color, -1);
  488 + }
  489 +
  490 + cv::rectangle(rgb, obj.rect, color);
  491 +
  492 + char text[256];
  493 + sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
  494 +
  495 + int baseLine = 0;
  496 + cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
  497 +
  498 + int x = obj.rect.x;
  499 + int y = obj.rect.y - label_size.height - baseLine;
  500 + if (y < 0)
  501 + y = 0;
  502 + if (x + label_size.width > rgb.cols)
  503 + x = rgb.cols - label_size.width;
  504 +
  505 + cv::rectangle(rgb, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
  506 + cv::Scalar(255, 255, 255), -1);
  507 +
  508 + cv::putText(rgb, text, cv::Point(x, y + label_size.height),
  509 + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
  510 + }
  511 +
  512 + return 0;
  513 +}
  1 +// Tencent is pleased to support the open source community by making ncnn available.
  2 +//
  3 +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
  4 +//
  5 +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 +// in compliance with the License. You may obtain a copy of the License at
  7 +//
  8 +// https://opensource.org/licenses/BSD-3-Clause
  9 +//
  10 +// Unless required by applicable law or agreed to in writing, software distributed
  11 +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12 +// CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13 +// specific language governing permissions and limitations under the License.
  14 +
  15 +// 1. install
  16 +// pip3 install -U ultralytics pnnx ncnn
  17 +// 2. export yolov8-seg torchscript
  18 +// yolo export model=yolov8n-seg.pt format=torchscript
  19 +// 3. convert torchscript with static shape
  20 +// pnnx yolov8n-seg.torchscript
  21 +// 4. modify yolov8n_seg_pnnx.py for dynamic shape inference
  22 +// A. modify reshape to support dynamic image sizes
  23 +// B. permute tensor before concat and adjust concat axis
  24 +// C. drop post-process part
  25 +// before:
  26 +// v_144 = v_143.view(1, 32, 6400)
  27 +// v_150 = v_149.view(1, 32, 1600)
  28 +// v_156 = v_155.view(1, 32, 400)
  29 +// v_157 = torch.cat((v_144, v_150, v_156), dim=2)
  30 +// ...
  31 +// v_191 = v_168.view(1, 144, 6400)
  32 +// v_192 = v_179.view(1, 144, 1600)
  33 +// v_193 = v_190.view(1, 144, 400)
  34 +// v_194 = torch.cat((v_191, v_192, v_193), dim=2)
  35 +// ...
  36 +// v_215 = (v_214, v_138, )
  37 +// return v_215
  38 +// after:
  39 +// v_144 = v_143.view(1, 32, -1).transpose(1, 2)
  40 +// v_150 = v_149.view(1, 32, -1).transpose(1, 2)
  41 +// v_156 = v_155.view(1, 32, -1).transpose(1, 2)
  42 +// v_157 = torch.cat((v_144, v_150, v_156), dim=1)
  43 +// ...
  44 +// v_191 = v_168.view(1, 144, -1).transpose(1, 2)
  45 +// v_192 = v_179.view(1, 144, -1).transpose(1, 2)
  46 +// v_193 = v_190.view(1, 144, -1).transpose(1, 2)
  47 +// v_194 = torch.cat((v_191, v_192, v_193), dim=1)
  48 +// return v_194, v_157, v_138
  49 +// 5. re-export yolov8-seg torchscript
  50 +// python3 -c 'import yolov8n_seg_pnnx; yolov8n_seg_pnnx.export_torchscript()'
  51 +// 6. convert new torchscript with dynamic shape
  52 +// pnnx yolov8n_seg_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320]
  53 +// 7. now you get ncnn model files
  54 +// mv yolov8n_seg_pnnx.py.ncnn.param yolov8n_seg.ncnn.param
  55 +// mv yolov8n_seg_pnnx.py.ncnn.bin yolov8n_seg.ncnn.bin
  56 +
  57 +// the out blob would be a 2-dim tensor with w=176 h=8400
  58 +//
  59 +//        | bbox-reg 16 x 4       | per-class scores(80) |
  60 +//        +-----+-----+-----+-----+----------------------+
  61 +//        | dx0 | dy0 | dx1 | dy1 |0.1 0.0 0.0 0.5 ......|
  62 +//   all /|     |     |     |     |           .          |
  63 +//  boxes |  .. |  .. |  .. |  .. |0.0 0.9 0.0 0.0 ......|
  64 +//  (8400)|     |     |     |     |           .          |
  65 +//       \|     |     |     |     |           .          |
  66 +//        +-----+-----+-----+-----+----------------------+
  67 +//
  68 +
  69 +//
  70 +//        | mask (32) |
  71 +//        +-----------+
  72 +//        |0.1........|
  73 +//   all /|           |
  74 +//  boxes |0.0........|
  75 +//  (8400)|     .     |
  76 +//       \|     .     |
  77 +//        +-----------+
  78 +//
  79 +
  80 +#include "yolov8.h"
  81 +
  82 +#include "layer.h"
  83 +
  84 +#include <opencv2/core/core.hpp>
  85 +#include <opencv2/imgproc/imgproc.hpp>
  86 +
  87 +#include <float.h>
  88 +#include <stdio.h>
  89 +#include <vector>
  90 +
  91 +static inline float intersection_area(const Object& a, const Object& b)
  92 +{
  93 + cv::Rect_<float> inter = a.rect & b.rect;
  94 + return inter.area();
  95 +}
  96 +
  97 +static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
  98 +{
  99 + int i = left;
  100 + int j = right;
  101 + float p = objects[(left + right) / 2].prob;
  102 +
  103 + while (i <= j)
  104 + {
  105 + while (objects[i].prob > p)
  106 + i++;
  107 +
  108 + while (objects[j].prob < p)
  109 + j--;
  110 +
  111 + if (i <= j)
  112 + {
  113 + // swap
  114 + std::swap(objects[i], objects[j]);
  115 +
  116 + i++;
  117 + j--;
  118 + }
  119 + }
  120 +
  121 + // #pragma omp parallel sections
  122 + {
  123 + // #pragma omp section
  124 + {
  125 + if (left < j) qsort_descent_inplace(objects, left, j);
  126 + }
  127 + // #pragma omp section
  128 + {
  129 + if (i < right) qsort_descent_inplace(objects, i, right);
  130 + }
  131 + }
  132 +}
  133 +
  134 +static void qsort_descent_inplace(std::vector<Object>& objects)
  135 +{
  136 + if (objects.empty())
  137 + return;
  138 +
  139 + qsort_descent_inplace(objects, 0, objects.size() - 1);
  140 +}
  141 +
  142 +static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
  143 +{
  144 + picked.clear();
  145 +
  146 + const int n = objects.size();
  147 +
  148 + std::vector<float> areas(n);
  149 + for (int i = 0; i < n; i++)
  150 + {
  151 + areas[i] = objects[i].rect.area();
  152 + }
  153 +
  154 + for (int i = 0; i < n; i++)
  155 + {
  156 + const Object& a = objects[i];
  157 +
  158 + int keep = 1;
  159 + for (int j = 0; j < (int)picked.size(); j++)
  160 + {
  161 + const Object& b = objects[picked[j]];
  162 +
  163 + if (!agnostic && a.label != b.label)
  164 + continue;
  165 +
  166 + // intersection over union
  167 + float inter_area = intersection_area(a, b);
  168 + float union_area = areas[i] + areas[picked[j]] - inter_area;
  169 + // float IoU = inter_area / union_area
  170 + if (inter_area / union_area > nms_threshold)
  171 + keep = 0;
  172 + }
  173 +
  174 + if (keep)
  175 + picked.push_back(i);
  176 + }
  177 +}
  178 +
  179 +static inline float sigmoid(float x)
  180 +{
  181 + return 1.0f / (1.0f + expf(-x));
  182 +}
  183 +
  184 +static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
  185 +{
  186 + const int w = in_pad.w;
  187 + const int h = in_pad.h;
  188 +
  189 + const int num_grid_x = w / stride;
  190 + const int num_grid_y = h / stride;
  191 +
  192 + const int reg_max_1 = 16;
  193 + const int num_class = pred.w - reg_max_1 * 4; // number of classes. 80 for COCO
  194 +
  195 + for (int y = 0; y < num_grid_y; y++)
  196 + {
  197 + for (int x = 0; x < num_grid_x; x++)
  198 + {
  199 + const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1);
  200 +
  201 + // find label with max score
  202 + int label = -1;
  203 + float score = -FLT_MAX;
  204 + {
  205 + const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class);
  206 +
  207 + for (int k = 0; k < num_class; k++)
  208 + {
  209 + float s = pred_score[k];
  210 + if (s > score)
  211 + {
  212 + label = k;
  213 + score = s;
  214 + }
  215 + }
  216 +
  217 + score = sigmoid(score);
  218 + }
  219 +
  220 + if (score >= prob_threshold)
  221 + {
  222 + ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone();
  223 +
  224 + {
  225 + ncnn::Layer* softmax = ncnn::create_layer("Softmax");
  226 +
  227 + ncnn::ParamDict pd;
  228 + pd.set(0, 1); // axis
  229 + pd.set(1, 1);
  230 + softmax->load_param(pd);
  231 +
  232 + ncnn::Option opt;
  233 + opt.num_threads = 1;
  234 + opt.use_packing_layout = false;
  235 +
  236 + softmax->create_pipeline(opt);
  237 +
  238 + softmax->forward_inplace(pred_bbox, opt);
  239 +
  240 + softmax->destroy_pipeline(opt);
  241 +
  242 + delete softmax;
  243 + }
  244 +
  245 + float pred_ltrb[4];
  246 + for (int k = 0; k < 4; k++)
  247 + {
  248 + float dis = 0.f;
  249 + const float* dis_after_sm = pred_bbox.row(k);
  250 + for (int l = 0; l < reg_max_1; l++)
  251 + {
  252 + dis += l * dis_after_sm[l];
  253 + }
  254 +
  255 + pred_ltrb[k] = dis * stride;
  256 + }
  257 +
  258 + float pb_cx = (x + 0.5f) * stride;
  259 + float pb_cy = (y + 0.5f) * stride;
  260 +
  261 + float x0 = pb_cx - pred_ltrb[0];
  262 + float y0 = pb_cy - pred_ltrb[1];
  263 + float x1 = pb_cx + pred_ltrb[2];
  264 + float y1 = pb_cy + pred_ltrb[3];
  265 +
  266 + Object obj;
  267 + obj.rect.x = x0;
  268 + obj.rect.y = y0;
  269 + obj.rect.width = x1 - x0;
  270 + obj.rect.height = y1 - y0;
  271 + obj.label = label;
  272 + obj.prob = score;
  273 + obj.gindex = y * num_grid_x + x;
  274 +
  275 + objects.push_back(obj);
  276 + }
  277 + }
  278 + }
  279 +}
  280 +
  281 +static void generate_proposals(const ncnn::Mat& pred, const std::vector<int>& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
  282 +{
  283 + const int w = in_pad.w;
  284 + const int h = in_pad.h;
  285 +
  286 + int pred_row_offset = 0;
  287 + for (size_t i = 0; i < strides.size(); i++)
  288 + {
  289 + const int stride = strides[i];
  290 +
  291 + const int num_grid_x = w / stride;
  292 + const int num_grid_y = h / stride;
  293 + const int num_grid = num_grid_x * num_grid_y;
  294 +
  295 + std::vector<Object> objects_stride;
  296 + generate_proposals(pred.row_range(pred_row_offset, num_grid), stride, in_pad, prob_threshold, objects_stride);
  297 +
  298 + for (size_t j = 0; j < objects_stride.size(); j++)
  299 + {
  300 + Object obj = objects_stride[j];
  301 + obj.gindex += pred_row_offset;
  302 + objects.push_back(obj);
  303 + }
  304 +
  305 + pred_row_offset += num_grid;
  306 + }
  307 +}
  308 +
  309 +int YOLOv8_seg::detect(const cv::Mat& rgb, std::vector<Object>& objects)
  310 +{
  311 + const int target_size = det_target_size;//640;
  312 + const float prob_threshold = 0.25f;
  313 + const float nms_threshold = 0.45f;
  314 + const float mask_threshold = 0.5f;
  315 +
  316 + int img_w = rgb.cols;
  317 + int img_h = rgb.rows;
  318 +
  319 + // ultralytics/cfg/models/v8/yolov8.yaml
  320 + std::vector<int> strides(3);
  321 + strides[0] = 8;
  322 + strides[1] = 16;
  323 + strides[2] = 32;
  324 + const int max_stride = 32;
  325 +
  326 + // letterbox pad to multiple of max_stride
  327 + int w = img_w;
  328 + int h = img_h;
  329 + float scale = 1.f;
  330 + if (w > h)
  331 + {
  332 + scale = (float)target_size / w;
  333 + w = target_size;
  334 + h = h * scale;
  335 + }
  336 + else
  337 + {
  338 + scale = (float)target_size / h;
  339 + h = target_size;
  340 + w = w * scale;
  341 + }
  342 +
  343 + ncnn::Mat in = ncnn::Mat::from_pixels_resize(rgb.data, ncnn::Mat::PIXEL_RGB, img_w, img_h, w, h);
  344 +
  345 + // letterbox pad to target_size rectangle
  346 + int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
  347 + int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
  348 + ncnn::Mat in_pad;
  349 + ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
  350 +
  351 + const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
  352 + in_pad.substract_mean_normalize(0, norm_vals);
  353 +
  354 + ncnn::Extractor ex = yolov8.create_extractor();
  355 +
  356 + ex.input("in0", in_pad);
  357 +
  358 + ncnn::Mat out;
  359 + ex.extract("out0", out);
  360 +
  361 + std::vector<Object> proposals;
  362 + generate_proposals(out, strides, in_pad, prob_threshold, proposals);
  363 +
  364 + // sort all proposals by score from highest to lowest
  365 + qsort_descent_inplace(proposals);
  366 +
  367 + // apply nms with nms_threshold
  368 + std::vector<int> picked;
  369 + nms_sorted_bboxes(proposals, picked, nms_threshold);
  370 +
  371 + int count = picked.size();
  372 + if (count == 0)
  373 + return 0;
  374 +
  375 + ncnn::Mat mask_feat;
  376 + ex.extract("out1", mask_feat);
  377 +
  378 + ncnn::Mat mask_protos;
  379 + ex.extract("out2", mask_protos);
  380 +
  381 + ncnn::Mat objects_mask_feat(mask_feat.w, 1, count);
  382 +
  383 + objects.resize(count);
  384 + for (int i = 0; i < count; i++)
  385 + {
  386 + objects[i] = proposals[picked[i]];
  387 +
  388 + // adjust offset to original unpadded
  389 + float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
  390 + float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
  391 + float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
  392 + float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
  393 +
  394 + // clip
  395 + x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
  396 + y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
  397 + x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
  398 + y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
  399 +
  400 + objects[i].rect.x = x0;
  401 + objects[i].rect.y = y0;
  402 + objects[i].rect.width = x1 - x0;
  403 + objects[i].rect.height = y1 - y0;
  404 +
  405 + // pick mask feat
  406 + memcpy(objects_mask_feat.channel(i), mask_feat.row(objects[i].gindex), mask_feat.w * sizeof(float));
  407 + }
  408 +
  409 + // process mask
  410 + ncnn::Mat objects_mask;
  411 + {
  412 + ncnn::Layer* gemm = ncnn::create_layer("Gemm");
  413 +
  414 + ncnn::ParamDict pd;
  415 + pd.set(6, 1); // constantC
  416 + pd.set(7, count); // constantM
  417 + pd.set(8, mask_protos.w * mask_protos.h); // constantN
  418 + pd.set(9, mask_feat.w); // constantK
  419 + pd.set(10, -1); // constant_broadcast_type_C
  420 + pd.set(11, 1); // output_N1M
  421 + gemm->load_param(pd);
  422 +
  423 + ncnn::Option opt;
  424 + opt.num_threads = 1;
  425 + opt.use_packing_layout = false;
  426 +
  427 + gemm->create_pipeline(opt);
  428 +
  429 + std::vector<ncnn::Mat> gemm_inputs(2);
  430 + gemm_inputs[0] = objects_mask_feat;
  431 + gemm_inputs[1] = mask_protos.reshape(mask_protos.w * mask_protos.h, 1, mask_protos.c);
  432 + std::vector<ncnn::Mat> gemm_outputs(1);
  433 + gemm->forward(gemm_inputs, gemm_outputs, opt);
  434 + objects_mask = gemm_outputs[0].reshape(mask_protos.w, mask_protos.h, count);
  435 +
  436 + gemm->destroy_pipeline(opt);
  437 +
  438 + delete gemm;
  439 + }
  440 + {
  441 + ncnn::Layer* sigmoid = ncnn::create_layer("Sigmoid");
  442 +
  443 + ncnn::Option opt;
  444 + opt.num_threads = 1;
  445 + opt.use_packing_layout = false;
  446 +
  447 + sigmoid->create_pipeline(opt);
  448 +
  449 + sigmoid->forward_inplace(objects_mask, opt);
  450 +
  451 + sigmoid->destroy_pipeline(opt);
  452 +
  453 + delete sigmoid;
  454 + }
  455 +
  456 + // resize mask map
  457 + {
  458 + ncnn::Mat objects_mask_resized;
  459 + ncnn::resize_bilinear(objects_mask, objects_mask_resized, in_pad.w / scale, in_pad.h / scale);
  460 + objects_mask = objects_mask_resized;
  461 + }
  462 +
  463 + // create per-object mask
  464 + for (int i = 0; i < count; i++)
  465 + {
  466 + Object& obj = objects[i];
  467 +
  468 + const ncnn::Mat mm = objects_mask.channel(i);
  469 +
  470 + obj.mask = cv::Mat((int)obj.rect.height, (int)obj.rect.width, CV_8UC1);
  471 +
  472 + // adjust offset to original unpadded and clip inside object box
  473 + for (int y = 0; y < (int)obj.rect.height; y++)
  474 + {
  475 + const float* pmm = mm.row((int)(hpad / 2 / scale + obj.rect.y + y)) + (int)(wpad / 2 / scale + obj.rect.x);
  476 + uchar* pmask = obj.mask.ptr<uchar>(y);
  477 + for (int x = 0; x < (int)obj.rect.width; x++)
  478 + {
  479 + pmask[x] = pmm[x] > mask_threshold ? 1 : 0;
  480 + }
  481 + }
  482 + }
  483 +
  484 + // sort objects by area
  485 + struct
  486 + {
  487 + bool operator()(const Object& a, const Object& b) const
  488 + {
  489 + return a.rect.area() > b.rect.area();
  490 + }
  491 + } objects_area_greater;
  492 + std::sort(objects.begin(), objects.end(), objects_area_greater);
  493 +
  494 + return 0;
  495 +}
  496 +
  497 +int YOLOv8_seg::draw(cv::Mat& rgb, const std::vector<Object>& objects)
  498 +{
  499 + static const char* class_names[] = {
  500 + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
  501 + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
  502 + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
  503 + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
  504 + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
  505 + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
  506 + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
  507 + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
  508 + "hair drier", "toothbrush"
  509 + };
  510 +
  511 + static cv::Scalar colors[] = {
  512 + cv::Scalar( 67, 54, 244),
  513 + cv::Scalar( 30, 99, 233),
  514 + cv::Scalar( 39, 176, 156),
  515 + cv::Scalar( 58, 183, 103),
  516 + cv::Scalar( 81, 181, 63),
  517 + cv::Scalar(150, 243, 33),
  518 + cv::Scalar(169, 244, 3),
  519 + cv::Scalar(188, 212, 0),
  520 + cv::Scalar(150, 136, 0),
  521 + cv::Scalar(175, 80, 76),
  522 + cv::Scalar(195, 74, 139),
  523 + cv::Scalar(220, 57, 205),
  524 + cv::Scalar(235, 59, 255),
  525 + cv::Scalar(193, 7, 255),
  526 + cv::Scalar(152, 0, 255),
  527 + cv::Scalar( 87, 34, 255),
  528 + cv::Scalar( 85, 72, 121),
  529 + cv::Scalar(158, 158, 158),
  530 + cv::Scalar(125, 139, 96)
  531 + };
  532 +
  533 + for (size_t i = 0; i < objects.size(); i++)
  534 + {
  535 + const Object& obj = objects[i];
  536 +
  537 + const cv::Scalar& color = colors[i % 19];
  538 +
  539 + // fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
  540 + // obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
  541 +
  542 + for (int y = 0; y < (int)obj.rect.height; y++)
  543 + {
  544 + const uchar* maskptr = obj.mask.ptr<const uchar>(y);
  545 + uchar* bgrptr = rgb.ptr<uchar>((int)obj.rect.y + y) + (int)obj.rect.x * 3;
  546 + for (int x = 0; x < (int)obj.rect.width; x++)
  547 + {
  548 + if (maskptr[x])
  549 + {
  550 + bgrptr[0] = bgrptr[0] * 0.5 + color[0] * 0.5;
  551 + bgrptr[1] = bgrptr[1] * 0.5 + color[1] * 0.5;
  552 + bgrptr[2] = bgrptr[2] * 0.5 + color[2] * 0.5;
  553 + }
  554 + bgrptr += 3;
  555 + }
  556 + }
  557 +
  558 + cv::rectangle(rgb, obj.rect, color);
  559 +
  560 + char text[256];
  561 + sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
  562 +
  563 + int baseLine = 0;
  564 + cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
  565 +
  566 + int x = obj.rect.x;
  567 + int y = obj.rect.y - label_size.height - baseLine;
  568 + if (y < 0)
  569 + y = 0;
  570 + if (x + label_size.width > rgb.cols)
  571 + x = rgb.cols - label_size.width;
  572 +
  573 + cv::rectangle(rgb, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
  574 + cv::Scalar(255, 255, 255), -1);
  575 +
  576 + cv::putText(rgb, text, cv::Point(x, y + label_size.height),
  577 + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
  578 + }
  579 +
  580 + return 0;
  581 +}
  1 +// Tencent is pleased to support the open source community by making ncnn available.
  2 +//
  3 +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
  4 +//
  5 +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 +// in compliance with the License. You may obtain a copy of the License at
  7 +//
  8 +// https://opensource.org/licenses/BSD-3-Clause
  9 +//
  10 +// Unless required by applicable law or agreed to in writing, software distributed
  11 +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12 +// CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13 +// specific language governing permissions and limitations under the License.
  14 +
  15 +#include <android/asset_manager_jni.h>
  16 +#include <android/native_window_jni.h>
  17 +#include <android/native_window.h>
  18 +
  19 +#include <android/log.h>
  20 +
  21 +#include <jni.h>
  22 +
  23 +#include <string>
  24 +#include <vector>
  25 +
  26 +#include <platform.h>
  27 +#include <benchmark.h>
  28 +
  29 +#include "yolov8.h"
  30 +
  31 +#include "ndkcamera.h"
  32 +
  33 +#include <opencv2/core/core.hpp>
  34 +#include <opencv2/imgproc/imgproc.hpp>
  35 +
  36 +#if __ARM_NEON
  37 +#include <arm_neon.h>
  38 +#endif // __ARM_NEON
  39 +
  40 +static int draw_unsupported(cv::Mat& rgb)
  41 +{
  42 + const char text[] = "unsupported";
  43 +
  44 + int baseLine = 0;
  45 + cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 1.0, 1, &baseLine);
  46 +
  47 + int y = (rgb.rows - label_size.height) / 2;
  48 + int x = (rgb.cols - label_size.width) / 2;
  49 +
  50 + cv::rectangle(rgb, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
  51 + cv::Scalar(255, 255, 255), -1);
  52 +
  53 + cv::putText(rgb, text, cv::Point(x, y + label_size.height),
  54 + cv::FONT_HERSHEY_SIMPLEX, 1.0, cv::Scalar(0, 0, 0));
  55 +
  56 + return 0;
  57 +}
  58 +
  59 +static int draw_fps(cv::Mat& rgb)
  60 +{
  61 + // resolve moving average
  62 + float avg_fps = 0.f;
  63 + {
  64 + static double t0 = 0.f;
  65 + static float fps_history[10] = {0.f};
  66 +
  67 + double t1 = ncnn::get_current_time();
  68 + if (t0 == 0.f)
  69 + {
  70 + t0 = t1;
  71 + return 0;
  72 + }
  73 +
  74 + float fps = 1000.f / (t1 - t0);
  75 + t0 = t1;
  76 +
  77 + for (int i = 9; i >= 1; i--)
  78 + {
  79 + fps_history[i] = fps_history[i - 1];
  80 + }
  81 + fps_history[0] = fps;
  82 +
  83 + if (fps_history[9] == 0.f)
  84 + {
  85 + return 0;
  86 + }
  87 +
  88 + for (int i = 0; i < 10; i++)
  89 + {
  90 + avg_fps += fps_history[i];
  91 + }
  92 + avg_fps /= 10.f;
  93 + }
  94 +
  95 + char text[32];
  96 + sprintf(text, "FPS=%.2f", avg_fps);
  97 +
  98 + int baseLine = 0;
  99 + cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
  100 +
  101 + int y = 0;
  102 + int x = rgb.cols - label_size.width;
  103 +
  104 + cv::rectangle(rgb, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
  105 + cv::Scalar(255, 255, 255), -1);
  106 +
  107 + cv::putText(rgb, text, cv::Point(x, y + label_size.height),
  108 + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
  109 +
  110 + return 0;
  111 +}
  112 +
  113 +static YOLOv8* g_yolov8 = 0;
  114 +static ncnn::Mutex lock;
  115 +
  116 +class MyNdkCamera : public NdkCameraWindow
  117 +{
  118 +public:
  119 + virtual void on_image_render(cv::Mat& rgb) const;
  120 +};
  121 +
  122 +void MyNdkCamera::on_image_render(cv::Mat& rgb) const
  123 +{
  124 + // yolov8
  125 + {
  126 + ncnn::MutexLockGuard g(lock);
  127 +
  128 + if (g_yolov8)
  129 + {
  130 + std::vector<Object> objects;
  131 + g_yolov8->detect(rgb, objects);
  132 +
  133 + g_yolov8->draw(rgb, objects);
  134 + }
  135 + else
  136 + {
  137 + draw_unsupported(rgb);
  138 + }
  139 + }
  140 +
  141 + draw_fps(rgb);
  142 +}
  143 +
  144 +static MyNdkCamera* g_camera = 0;
  145 +
  146 +extern "C" {
  147 +
  148 +JNIEXPORT jint JNI_OnLoad(JavaVM* vm, void* reserved)
  149 +{
  150 + __android_log_print(ANDROID_LOG_DEBUG, "ncnn", "JNI_OnLoad");
  151 +
  152 + g_camera = new MyNdkCamera;
  153 +
  154 + ncnn::create_gpu_instance();
  155 +
  156 + return JNI_VERSION_1_4;
  157 +}
  158 +
  159 +JNIEXPORT void JNI_OnUnload(JavaVM* vm, void* reserved)
  160 +{
  161 + __android_log_print(ANDROID_LOG_DEBUG, "ncnn", "JNI_OnUnload");
  162 +
  163 + {
  164 + ncnn::MutexLockGuard g(lock);
  165 +
  166 + delete g_yolov8;
  167 + g_yolov8 = 0;
  168 + }
  169 +
  170 + ncnn::destroy_gpu_instance();
  171 +
  172 + delete g_camera;
  173 + g_camera = 0;
  174 +}
  175 +
  176 +// public native boolean loadModel(AssetManager mgr, int taskid, int modelid, int cpugpu);
  177 +JNIEXPORT jboolean JNICALL Java_com_tencent_yolov8ncnn_YOLOv8Ncnn_loadModel(JNIEnv* env, jobject thiz, jobject assetManager, jint taskid, jint modelid, jint cpugpu)
  178 +{
  179 + if (taskid < 0 || taskid > 5 || modelid < 0 || modelid > 8 || cpugpu < 0 || cpugpu > 2)
  180 + {
  181 + return JNI_FALSE;
  182 + }
  183 +
  184 + AAssetManager* mgr = AAssetManager_fromJava(env, assetManager);
  185 +
  186 + __android_log_print(ANDROID_LOG_DEBUG, "ncnn", "loadModel %p", mgr);
  187 +
  188 + const char* tasknames[6] =
  189 + {
  190 + "",
  191 + "_oiv7",
  192 + "_seg",
  193 + "_pose",
  194 + "_cls",
  195 + "_obb"
  196 + };
  197 +
  198 + const char* modeltypes[9] =
  199 + {
  200 + "n",
  201 + "s",
  202 + "m",
  203 + "n",
  204 + "s",
  205 + "m",
  206 + "n",
  207 + "s",
  208 + "m"
  209 + };
  210 +
  211 + std::string parampath = std::string("yolov8") + modeltypes[(int)modelid] + tasknames[(int)taskid] + ".ncnn.param";
  212 + std::string modelpath = std::string("yolov8") + modeltypes[(int)modelid] + tasknames[(int)taskid] + ".ncnn.bin";
  213 + bool use_gpu = (int)cpugpu == 1;
  214 + bool use_turnip = (int)cpugpu == 2;
  215 +
  216 + // reload
  217 + {
  218 + ncnn::MutexLockGuard g(lock);
  219 +
  220 + {
  221 + static int old_taskid = 0;
  222 + static int old_modelid = 0;
  223 + static int old_cpugpu = 0;
  224 + if (taskid != old_taskid || (modelid % 3) != old_modelid || cpugpu != old_cpugpu)
  225 + {
  226 + // taskid or model or cpugpu changed
  227 + delete g_yolov8;
  228 + g_yolov8 = 0;
  229 + }
  230 + old_taskid = taskid;
  231 + old_modelid = modelid % 3;
  232 + old_cpugpu = cpugpu;
  233 +
  234 + ncnn::destroy_gpu_instance();
  235 +
  236 + if (use_turnip)
  237 + {
  238 + ncnn::create_gpu_instance("libvulkan_freedreno.so");
  239 + }
  240 + else if (use_gpu)
  241 + {
  242 + ncnn::create_gpu_instance();
  243 + }
  244 +
  245 + if (!g_yolov8)
  246 + {
  247 + if (taskid == 0) g_yolov8 = new YOLOv8_det_coco;
  248 + if (taskid == 1) g_yolov8 = new YOLOv8_det_oiv7;
  249 + if (taskid == 2) g_yolov8 = new YOLOv8_seg;
  250 + if (taskid == 3) g_yolov8 = new YOLOv8_pose;
  251 + if (taskid == 4) g_yolov8 = new YOLOv8_cls;
  252 + if (taskid == 5) g_yolov8 = new YOLOv8_obb;
  253 +
  254 + g_yolov8->load(mgr, parampath.c_str(), modelpath.c_str(), use_gpu || use_turnip);
  255 + }
  256 + int target_size = 320;
  257 + if ((int)modelid >= 3)
  258 + target_size = 480;
  259 + if ((int)modelid >= 6)
  260 + target_size = 640;
  261 + g_yolov8->set_det_target_size(target_size);
  262 + }
  263 + }
  264 +
  265 + return JNI_TRUE;
  266 +}
  267 +
  268 +// public native boolean openCamera(int facing);
  269 +JNIEXPORT jboolean JNICALL Java_com_tencent_yolov8ncnn_YOLOv8Ncnn_openCamera(JNIEnv* env, jobject thiz, jint facing)
  270 +{
  271 + if (facing < 0 || facing > 1)
  272 + return JNI_FALSE;
  273 +
  274 + __android_log_print(ANDROID_LOG_DEBUG, "ncnn", "openCamera %d", facing);
  275 +
  276 + g_camera->open((int)facing);
  277 +
  278 + return JNI_TRUE;
  279 +}
  280 +
  281 +// public native boolean closeCamera();
  282 +JNIEXPORT jboolean JNICALL Java_com_tencent_yolov8ncnn_YOLOv8Ncnn_closeCamera(JNIEnv* env, jobject thiz)
  283 +{
  284 + __android_log_print(ANDROID_LOG_DEBUG, "ncnn", "closeCamera");
  285 +
  286 + g_camera->close();
  287 +
  288 + return JNI_TRUE;
  289 +}
  290 +
  291 +// public native boolean setOutputWindow(Surface surface);
  292 +JNIEXPORT jboolean JNICALL Java_com_tencent_yolov8ncnn_YOLOv8Ncnn_setOutputWindow(JNIEnv* env, jobject thiz, jobject surface)
  293 +{
  294 + ANativeWindow* win = ANativeWindow_fromSurface(env, surface);
  295 +
  296 + __android_log_print(ANDROID_LOG_DEBUG, "ncnn", "setOutputWindow %p", win);
  297 +
  298 + g_camera->set_window(win);
  299 +
  300 + return JNI_TRUE;
  301 +}
  302 +
  303 +}