import time import cupy_backends.cuda.api.runtime import cv2 from numba import njit, prange, types, int64, typed from simple_decode import decode from numpy import fromfile, uint8, float32 import numpy as np from cupy.cuda import UnownedMemory, MemoryPointer, set_allocator, stream from cupy import ndarray, stack, pad, ascontiguousarray, asnumpy from cupyx.scipy.ndimage import zoom from torch import Tensor, from_dlpack from os import listdir, path from more_itertools import chunked from onnxruntime import InferenceSession import math from matplotlib import pyplot from PIL import Image set_allocator() IMAGE_SIZE = 640 BATCH_SIZE = 1 def resize_image(image: ndarray, scales: list[float], stz: list[stream.Event], stream: stream.Stream) -> ndarray: with stream: if image.shape[1] <= IMAGE_SIZE and image.shape[2] <= IMAGE_SIZE: scale = 1.0 pass else: scale = ((IMAGE_SIZE - 1) / max(image.shape[1], image.shape[2])) image = zoom(image, (1, scale, scale), mode="constant") ret = pad(image, pad_width=[ (0, 0), (0, IMAGE_SIZE - image.shape[1]), (0, IMAGE_SIZE - image.shape[2]) ]) scales.append(scale) stz.append(stream.record()) return ret # @njit def prior_box(min_sizes: list[list[int]], steps: list[int], clip: bool, image_sizes: list[int]) -> np.ndarray: feature_maps = [[math.ceil(image_sizes[0] / step), math.ceil(image_sizes[1] / step)] for step in steps] anchors = [] for k, f in enumerate(feature_maps): min_sizes_k = min_sizes[k] for i in range(f[0]): for j in range(f[1]): for min_size in min_sizes_k: s_kx = min_size / image_sizes[1] s_ky = min_size / image_sizes[0] dense_cx = [x * steps[k] / image_sizes[1] for x in [j + 0.5]] dense_cy = [y * steps[k] / image_sizes[0] for y in [i + 0.5]] for cy in dense_cy: for cx in dense_cx: anchors.append([cx, cy, s_kx, s_ky]) # for cy, cx in np.nditer([np.array(dense_cy), np.array(dense_cx).reshape((-1, 1))]): output = np.array(anchors) if clip: output.clip(.0, 1.0) return output # @njit def loc_decode(loc: np.ndarray, priors: np.ndarray, variances: list[float]): boxes = np.concatenate(( priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], priors[:, 2:] * np.exp(loc[:, 2:] * variances[1])), 1) boxes[:, :2] -= boxes[:, 2:] / 2 boxes[:, 2:] += boxes[:, :2] return boxes # @njit def decode_landm(pre: np.ndarray, priors: np.ndarray, variances: list[float]): return np.concatenate((priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:], priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:], priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:], priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:], priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:], ), axis=1) # @njit def py_cpu_nms(dets: np.ndarray, thresh: float): x1 = dets[:, 0] y1 = dets[:, 1] x2 = dets[:, 2] y2 = dets[:, 3] scores = dets[:, 4] areas = (x2 - x1 + 1) * (y2 - y1 + 1) order = scores.argsort()[::-1] keep = [] while order.size > 0: i = order[0] keep.append(i) xx1 = np.maximum(x1[i], x1[order[1:]]) yy1 = np.maximum(y1[i], y1[order[1:]]) xx2 = np.minimum(x2[i], x2[order[1:]]) yy2 = np.minimum(y2[i], y2[order[1:]]) w = np.maximum(0.0, xx2 - xx1 + 1) h = np.maximum(0.0, yy2 - yy1 + 1) inter = w * h ovr = inter / (areas[i] + areas[order[1:]] - inter) inds = np.where(ovr <= thresh)[0] order = order[inds + 1] return keep # @njit def softmax(x, axis=-1): x_max = np.max(x, axis=axis, keepdims=True) exp_x = np.exp(x - x_max) return exp_x / np.sum(exp_x, axis=axis, keepdims=True) # @njit def post_process(landms: np.ndarray, conf: np.ndarray, loc: np.ndarray, image_sizes: list[int], resize_scale: float, confidence_threshold: float, top_k: int, nms_threshold: float, keep_top_k: int) -> np.ndarray: priors = prior_box(min_sizes=[[16, 32], [64, 128], [256, 512]], steps=[8, 16, 32], clip=False, image_sizes=image_sizes) boxes = loc_decode(loc, priors, [0.1, 0.2]) boxes_scale = np.array([image_sizes[1], image_sizes[0]] * 2) boxes = boxes * boxes_scale / resize_scale conf = softmax(conf) scores = conf[:, 1] landms = decode_landm(landms, priors, [0.1, 0.2]) landms_scale = np.array([image_sizes[1], image_sizes[0]] * 5) landms = landms * landms_scale / resize_scale inds = np.where(scores > confidence_threshold)[0] boxes = boxes[inds] landms = landms[inds] scores = scores[inds] order = scores.argsort()[::-1][:top_k] boxes = boxes[order] landms = landms[order] scores = scores[order] dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) keep = py_cpu_nms(dets, nms_threshold) dets = dets[keep, :] landms = landms[keep] dets = dets[:keep_top_k, :] landms = landms[:keep_top_k, :] dets = np.concatenate((dets, landms), axis=1) return dets # @njit(parallel=True) def post_loop(landms: np.ndarray, conf: np.ndarray, loc: np.ndarray, resize_scales: list[float]): res = [] for i in range(BATCH_SIZE): rep = post_process(landms=landms[i, :, :], conf=conf[i, :, :], loc=loc[i, :, :], image_sizes=[IMAGE_SIZE, IMAGE_SIZE], resize_scale=resize_scales[i], confidence_threshold=0.4, top_k=5000, nms_threshold=0.4, keep_top_k=750) res.append(rep) return res root_dir = r"D:\helloproject-ai-data\blog_images" model_path = r"C:\Users\tomokazu\build\retinaface\retinaface_only_nn.onnx" # session = InferenceSession( # path_or_bytes=model_path, # providers=[ # ('TensorrtExecutionProvider', { # 'trt_engine_cache_enable': True, # 'trt_engine_cache_path': 'trt_cache', # 'trt_fp16_enable': True, # 'trt_profile_min_shapes': f'input:1x3x{IMAGE_SIZE}x{IMAGE_SIZE}', # 'trt_profile_max_shapes': f'input:{BATCH_SIZE}x3x{IMAGE_SIZE}x{IMAGE_SIZE}', # 'trt_profile_opt_shapes': f'input:{BATCH_SIZE}x3x{IMAGE_SIZE}x{IMAGE_SIZE}', # }), # 'CUDAExecutionProvider', # 'CPUExecutionProvider' # ] # ) # from matplotlib import pyplot files = [] for name in listdir(root_dir): if name != "真野恵里菜": continue pass for file_name in listdir(path.join(root_dir, name)): files.append(path.join(root_dir, name, file_name)) # files = files[:32] for file_chunk in chunked(files, BATCH_SIZE): stacks = [] ptrs = [] filenames = [] # st1 = stream.Stream() stz = [] for st1, file in zip([stream.Stream() for _ in range(BATCH_SIZE)], file_chunk): with st1: start = time.time() _input = fromfile(file=file, dtype=uint8) if _input.shape[0] == 0: continue filenames.append(file) try: ptr, (_, (width, height)) = decode(_input, st1.ptr) ptrs.append(ptr) # print(ptr, width, height) unownedmemory = UnownedMemory(ptr, height * width * 3 * uint8().itemsize, None) gpu_arr = ndarray((height * width * 3,), dtype=float32, memptr=MemoryPointer(unownedmemory, 0)) gpu_image: ndarray = gpu_arr.reshape((3, height, width)) except: pil_im = Image.open(file) pil_im.width stacks.append(gpu_image) # tens: Tensor = from_dlpack(gpu_image) print(file, gpu_image.shape, time.time() - start, sep='\t') # tens.cpu() stz.append(st1.record()) [st.synchronize() for st in stz] max_height = max([i.shape[1] for i in stacks]) max_width = max([i.shape[2] for i in stacks]) if stacks.__len__() != BATCH_SIZE: stacks.extend([ndarray([3, IMAGE_SIZE, IMAGE_SIZE])] * (BATCH_SIZE - stacks.__len__())) resize_scales = [] stz = [] stacked_images = stack([resize_image(gpu_image, resize_scales, stz, stream.Stream()) for gpu_image in stacks]) print(stacked_images.shape) contiguous_stacked = ascontiguousarray(stacked_images) [st.synchronize() for st in stz] # io_binding = session.io_binding() # io_binding.bind_input( # name="input", # device_type='cuda', # device_id=stacked_images.device, # element_type=float32, # shape=tuple(stacked_images.shape), # buffer_ptr=contiguous_stacked.data.ptr # ) # io_binding.bind_output("landmark") # io_binding.bind_output("confidence") # io_binding.bind_output("bbox") # session.run_with_iobinding(iobinding=io_binding) # landms, conf, loc = io_binding.copy_outputs_to_cpu() # start = time.time() # detected = post_loop(landms, conf, loc, resize_scales) # print(f"post process time: {time.time() - start}") # for (i, filename, gpu_image) in zip(detected, filenames, stacks): # print(f'"{filename}"') # host_image = (asnumpy(gpu_image).transpose((1, 2, 0)) * 255).astype(uint8).copy() # print(host_image.shape) # for j in i.tolist(): # j = [int(_j) for _j in j] # cv2.rectangle(host_image, (j[0], j[1]), (j[2], j[3]), (255, 0, 0), 2, cv2.LINE_AA) # cv2.putText(host_image, str(j[4]), (0, 0), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 0, 0)) # for k in range(5): # cv2.circle(host_image, (j[5 + k * 2], j[5 + k * 2 + 1]), 2, (255, 0, 0)) # # print(j) # # pyplot.imshow(host_image) # # pyplot.show() # # time.sleep(.5) [cupy_backends.cuda.api.runtime.free(ptr) for ptr in ptrs] # host_im = gpu_image.get() # pyplot.imshow(host_im.transpose((1, 2, 0))) # pyplot.show()