update

2024-08-17 21:07:55 +09:00 · 2024-08-17 21:07:55 +09:00 · 70bb7fa5ba
parent 435adff46b
commit 70bb7fa5ba
2 changed files with 428 additions and 0 deletions
--- a/batch_retinaface.py
+++ b/batch_retinaface.py
@ -0,0 +1,275 @@
 import time
 import cupy_backends.cuda.api.runtime
 import cv2
 from numba import njit, prange, types, int64, typed
 from simple_decode import decode
 from numpy import fromfile, uint8, float32
 import numpy as np
 from cupy.cuda import UnownedMemory, MemoryPointer, set_allocator, stream
 from cupy import ndarray, stack, pad, ascontiguousarray, asnumpy
 from cupyx.scipy.ndimage import zoom
 from torch import Tensor, from_dlpack
 from os import listdir, path
 from more_itertools import chunked
 from onnxruntime import InferenceSession
 import math
 from matplotlib import pyplot
 from PIL import Image
 # Module-wide configuration for the batch face-detection experiment.
 # set_allocator() is called with no argument, i.e. CuPy's allocator is reset to
 # its "None" setting. NOTE(review): per the CuPy docs this presumably disables
 # the default memory pool - confirm that this is the intent.
 set_allocator()
 # Side length (pixels) of the square canvas images are resized/padded to; also
 # passed to the post-processing as the network input size.
 IMAGE_SIZE = 640
 # Number of files handled per loop iteration (chunk size, stream count and the
 # number of samples post_loop iterates over).
 BATCH_SIZE = 1
 def resize_image(image: ndarray, scales: list[float], stz: list[stream.Event], stream: stream.Stream) -> ndarray:
    """Fit a GPU image into an IMAGE_SIZE x IMAGE_SIZE canvas on the given stream.

    The image is indexed as (axis0, axis1, axis2); only axes 1 and 2 are
    rescaled and padded, axis 0 is left untouched.

    Args:
        image: array whose axes 1 and 2 are the spatial dimensions.
        scales: output list; the scale factor that was applied (1.0 when the
            image already fits) is appended to it.
        stz: output list; an event recorded on ``stream`` after the work has
            been queued is appended to it so the caller can synchronize later.
        stream: CUDA stream used as the current stream for the resize and pad.

    Returns:
        The image padded at the end of axes 1 and 2 up to IMAGE_SIZE.
    """
    with stream:
        longest_side = max(image.shape[1], image.shape[2])
        scale = 1.0
        if longest_side > IMAGE_SIZE:
            # Target IMAGE_SIZE - 1 for the longest side; presumably this leaves
            # slack so rounding inside zoom cannot overshoot the canvas.
            scale = ((IMAGE_SIZE - 1) / longest_side)
            image = zoom(image, (1, scale, scale), mode="constant")
        bottom_padding = IMAGE_SIZE - image.shape[1]
        right_padding = IMAGE_SIZE - image.shape[2]
        padded = pad(image, pad_width=[(0, 0), (0, bottom_padding), (0, right_padding)])
        scales.append(scale)
        stz.append(stream.record())
        return padded
 # @njit
 def prior_box(min_sizes: list[list[int]], steps: list[int], clip: bool, image_sizes: list[int]) -> np.ndarray:
    """Generate the anchor (prior) boxes for every feature-map level.

    For each level ``k`` a grid of ``ceil(image_sizes[0] / steps[k])`` by
    ``ceil(image_sizes[1] / steps[k])`` cells is walked row by row, and one
    anchor per entry of ``min_sizes[k]`` is emitted for every cell.

    Args:
        min_sizes: anchor side lengths in pixels, one list per level.
        steps: stride in pixels of each level.
        clip: when True, every coordinate is limited to the range [0, 1].
        image_sizes: ``[size_y, size_x]`` of the network input in pixels.

    Returns:
        Float array of shape ``(num_anchors, 4)`` whose rows are
        ``[cx, cy, size_x, size_y]``, all normalised by the image size.
    """
    size_y, size_x = image_sizes[0], image_sizes[1]
    anchors = []
    for k, step in enumerate(steps):
        rows = math.ceil(size_y / step)
        cols = math.ceil(size_x / step)
        level_sizes = min_sizes[k]
        for i in range(rows):
            # Anchor centres sit in the middle of each grid cell.
            cy = (i + 0.5) * step / size_y
            for j in range(cols):
                cx = (j + 0.5) * step / size_x
                for min_size in level_sizes:
                    anchors.append([cx, cy, min_size / size_x, min_size / size_y])
    # reshape keeps the (N, 4) layout even when no anchors were produced.
    output = np.array(anchors, dtype=np.float64).reshape(-1, 4)
    if clip:
        # ndarray.clip returns a new array; the result has to be kept, otherwise
        # the clip request is silently ignored.
        output = output.clip(0.0, 1.0)
    return output
 # @njit
 def loc_decode(loc: np.ndarray, priors: np.ndarray, variances: list[float]):
    """Turn box regression offsets into corner-format boxes.

    Args:
        loc: ``(N, 4)`` offsets; columns 0-1 shift the centre, columns 2-3
            scale the size exponentially.
        priors: ``(N, 4)`` anchors as ``[cx, cy, w, h]``.
        variances: two factors; ``variances[0]`` scales the centre offsets and
            ``variances[1]`` the size offsets.

    Returns:
        ``(N, 4)`` array of ``[x1, y1, x2, y2]`` in the priors' coordinate units.
    """
    prior_centers = priors[:, :2]
    prior_sizes = priors[:, 2:]
    centers = prior_centers + loc[:, :2] * variances[0] * prior_sizes
    sizes = prior_sizes * np.exp(loc[:, 2:] * variances[1])
    # Convert centre/size to the two opposite corners.
    top_left = centers - sizes / 2
    bottom_right = sizes + top_left
    return np.concatenate((top_left, bottom_right), 1)
 # @njit
 def decode_landm(pre: np.ndarray, priors: np.ndarray, variances: list[float]):
    """Decode five landmark points from their regression offsets.

    Args:
        pre: ``(N, 10)`` offsets, laid out as five consecutive (x, y) pairs.
        priors: ``(N, 4)`` anchors as ``[cx, cy, w, h]``.
        variances: only ``variances[0]`` is used, as the offset scale.

    Returns:
        ``(N, 10)`` array of landmark coordinates in the priors' units.
    """
    anchor_centers = priors[:, :2]
    anchor_sizes = priors[:, 2:]
    # Each point is the anchor centre plus its offset scaled by the anchor size.
    points = [
        anchor_centers + pre[:, first:first + 2] * variances[0] * anchor_sizes
        for first in range(0, 10, 2)
    ]
    return np.concatenate(points, axis=1)
 # @njit
 def py_cpu_nms(dets: np.ndarray, thresh: float):
    """Greedy non-maximum suppression on the CPU.

    Args:
        dets: ``(N, >=5)`` array with columns ``x1, y1, x2, y2, score``.
        thresh: boxes whose overlap ratio with an already kept box exceeds
            this value are discarded.

    Returns:
        List of row indices into ``dets`` that survive, highest score first.
    """
    left, top, right, bottom = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    # Widths and heights use the inclusive "+ 1" pixel convention.
    box_areas = (right - left + 1) * (bottom - top + 1)
    remaining = dets[:, 4].argsort()[::-1]
    keep = []
    while remaining.size > 0:
        best, rest = remaining[0], remaining[1:]
        keep.append(best)
        overlap_w = np.maximum(0.0, np.minimum(right[best], right[rest]) - np.maximum(left[best], left[rest]) + 1)
        overlap_h = np.maximum(0.0, np.minimum(bottom[best], bottom[rest]) - np.maximum(top[best], top[rest]) + 1)
        inter = overlap_w * overlap_h
        ovr = inter / (box_areas[best] + box_areas[rest] - inter)
        # Only candidates that do not overlap the kept box too much stay in play.
        remaining = rest[ovr <= thresh]
    return keep
 # @njit
 def softmax(x, axis=-1):
    """Return the softmax of ``x`` along ``axis``.

    The maximum along the axis is subtracted before exponentiating so large
    inputs do not overflow.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=axis, keepdims=True)
 # @njit
 def post_process(landms: np.ndarray, conf: np.ndarray, loc: np.ndarray, image_sizes: list[int], resize_scale: float,
                  confidence_threshold: float, top_k: int, nms_threshold: float, keep_top_k: int) -> np.ndarray:
    """Convert the raw network outputs of one image into final detections.

    Args:
        landms: ``(N, 10)`` landmark offsets.
        conf: ``(N, 2)`` class logits; column 1 is used as the face score
            after softmax.
        loc: ``(N, 4)`` box offsets.
        image_sizes: ``[size_y, size_x]`` of the network input in pixels.
        resize_scale: factor the source image was scaled by; results are
            divided by it to return to source-image coordinates.
        confidence_threshold: minimum score for a candidate to be considered.
        top_k: number of best-scoring candidates passed on to NMS.
        nms_threshold: overlap threshold handed to ``py_cpu_nms``.
        keep_top_k: maximum number of detections returned.

    Returns:
        ``(M, 15)`` array: four box corners, the score, then ten landmark
        coordinates per row.
    """
    variances = [0.1, 0.2]
    xy_scale = [image_sizes[1], image_sizes[0]]
    priors = prior_box(min_sizes=[[16, 32], [64, 128], [256, 512]], steps=[8, 16, 32], clip=False,
                       image_sizes=image_sizes)

    # Decode to pixels of the network input, then undo the resize.
    boxes = loc_decode(loc, priors, variances) * np.array(xy_scale * 2) / resize_scale
    points = decode_landm(landms, priors, variances) * np.array(xy_scale * 5) / resize_scale
    scores = softmax(conf)[:, 1]

    # Candidates above the threshold, best score first, capped at top_k.
    confident = np.where(scores > confidence_threshold)[0]
    ranked = confident[scores[confident].argsort()[::-1][:top_k]]
    boxes, points, scores = boxes[ranked], points[ranked], scores[ranked]

    dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False)
    keep = py_cpu_nms(dets, nms_threshold)[:keep_top_k]
    return np.concatenate((dets[keep, :], points[keep]), axis=1)
 # @njit(parallel=True)
 def post_loop(landms: np.ndarray, conf: np.ndarray, loc: np.ndarray, resize_scales: list[float]):
    """Run ``post_process`` on each of the first BATCH_SIZE samples of a batch.

    Args:
        landms, conf, loc: batched network outputs, indexed by sample on axis 0.
        resize_scales: per-sample resize factor, indexed like the batch.

    Returns:
        List with one detection array per sample, in batch order.
    """
    return [
        post_process(landms=landms[sample, :, :], conf=conf[sample, :, :], loc=loc[sample, :, :],
                     image_sizes=[IMAGE_SIZE, IMAGE_SIZE],
                     resize_scale=resize_scales[sample], confidence_threshold=0.4, top_k=5000,
                     nms_threshold=0.4, keep_top_k=750)
        for sample in range(BATCH_SIZE)
    ]
 # Directory that is listed below; each sub-directory name is compared against
 # the folder filter and its files become the inputs of the run.
 root_dir = r"D:\helloproject-ai-data\blog_images"
 # ONNX model file. Only referenced by the InferenceSession set-up that is
 # currently commented out below.
 model_path = r"C:\Users\tomokazu\build\retinaface\retinaface_only_nn.onnx"
 # session = InferenceSession(
 #     path_or_bytes=model_path,
 #     providers=[
 #         ('TensorrtExecutionProvider', {
 #             'trt_engine_cache_enable': True,
 #             'trt_engine_cache_path': 'trt_cache',
 #             'trt_fp16_enable': True,
 #             'trt_profile_min_shapes': f'input:1x3x{IMAGE_SIZE}x{IMAGE_SIZE}',
 #             'trt_profile_max_shapes': f'input:{BATCH_SIZE}x3x{IMAGE_SIZE}x{IMAGE_SIZE}',
 #             'trt_profile_opt_shapes': f'input:{BATCH_SIZE}x3x{IMAGE_SIZE}x{IMAGE_SIZE}',
 #         }),
 #         'CUDAExecutionProvider',
 #         'CPUExecutionProvider'
 #     ]
 # )
 # from matplotlib import pyplot
 # Collect the full path of every file inside the one sub-directory of
 # root_dir that this run is restricted to.
 files = []
 for name in listdir(root_dir):
    if name == "真野恵里菜":
        member_dir = path.join(root_dir, name)
        files.extend(path.join(member_dir, file_name) for file_name in listdir(member_dir))
 # files = files[:32]
 # Main loop: decode each chunk of files on the GPU (one CUDA stream per file),
 # resize/pad the images into one batch and finally release the decoder memory.
 for file_chunk in chunked(files, BATCH_SIZE):
    stacks = []
    ptrs = []
    filenames = []
    stz = []
    try:
        for st1, file in zip([stream.Stream() for _ in range(BATCH_SIZE)], file_chunk):
            with st1:
                start = time.time()
                _input = fromfile(file=file, dtype=uint8)
                if _input.shape[0] == 0:
                    # Empty file: nothing to decode.
                    continue
                try:
                    ptr, (_, (width, height)) = decode(_input, st1.ptr)
                    # Remember the raw pointer immediately so it is freed even if
                    # wrapping it below fails.
                    ptrs.append(ptr)
                    # NOTE(review): the byte size uses the uint8 item size while the
                    # array created over it is float32 - confirm what `decode`
                    # actually writes and make the two agree.
                    unownedmemory = UnownedMemory(ptr, height * width * 3 * uint8().itemsize, None)
                    gpu_arr = ndarray((height * width * 3,), dtype=float32, memptr=MemoryPointer(unownedmemory, 0))
                    gpu_image: ndarray = gpu_arr.reshape((3, height, width))
                except Exception as exc:
                    # Previously a bare `except:` fell through with `gpu_image`
                    # unbound (or still holding the previous file's image).
                    # Skip files that cannot be decoded instead.
                    print(f"skipping {file}: GPU decode failed ({exc!r})")
                    continue
                # Only successfully decoded files are recorded, so `filenames`
                # and `stacks` stay aligned.
                filenames.append(file)
                stacks.append(gpu_image)
                print(file, gpu_image.shape, time.time() - start, sep='\t')
            stz.append(st1.record())
        for st in stz:
            st.synchronize()
        if not stacks:
            # Nothing usable in this chunk.
            continue
        if len(stacks) != BATCH_SIZE:
            # Pad short batches with zero-filled float32 images (the old
            # placeholder was uninitialised float64 memory).
            placeholder = ndarray((3, IMAGE_SIZE, IMAGE_SIZE), dtype=float32)
            placeholder.fill(0)
            stacks.extend([placeholder] * (BATCH_SIZE - len(stacks)))
        resize_scales = []
        stz = []
        stacked_images = stack([resize_image(gpu_image, resize_scales, stz, stream.Stream()) for gpu_image in stacks])
        print(stacked_images.shape)
        contiguous_stacked = ascontiguousarray(stacked_images)
        for st in stz:
            st.synchronize()
        # Inference and visualisation are currently disabled; kept for reference.
        # io_binding = session.io_binding()
        # io_binding.bind_input(
        #     name="input",
        #     device_type='cuda',
        #     device_id=stacked_images.device,
        #     element_type=float32,
        #     shape=tuple(stacked_images.shape),
        #     buffer_ptr=contiguous_stacked.data.ptr
        # )
        # io_binding.bind_output("landmark")
        # io_binding.bind_output("confidence")
        # io_binding.bind_output("bbox")
        # session.run_with_iobinding(iobinding=io_binding)
        # landms, conf, loc = io_binding.copy_outputs_to_cpu()
        # start = time.time()
        # detected = post_loop(landms, conf, loc, resize_scales)
        # print(f"post process time: {time.time() - start}")
        # for (i, filename, gpu_image) in zip(detected, filenames, stacks):
        #     print(f'"{filename}"')
        #     host_image = (asnumpy(gpu_image).transpose((1, 2, 0)) * 255).astype(uint8).copy()
        #     print(host_image.shape)
        #     for j in i.tolist():
        #         j = [int(_j) for _j in j]
        #         cv2.rectangle(host_image, (j[0], j[1]), (j[2], j[3]), (255, 0, 0), 2, cv2.LINE_AA)
        #         cv2.putText(host_image, str(j[4]), (0, 0), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 0, 0))
        #         for k in range(5):
        #             cv2.circle(host_image, (j[5 + k * 2], j[5 + k * 2 + 1]), 2, (255, 0, 0))
        #
        #         print(j)
        #     # pyplot.imshow(host_image)
        #     # pyplot.show()
        #     # time.sleep(.5)
    finally:
        # The decoder hands out raw device pointers that CuPy does not own, so
        # they are freed explicitly - also when the chunk failed or was skipped.
        for ptr in ptrs:
            cupy_backends.cuda.api.runtime.free(ptr)
 # host_im = gpu_image.get()
 # pyplot.imshow(host_im.transpose((1, 2, 0)))
 # pyplot.show()
--- a/test_script/torchivision_jpeg_decode.py
+++ b/test_script/torchivision_jpeg_decode.py
@ -0,0 +1,153 @@
 import os
 from concurrent.futures.process import ProcessPoolExecutor
 from multiprocessing import shared_memory
 from io import BytesIO
 from os import listdir, path, pathsep, makedirs
 from pprint import pprint
 import more_itertools
 import numpy as np
 import tqdm
 from PIL import Image
 from numpy import ndarray
 from onnxruntime import InferenceSession
 from torch import tensor
 import aiofiles
 import numpy
 import torch
 from torchvision.io import decode_jpeg
 from asyncio import run, gather, Semaphore
 from site import getsitepackages
 from rust_retinaface_post_processor import resnet_post_process
 # Prepend the "tensorrt_libs" folder of the last site-packages directory to the
 # "Path" environment variable. NOTE(review): presumably so the TensorRT
 # libraries can be found when the execution provider loads - confirm.
 os.environ["Path"] = path.join(getsitepackages()[-1], "tensorrt_libs") + pathsep + os.environ["Path"]
 # Directory whose sub-directories are scanned for images.
 root_dir = r"D:\helloproject-ai-data\blog_images"
 # ONNX model loaded by the InferenceSession in the __main__ section.
 model_path = r"C:\Users\tomokazu\build\retinaface\retinaface_only_nn_fp16.onnx"
 # makedirs("memmap", exist_ok=True)
 files = []
 # Raw file contents keyed by file name; rebuilt for every sub-directory.
 files_data: dict[str, numpy.ndarray | None] = {}
 # Number of images per inference batch (also the TensorRT max/opt batch size).
 chunk_size = 32
 # Side length in pixels of the square network input.
 image_size = 640
 # Device that decoded images and padding tensors are placed on.
 device = torch.device("cuda")
 async def async_read(path: str, semaphore: Semaphore):
    """Read a whole file as bytes while holding ``semaphore``.

    Args:
        path: file to open in binary mode.
        semaphore: limits how many reads are in flight at once.

    Returns:
        The complete file contents.
    """
    async with semaphore, aiofiles.open(file=path, mode="rb") as fp:
        contents = await fp.read()
    return contents
 async def gather_runner(l: list, fn):
    """Run ``fn(item, semaphore)`` concurrently for every item of ``l``.

    All calls share one semaphore with 2048 slots.

    Returns:
        The results in the same order as ``l``.
    """
    limiter = Semaphore(2048)
    pending = [fn(item, limiter) for item in l]
    return await gather(*pending)
 def post_processor(outputs, batch_size, image_size):
    """Hand the network outputs to ``resnet_post_process`` by memory address.

    Every output is converted to a C-contiguous float32 array first; the
    addresses of those arrays' data buffers are what gets passed on.

    Args:
        outputs: iterable of numpy arrays produced by the model.
        batch_size: forwarded unchanged.
        image_size: forwarded unchanged.

    Returns:
        Whatever ``resnet_post_process`` returns.
    """
    # The converted arrays stay referenced until the call returns, so the
    # addresses remain valid for its duration.
    buffers = []
    for output in outputs:
        buffers.append(numpy.ascontiguousarray(output.astype(numpy.float32)))
    addresses = [buffer.__array_interface__["data"][0] for buffer in buffers]
    return resnet_post_process(addresses, batch_size, image_size)
 def post_processor_memmap(tmp_filename, sizes, batch_size, image_size):
    """Post-process outputs that were stored as float16 memmap files.

    For every index ``i`` of ``sizes`` the file ``memmap/<tmp_filename><i>`` is
    opened read-only with shape ``sizes[i]``, converted to a C-contiguous
    float32 array, and its data address is passed to ``resnet_post_process``.

    Args:
        tmp_filename: common prefix of the memmap file names.
        sizes: shape of each stored output, in file order.
        batch_size: forwarded unchanged.
        image_size: forwarded unchanged.

    Returns:
        Whatever ``resnet_post_process`` returns.
    """
    buffers = []
    for order, size in enumerate(sizes):
        mapped = numpy.memmap(filename=path.join("memmap", tmp_filename + str(order)), dtype=numpy.float16,
                              mode="r", shape=size)
        buffers.append(numpy.ascontiguousarray(mapped.astype(numpy.float32)))
    addresses = [buffer.__array_interface__["data"][0] for buffer in buffers]
    return resnet_post_process(addresses, batch_size, image_size)
 def post_processor_shm(shm_name, sizes, batch_size, image_size):
    """Post-process outputs that were passed through shared memory.

    One existing segment named ``<shm_name>_<i>`` is attached for every entry
    of ``sizes``. Its content is read as a float16 array of that shape and
    copied into a C-contiguous float32 array, whose data address is passed to
    ``resnet_post_process``.

    Args:
        shm_name: common prefix of the shared-memory segment names.
        sizes: shape of each stored output, in segment order.
        batch_size: forwarded unchanged.
        image_size: forwarded unchanged.

    Returns:
        Whatever ``resnet_post_process`` returns.
    """
    sizes = list(sizes)
    shms = []
    try:
        # Attach one segment per expected output rather than a fixed three, so
        # the count always matches what the caller described.
        for index in range(len(sizes)):
            shms.append(shared_memory.SharedMemory(name=shm_name + "_" + str(index)))
        # astype() copies the data out of the segment; the temporary view on
        # shm.buf is dropped right away, so close() below cannot fail because
        # of a still-exported buffer.
        outputs = [
            numpy.ascontiguousarray(
                numpy.ndarray(shape=size, dtype=numpy.float16, buffer=shm.buf).astype(numpy.float32))
            for size, shm in zip(sizes, shms)
        ]
        return resnet_post_process([output.__array_interface__["data"][0] for output in outputs],
                                   batch_size, image_size)
    finally:
        # Detach this process's handles even when attaching or processing
        # failed. The segments are not unlinked here; that is left to their
        # creator.
        for shm in shms:
            shm.close()
 if __name__ == '__main__':
    from kornia.augmentation import LongestMaxSize, PadTo, Normalize
    from kornia.constants import Resample


    def _finish_batch(done, segments):
        """Release a batch's shared-memory segments and report a failed worker."""
        for segment in segments:
            segment.close()
            segment.unlink()
        if not done.cancelled() and done.exception() is not None:
            print(f"post-processing failed: {done.exception()!r}", flush=True)


    # Pre-processing: scale the longest side to image_size, pad to a square and
    # apply the mean/std normalisation below.
    longest_max_size = LongestMaxSize(max_size=image_size, resample=Resample.NEAREST)
    pad_to = PadTo(size=(image_size, image_size), pad_value=1.)
    normalize = Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    session = InferenceSession(
        path_or_bytes=model_path,
        providers=[
            ('TensorrtExecutionProvider', {
                'trt_engine_cache_enable': True,
                'trt_engine_cache_path': 'trt_cache',
                'trt_fp16_enable': True,
                'trt_profile_min_shapes': f'input:1x3x{image_size}x{image_size}',
                'trt_profile_max_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}',
                'trt_profile_opt_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}',
            }),
            'CUDAExecutionProvider',
            'CPUExecutionProvider'
        ]
    )
    # One worker pool for the whole run instead of a new pool per directory.
    with ProcessPoolExecutor(max_workers=4) as executor:
        for name in listdir(root_dir):
            # Debug filter (disabled): skip every folder except "下井谷幸穂".
            file_names = listdir(path.join(root_dir, name))
            name_files = [path.join(root_dir, name, file_name) for file_name in file_names]
            # Read every file of the folder up front, asynchronously.
            files_data = {file_name: numpy.frombuffer(dat, dtype=numpy.uint8) for file_name, dat in
                          zip(file_names, run(gather_runner(name_files, async_read)))}
            for cnk in more_itertools.chunked(tqdm.tqdm(files_data.items(), desc=name), n=chunk_size):
                stack = []
                tmp_file_name = ""
                for file, dat in cnk:
                    # The last file name of the chunk doubles as the prefix of
                    # the shared-memory segment names.
                    tmp_file_name = file
                    try:
                        decoded_image = decode_jpeg(tensor(dat), device=device)
                    except Exception:
                        # Fall back to PIL when torchvision cannot decode the data.
                        decoded_image = tensor(numpy.array(Image.open(BytesIO(dat.tobytes()))).transpose([2, 0, 1])).to(
                            device)
                    decoded_image = decoded_image.to(torch.float16) / 255
                    decoded_image = normalize(decoded_image)
                    decoded_image_resized = longest_max_size(decoded_image)
                    decoded_image_padded = pad_to(decoded_image_resized)
                    stack.append(decoded_image_padded.squeeze())
                # Pad a short final chunk with blank images so the batch always
                # has chunk_size entries.
                while len(stack) < chunk_size:
                    stack.append(torch.zeros(size=[3, image_size, image_size], dtype=torch.float16, device=device))
                stacked = torch.stack(stack).contiguous()
                io_binding = session.io_binding()
                io_binding.bind_input(
                    name="input",
                    device_type=stacked.device.type,
                    device_id=stacked.device.index,
                    element_type='float16',
                    shape=tuple(stacked.shape),
                    buffer_ptr=stacked.data_ptr()
                )
                io_binding.bind_output("landmark")
                io_binding.bind_output("confidence")
                io_binding.bind_output("bbox")
                session.run_with_iobinding(iobinding=io_binding)
                outputs: list[numpy.ndarray] = io_binding.copy_outputs_to_cpu()
                # Hand the outputs to the worker through named shared memory.
                shared_array: list[shared_memory.SharedMemory] = \
                    [shared_memory.SharedMemory(name=tmp_file_name + "_" + str(order), create=True, size=output.nbytes)
                     for order, output in enumerate(outputs)]
                for segment, output in zip(shared_array, outputs, strict=True):
                    # Copy through a temporary view only: a view that outlives
                    # this statement would block segment.close() later.
                    numpy.ndarray(shape=output.shape, dtype=numpy.float16, buffer=segment.buf)[:] = output[:]
                future = executor.submit(post_processor_shm, tmp_file_name, [output.shape for output in outputs],
                                         chunk_size, [image_size, image_size])
                # Keep the segments alive until the worker has finished, then
                # close and unlink them; previously they were never released
                # explicitly and could be garbage-collected before the worker
                # attached.
                future.add_done_callback(lambda done, segments=shared_array: _finish_batch(done, segments))