This commit is contained in:
yayoimizuha 2024-08-17 21:07:55 +09:00
parent 435adff46b
commit 70bb7fa5ba
2 changed files with 428 additions and 0 deletions

275
batch_retinaface.py Normal file
View File

@ -0,0 +1,275 @@
import time
import cupy_backends.cuda.api.runtime
import cv2
from numba import njit, prange, types, int64, typed
from simple_decode import decode
from numpy import fromfile, uint8, float32
import numpy as np
from cupy.cuda import UnownedMemory, MemoryPointer, set_allocator, stream
from cupy import ndarray, stack, pad, ascontiguousarray, asnumpy
from cupyx.scipy.ndimage import zoom
from torch import Tensor, from_dlpack
from os import listdir, path
from more_itertools import chunked
from onnxruntime import InferenceSession
import math
from matplotlib import pyplot
from PIL import Image
# Called with no argument: per the CuPy documentation this selects the raw
# device allocator, i.e. CuPy's memory pool is disabled.
set_allocator()
# Side length of the square canvas that resize_image() pads every image to.
IMAGE_SIZE = 640
# Number of files grouped into one chunk by the driver loop, and the number of
# items post_loop() iterates over.
BATCH_SIZE = 1
def resize_image(image: ndarray, scales: list[float], stz: list[stream.Event], stream: stream.Stream) -> ndarray:
    """Shrink *image* to fit the IMAGE_SIZE canvas and pad it to a square.

    The image is indexed as a 3-axis array whose axes 1 and 2 are the spatial
    ones. When either spatial axis exceeds IMAGE_SIZE, both are zoomed by
    ``(IMAGE_SIZE - 1) / longest_side``; otherwise the image is left as is.
    The result is padded at the end of axes 1 and 2 up to IMAGE_SIZE.

    Args:
        image: Array to resize (axis 0 is never scaled or padded).
        scales: Output list; the scale factor that was applied is appended.
        stz: Output list; an event recorded on *stream* is appended.
        stream: Stream used as the current stream while the work is issued.

    Returns:
        The padded array with spatial shape IMAGE_SIZE x IMAGE_SIZE.

    NOTE(review): the original indentation was lost, so whether the two
    ``append`` calls sit inside the ``with`` block is a reconstruction.
    """
    with stream:
        _, height, width = image.shape
        longest_side = max(height, width)
        if longest_side > IMAGE_SIZE:
            scale = (IMAGE_SIZE - 1) / longest_side
            image = zoom(image, (1, scale, scale), mode="constant")
        else:
            scale = 1.0
        pad_bottom = IMAGE_SIZE - image.shape[1]
        pad_right = IMAGE_SIZE - image.shape[2]
        ret = pad(image, pad_width=[(0, 0), (0, pad_bottom), (0, pad_right)])
        scales.append(scale)
        stz.append(stream.record())
    return ret
# @njit
def prior_box(min_sizes: list[list[int]], steps: list[int], clip: bool, image_sizes: list[int]) -> np.ndarray:
    """Build the anchor boxes for every cell of every feature map.

    For each stride ``steps[k]`` the feature map has
    ``ceil(image_sizes[0] / step)`` rows and ``ceil(image_sizes[1] / step)``
    columns. Every cell emits one anchor per entry of ``min_sizes[k]``, laid
    out as ``[cx, cy, w, h]`` where the x/width values are divided by
    ``image_sizes[1]`` and the y/height values by ``image_sizes[0]``.

    Args:
        min_sizes: Anchor side lengths, one list per stride.
        steps: Feature-map strides.
        clip: When true, clamp every value of the result to ``[0, 1]``.
        image_sizes: ``[size_along_rows, size_along_columns]`` of the input.

    Returns:
        Array of shape ``(num_anchors, 4)``; ``(0, 4)`` when there are none.
    """
    size_rows, size_cols = image_sizes[0], image_sizes[1]
    anchors = []
    for k, step in enumerate(steps):
        level_sizes = min_sizes[k]
        n_rows = math.ceil(size_rows / step)
        n_cols = math.ceil(size_cols / step)
        for i in range(n_rows):
            cy = (i + 0.5) * step / size_rows
            for j in range(n_cols):
                cx = (j + 0.5) * step / size_cols
                for min_size in level_sizes:
                    anchors.append([cx, cy, min_size / size_cols, min_size / size_rows])
    # reshape keeps the result two-dimensional even when no anchor was produced.
    output = np.array(anchors, dtype=np.float64).reshape(-1, 4)
    if clip:
        # ndarray.clip returns a new array; the result must be kept.
        output = output.clip(0.0, 1.0)
    return output
# @njit
def loc_decode(loc: np.ndarray, priors: np.ndarray, variances: list[float]):
    """Turn box regression offsets into corner-form boxes.

    Args:
        loc: ``(N, 4)`` offsets; columns 0-1 shift the centre, 2-3 scale the size.
        priors: ``(N, 4)`` anchors as ``[cx, cy, w, h]``.
        variances: ``[centre_variance, size_variance]``.

    Returns:
        ``(N, 4)`` array of ``[x1, y1, x2, y2]``.
    """
    centres = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
    sizes = priors[:, 2:] * np.exp(loc[:, 2:] * variances[1])
    # Convert centre/size form to the two opposite corners.
    top_left = centres - sizes / 2
    bottom_right = sizes + top_left
    return np.concatenate((top_left, bottom_right), 1)
# @njit
def decode_landm(pre: np.ndarray, priors: np.ndarray, variances: list[float]):
    """Turn landmark regression offsets into coordinates.

    Args:
        pre: ``(N, 10)`` offsets, five consecutive ``(x, y)`` pairs per row.
        priors: ``(N, 4)`` anchors as ``[cx, cy, w, h]``.
        variances: Only ``variances[0]`` is used, as the offset multiplier.

    Returns:
        ``(N, 10)`` array holding the five decoded ``(x, y)`` pairs.
    """
    anchor_centre = priors[:, :2]
    anchor_size = priors[:, 2:]
    points = [
        anchor_centre + pre[:, 2 * k:2 * k + 2] * variances[0] * anchor_size
        for k in range(5)
    ]
    return np.concatenate(points, axis=1)
# @njit
def py_cpu_nms(dets: np.ndarray, thresh: float):
    """Greedy non-maximum suppression.

    Args:
        dets: ``(N, 5)`` rows of ``[x1, y1, x2, y2, score]``.
        thresh: A candidate is dropped when its overlap ratio with an already
            kept, higher-scoring box is greater than this value.

    Returns:
        List of kept row indices, highest score first.
    """
    left, top, right, bottom, scores = (dets[:, col] for col in range(5))
    # Box extents use the inclusive "+ 1" convention throughout.
    areas = (right - left + 1) * (bottom - top + 1)
    remaining = scores.argsort()[::-1]
    keep = []
    while remaining.size > 0:
        best, rest = remaining[0], remaining[1:]
        keep.append(best)
        inter_w = np.maximum(0.0, np.minimum(right[best], right[rest]) - np.maximum(left[best], left[rest]) + 1)
        inter_h = np.maximum(0.0, np.minimum(bottom[best], bottom[rest]) - np.maximum(top[best], top[rest]) + 1)
        inter = inter_w * inter_h
        overlap = inter / (areas[best] + areas[rest] - inter)
        remaining = rest[overlap <= thresh]
    return keep
# @njit
def softmax(x, axis=-1):
    """Return the softmax of *x* along *axis*.

    The maximum along *axis* is subtracted before exponentiating so that large
    inputs do not overflow.
    """
    peak = np.max(x, axis=axis, keepdims=True)
    weights = np.exp(x - peak)
    total = np.sum(weights, axis=axis, keepdims=True)
    return weights / total
# @njit
def post_process(landms: np.ndarray, conf: np.ndarray, loc: np.ndarray, image_sizes: list[int], resize_scale: float,
                 confidence_threshold: float, top_k: int, nms_threshold: float, keep_top_k: int) -> np.ndarray:
    """Decode the raw network outputs of one image into detections.

    Steps: build the anchors, decode boxes and landmarks, rescale them by the
    image size and divide by *resize_scale*, keep scores above
    *confidence_threshold*, keep the *top_k* best, run NMS, keep at most
    *keep_top_k* rows.

    Args:
        landms: ``(N, 10)`` landmark offsets.
        conf: ``(N, C)`` class logits; column 1 (after softmax) is the score.
        loc: ``(N, 4)`` box offsets.
        image_sizes: ``[size_along_rows, size_along_columns]`` of the network input.
        resize_scale: Factor the source image was scaled by before inference.
        confidence_threshold: Minimum score (exclusive) for a candidate.
        top_k: Maximum number of candidates passed to NMS.
        nms_threshold: Overlap threshold given to ``py_cpu_nms``.
        keep_top_k: Maximum number of rows returned.

    Returns:
        ``(M, 15)`` array: 4 box corners, 1 score, 10 landmark coordinates.
    """
    variances = [0.1, 0.2]
    size_rows, size_cols = image_sizes[0], image_sizes[1]
    priors = prior_box(min_sizes=[[16, 32], [64, 128], [256, 512]], steps=[8, 16, 32], clip=False,
                       image_sizes=image_sizes)

    # Decoded values are relative to the input; scale to pixels, then undo the resize.
    boxes = loc_decode(loc, priors, variances) * np.array([size_cols, size_rows] * 2) / resize_scale
    scores = softmax(conf)[:, 1]
    landms = decode_landm(landms, priors, variances) * np.array([size_cols, size_rows] * 5) / resize_scale

    # Confidence filter followed by a descending-score cut at top_k.
    confident = np.where(scores > confidence_threshold)[0]
    ranked = confident[scores[confident].argsort()[::-1][:top_k]]
    boxes, landms, scores = boxes[ranked], landms[ranked], scores[ranked]

    dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False)
    keep = py_cpu_nms(dets, nms_threshold)
    dets = dets[keep, :][:keep_top_k, :]
    landms = landms[keep][:keep_top_k, :]
    return np.concatenate((dets, landms), axis=1)
# @njit(parallel=True)
def post_loop(landms: np.ndarray, conf: np.ndarray, loc: np.ndarray, resize_scales: list[float]):
    """Run ``post_process`` on each of the first BATCH_SIZE items of a batch.

    Args:
        landms: Batched landmark offsets, indexed ``[item, anchor, value]``.
        conf: Batched class logits, indexed the same way.
        loc: Batched box offsets, indexed the same way.
        resize_scales: Per-item scale factors applied before inference.

    Returns:
        List with one detection array per batch item.
    """
    return [
        post_process(
            landms=landms[item, :, :],
            conf=conf[item, :, :],
            loc=loc[item, :, :],
            image_sizes=[IMAGE_SIZE, IMAGE_SIZE],
            resize_scale=resize_scales[item],
            confidence_threshold=0.4,
            top_k=5000,
            nms_threshold=0.4,
            keep_top_k=750,
        )
        for item in range(BATCH_SIZE)
    ]
# Directory scanned for images; the driver below lists its sub-directories by name.
root_dir = r"D:\helloproject-ai-data\blog_images"
# ONNX model file; only referenced by the commented-out InferenceSession setup below.
model_path = r"C:\Users\tomokazu\build\retinaface\retinaface_only_nn.onnx"
# session = InferenceSession(
# path_or_bytes=model_path,
# providers=[
# ('TensorrtExecutionProvider', {
# 'trt_engine_cache_enable': True,
# 'trt_engine_cache_path': 'trt_cache',
# 'trt_fp16_enable': True,
# 'trt_profile_min_shapes': f'input:1x3x{IMAGE_SIZE}x{IMAGE_SIZE}',
# 'trt_profile_max_shapes': f'input:{BATCH_SIZE}x3x{IMAGE_SIZE}x{IMAGE_SIZE}',
# 'trt_profile_opt_shapes': f'input:{BATCH_SIZE}x3x{IMAGE_SIZE}x{IMAGE_SIZE}',
# }),
# 'CUDAExecutionProvider',
# 'CPUExecutionProvider'
# ]
# )
# from matplotlib import pyplot
# Collect the full path of every file inside the single selected sub-directory.
files = []
for name in listdir(root_dir):
    if name == "真野恵里菜":
        member_dir = path.join(root_dir, name)
        files.extend(path.join(member_dir, file_name) for file_name in listdir(member_dir))
# files = files[:32]
# Driver loop: decode each chunk of files on the GPU, resize/pad the images to
# the IMAGE_SIZE canvas and stack them into one batch. The inference and
# drawing steps are still disabled (kept below as comments).
for file_chunk in chunked(files, BATCH_SIZE):
    stacks = []     # decoded images, one (3, height, width) array per file
    ptrs = []       # raw device pointers returned by decode(); freed in `finally`
    filenames = []  # paths aligned index-for-index with `stacks`
    stz = []        # events recorded on the per-file decode streams
    try:
        decode_streams = [stream.Stream() for _ in range(BATCH_SIZE)]
        for st1, file in zip(decode_streams, file_chunk):
            with st1:
                start = time.time()
                _input = fromfile(file=file, dtype=uint8)
                if _input.shape[0] == 0:
                    # Empty file: nothing to decode.
                    continue
                try:
                    ptr, (_, (width, height)) = decode(_input, st1.ptr)
                except Exception as exc:
                    # Skip files the decoder rejects instead of re-using the
                    # previous iteration's image (or hitting a NameError).
                    print(f"skipping {file}: decode failed ({exc!r})")
                    continue
                ptrs.append(ptr)
                # NOTE(review): the buffer is viewed as float32 below, so its
                # byte size is computed with the float32 item size; confirm
                # that decode() really produces float32 samples.
                n_bytes = height * width * 3 * float32().itemsize
                unownedmemory = UnownedMemory(ptr, n_bytes, None)
                gpu_arr = ndarray((height * width * 3,), dtype=float32, memptr=MemoryPointer(unownedmemory, 0))
                gpu_image: ndarray = gpu_arr.reshape((3, height, width))
                stacks.append(gpu_image)
                # Recorded only after a successful decode so that
                # filenames[i] always belongs to stacks[i].
                filenames.append(file)
                print(file, gpu_image.shape, time.time() - start, sep='\t')
                stz.append(st1.record())
        for event in stz:
            event.synchronize()
        if not stacks:
            # Every file in the chunk was empty or undecodable.
            continue
        missing = BATCH_SIZE - len(stacks)
        if missing > 0:
            # Pad a short chunk with blank float32 images so the batch keeps
            # its dtype and holds defined values.
            filler = ndarray((3, IMAGE_SIZE, IMAGE_SIZE), dtype=float32)
            filler.fill(0)
            stacks.extend([filler] * missing)
        resize_scales = []
        stz = []
        resized = [resize_image(image, resize_scales, stz, stream.Stream()) for image in stacks]
        # Wait for the per-image resize streams before combining their outputs.
        for event in stz:
            event.synchronize()
        stacked_images = stack(resized)
        print(stacked_images.shape)
        contiguous_stacked = ascontiguousarray(stacked_images)
        # io_binding = session.io_binding()
        # io_binding.bind_input(
        #     name="input",
        #     device_type='cuda',
        #     device_id=stacked_images.device,
        #     element_type=float32,
        #     shape=tuple(stacked_images.shape),
        #     buffer_ptr=contiguous_stacked.data.ptr
        # )
        # io_binding.bind_output("landmark")
        # io_binding.bind_output("confidence")
        # io_binding.bind_output("bbox")
        # session.run_with_iobinding(iobinding=io_binding)
        # landms, conf, loc = io_binding.copy_outputs_to_cpu()
        # start = time.time()
        # detected = post_loop(landms, conf, loc, resize_scales)
        # print(f"post process time: {time.time() - start}")
        # for (i, filename, gpu_image) in zip(detected, filenames, stacks):
        #     print(f'"{filename}"')
        #     host_image = (asnumpy(gpu_image).transpose((1, 2, 0)) * 255).astype(uint8).copy()
        #     print(host_image.shape)
        #     for j in i.tolist():
        #         j = [int(_j) for _j in j]
        #         cv2.rectangle(host_image, (j[0], j[1]), (j[2], j[3]), (255, 0, 0), 2, cv2.LINE_AA)
        #         cv2.putText(host_image, str(j[4]), (0, 0), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 0, 0))
        #         for k in range(5):
        #             cv2.circle(host_image, (j[5 + k * 2], j[5 + k * 2 + 1]), 2, (255, 0, 0))
        #
        #         print(j)
        #     # pyplot.imshow(host_image)
        #     # pyplot.show()
        #     # time.sleep(.5)
    finally:
        # Release the decoder's device buffers even when the chunk failed.
        for ptr in ptrs:
            cupy_backends.cuda.api.runtime.free(ptr)
# host_im = gpu_image.get()
# pyplot.imshow(host_im.transpose((1, 2, 0)))
# pyplot.show()

View File

@ -0,0 +1,153 @@
import os
from concurrent.futures.process import ProcessPoolExecutor
from multiprocessing import shared_memory
from io import BytesIO
from os import listdir, path, pathsep, makedirs
from pprint import pprint
import more_itertools
import numpy as np
import tqdm
from PIL import Image
from numpy import ndarray
from onnxruntime import InferenceSession
from torch import tensor
import aiofiles
import numpy
import torch
from torchvision.io import decode_jpeg
from asyncio import run, gather, Semaphore
from site import getsitepackages
from rust_retinaface_post_processor import resnet_post_process
# Prepend "<last site-packages dir>/tensorrt_libs" to the "Path" environment
# variable. NOTE(review): presumably so the TensorRT libraries can be located
# at load time; reading os.environ["Path"] raises KeyError where it is unset.
os.environ["Path"] = path.join(getsitepackages()[-1], "tensorrt_libs") + pathsep + os.environ["Path"]
# Directory whose sub-directories are scanned for images.
root_dir = r"D:\helloproject-ai-data\blog_images"
# ONNX model file loaded by the InferenceSession in the __main__ block.
model_path = r"C:\Users\tomokazu\build\retinaface\retinaface_only_nn_fp16.onnx"
# makedirs("memmap", exist_ok=True)
files = []
# Maps a file name to its raw bytes as a uint8 array; rebuilt per directory in __main__.
files_data: dict[str, numpy.ndarray | None] = {}
# Number of images per inference batch (also the TensorRT max/opt batch size).
chunk_size = 32
# Side length of the square network input.
image_size = 640
# Device used for JPEG decoding and pre-processing.
device = torch.device("cuda")
async def async_read(path: str, semaphore: Semaphore):
    """Read the whole file at *path* as bytes.

    Args:
        path: File to read (opened in binary mode).
        semaphore: Held for the duration of the read to cap concurrent reads.

    Returns:
        The file content as ``bytes``.
    """
    async with semaphore, aiofiles.open(file=path, mode="rb") as fp:
        content = await fp.read()
    return content
async def gather_runner(l: list, fn):
    """Run ``fn(item, semaphore)`` for every item of *l* concurrently.

    All calls share one semaphore with 2048 slots.

    Args:
        l: Items to process.
        fn: Coroutine function taking ``(item, semaphore)``.

    Returns:
        List of results in the order of *l*.
    """
    limiter = Semaphore(2048)
    pending = [fn(item, limiter) for item in l]
    return await gather(*pending)
def post_processor(outputs, batch_size, image_size):
    """Hand the network outputs to ``resnet_post_process`` by address.

    Each output is converted to a contiguous float32 array and the address of
    its data buffer is passed on.

    Args:
        outputs: Iterable of NumPy arrays.
        batch_size: Forwarded unchanged.
        image_size: Forwarded unchanged.

    Returns:
        Whatever ``resnet_post_process`` returns.
    """
    # `as_float32` must stay referenced until the call returns: only raw
    # addresses are passed, so the arrays themselves keep the memory alive.
    as_float32 = [numpy.ascontiguousarray(array.astype(numpy.float32)) for array in outputs]
    addresses = [array.__array_interface__["data"][0] for array in as_float32]
    return resnet_post_process(addresses, batch_size, image_size)
def post_processor_memmap(tmp_filename, sizes, batch_size, image_size):
    """Post-process network outputs stored as float16 memory-mapped files.

    The i-th output is read from ``memmap/<tmp_filename><i>`` with shape
    ``sizes[i]``, converted to contiguous float32 and passed to
    ``resnet_post_process`` by data address.

    Args:
        tmp_filename: Common file-name prefix of the mapped files.
        sizes: Shape of each stored output, in order.
        batch_size: Forwarded unchanged.
        image_size: Forwarded unchanged.

    Returns:
        Whatever ``resnet_post_process`` returns.
    """
    mapped = []
    for order, size in enumerate(sizes):
        file_path = path.join("memmap", tmp_filename + str(order))
        mapped.append(numpy.memmap(filename=file_path, dtype=numpy.float16, mode="r", shape=size))
    # Keep the float32 copies referenced while their addresses are in use.
    as_float32 = [numpy.ascontiguousarray(array.astype(numpy.float32)) for array in mapped]
    addresses = [array.__array_interface__["data"][0] for array in as_float32]
    return resnet_post_process(addresses, batch_size, image_size)
def post_processor_shm(shm_name, sizes, batch_size, image_size):
    """Post-process network outputs passed through shared memory.

    The i-th output is read from the existing segment ``<shm_name>_<i>`` as a
    float16 array of shape ``sizes[i]``, copied to contiguous float32 and
    passed to ``resnet_post_process`` by data address.

    Args:
        shm_name: Common name prefix of the shared-memory segments.
        sizes: Shape of each stored output; one segment is opened per entry.
        batch_size: Forwarded unchanged.
        image_size: Forwarded unchanged.

    Returns:
        Whatever ``resnet_post_process`` returns.

    Raises:
        FileNotFoundError: If one of the named segments does not exist.
    """
    shms = []
    try:
        # Open one segment per expected output instead of a fixed count of 3.
        for index in range(len(sizes)):
            shms.append(shared_memory.SharedMemory(name=shm_name + "_" + str(index)))
        # astype() copies out of the shared buffer, so no view into shm.buf
        # outlives this expression and the segments can be closed afterwards.
        outputs = [
            numpy.ascontiguousarray(
                numpy.ndarray(shape=size, dtype=numpy.float16, buffer=shm.buf).astype(numpy.float32))
            for size, shm in zip(sizes, shms)]
        return resnet_post_process([output.__array_interface__["data"][0] for output in outputs],
                                   batch_size, image_size)
    finally:
        # Close this process's handles even when opening or processing failed.
        for shm in shms:
            shm.close()
# Script entry point: for every sub-directory of root_dir, read all files,
# decode and pre-process them on the GPU in batches of chunk_size, run the
# ONNX model and hand the outputs to worker processes through shared memory.
if __name__ == '__main__':
    from kornia.augmentation import LongestMaxSize, PadTo, Normalize
    from kornia.constants import Resample

    # Pre-processing ops sized from image_size so they stay in sync with the
    # TensorRT shape profile configured below.
    longest_max_size = LongestMaxSize(max_size=image_size, resample=Resample.NEAREST)
    pad_to = PadTo(size=(image_size, image_size), pad_value=1.)
    normalize = Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    session = InferenceSession(
        path_or_bytes=model_path,
        providers=[
            ('TensorrtExecutionProvider', {
                'trt_engine_cache_enable': True,
                'trt_engine_cache_path': 'trt_cache',
                'trt_fp16_enable': True,
                'trt_profile_min_shapes': f'input:1x3x{image_size}x{image_size}',
                'trt_profile_max_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}',
                'trt_profile_opt_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}',
            }),
            'CUDAExecutionProvider',
            'CPUExecutionProvider'
        ]
    )

    def _report_worker_failure(done_future):
        """Print the exception of a finished post-processing task, if any."""
        error = done_future.exception()
        if error is not None:
            print(f"post-processing failed: {error!r}", flush=True)

    for name in listdir(root_dir):
        member_dir = path.join(root_dir, name)
        if not path.isdir(member_dir):
            # Stray files next to the per-name directories cannot be listed.
            continue
        # Leaving the `with` block waits for this directory's submitted tasks.
        with ProcessPoolExecutor(max_workers=4) as executor:
            file_names = listdir(member_dir)
            name_files = [path.join(member_dir, file_name) for file_name in file_names]
            files_data = {file_name: numpy.frombuffer(dat, dtype=numpy.uint8) for file_name, dat in
                          zip(file_names, run(gather_runner(name_files, async_read)))}
            for cnk in more_itertools.chunked(tqdm.tqdm(files_data.items(), desc=name), n=chunk_size):
                stack = []
                tmp_file_name = ""
                for file, dat in cnk:
                    # The last file name of the chunk names its shared-memory segments.
                    tmp_file_name = file
                    try:
                        decoded_image = decode_jpeg(tensor(dat), device=device)
                    except Exception:
                        # Fall back to PIL when decode_jpeg rejects the data.
                        # Converting to RGB guarantees an (H, W, 3) array for
                        # the transpose below.
                        rgb_image = Image.open(BytesIO(dat.tobytes())).convert("RGB")
                        decoded_image = tensor(numpy.array(rgb_image).transpose([2, 0, 1])).to(device)
                    decoded_image = decoded_image.to(torch.float16) / 255
                    decoded_image = normalize(decoded_image)
                    decoded_image_resized = longest_max_size(decoded_image)
                    decoded_image_padded = pad_to(decoded_image_resized)
                    stack.append(decoded_image_padded.squeeze())
                # Pad a short final chunk with blank images up to chunk_size.
                stack.extend(
                    torch.zeros(size=[3, image_size, image_size], dtype=torch.float16, device=device)
                    for _ in range(chunk_size - len(stack)))
                stacked = torch.stack(stack).contiguous()
                io_binding = session.io_binding()
                io_binding.bind_input(
                    name="input",
                    device_type=stacked.device.type,
                    device_id=stacked.device.index,
                    element_type='float16',
                    shape=tuple(stacked.shape),
                    buffer_ptr=stacked.data_ptr()
                )
                io_binding.bind_output("landmark")
                io_binding.bind_output("confidence")
                io_binding.bind_output("bbox")
                session.run_with_iobinding(iobinding=io_binding)
                outputs: list[numpy.ndarray] = io_binding.copy_outputs_to_cpu()
                # NOTE(review): these segments are never closed or unlinked
                # explicitly, and the references are replaced on the next
                # iteration while a worker may not have opened them yet;
                # confirm the intended lifetime.
                shared_array: list[shared_memory.SharedMemory] = \
                    [shared_memory.SharedMemory(name=tmp_file_name + "_" + str(order), create=True,
                                                size=output.nbytes)
                     for order, output in enumerate(outputs)]
                shared_ndarray = [numpy.ndarray(shape=output.shape, dtype=numpy.float16, buffer=shm.buf)
                                  for shm, output in zip(shared_array, outputs, strict=True)]
                for shared_view, output in zip(shared_ndarray, outputs, strict=True):
                    shared_view[:] = output[:]
                future = executor.submit(post_processor_shm, tmp_file_name, [output.shape for output in outputs],
                                         chunk_size, [image_size, image_size])
                # The result is not consumed yet; at least surface worker errors.
                future.add_done_callback(_report_worker_failure)