From f3bc7e55296a3adfeb5800f201a21ea8b9ec1666 Mon Sep 17 00:00:00 2001 From: yayoimizuha Date: Sat, 7 Dec 2024 16:18:07 +0900 Subject: [PATCH] update --- matrix_infer.py | 2 +- test_script/face_cropper.py | 13 +- test_script/qsv_decode_test.py | 61 ++++++ test_script/qsv_jpeg_decode.py | 248 ++++++++++++++++++++++++ test_script/torchivision_jpeg_decode.py | 146 +++++++++----- 5 files changed, 417 insertions(+), 53 deletions(-) create mode 100644 test_script/qsv_decode_test.py create mode 100644 test_script/qsv_jpeg_decode.py diff --git a/matrix_infer.py b/matrix_infer.py index 7181ee9..804d029 100644 --- a/matrix_infer.py +++ b/matrix_infer.py @@ -14,7 +14,7 @@ from concurrent.futures import ThreadPoolExecutor from pandas import DataFrame from seaborn import heatmap, color_palette, set_palette from matplotlib import pyplot -from japanize_matplotlib import japanize +from matplotlib_fontja import japanize device = device('cuda' if is_available() else 'cpu') # device = 'cpu' diff --git a/test_script/face_cropper.py b/test_script/face_cropper.py index 58f9b5c..eb411ee 100644 --- a/test_script/face_cropper.py +++ b/test_script/face_cropper.py @@ -1,10 +1,16 @@ # import cv2 +import os +# print(os.environ) +for p in os.environ['Path'].split(os.pathsep): + if os.path.isdir(p) and p != ".": + print(p) + os.add_dll_directory(p) + import msgspec from torch import tensor import torch from torchvision.transforms import functional, InterpolationMode from torchvision.io import decode_jpeg -import os # import shutil import numpy from PIL import Image @@ -13,12 +19,13 @@ from more_itertools import chunked from tqdm import tqdm import math -ROOT_DIR = r"D:\helloproject-ai-data\blog_images" -CROPPED_DIR = r"D:\helloproject-ai-data\face_cropped" +ROOT_DIR = r"E:\helloproject-ai-data\blog_images" +CROPPED_DIR = r"E:\helloproject-ai-data\face_cropped" CROP_THRESHOLD = 0.8 inference_size = 640 device = torch.device("cuda") +device = torch.device("xpu") if torch.xpu.is_available() else exit(-1) def calc_rotate(landmark: list[list[float]]) -> tuple[tuple[int, int], float]: diff --git a/test_script/qsv_decode_test.py b/test_script/qsv_decode_test.py new file mode 100644 index 0000000..7004587 --- /dev/null +++ b/test_script/qsv_decode_test.py @@ -0,0 +1,61 @@ +import ctypes +import math +import numpy +from numpy.lib._stride_tricks_impl import as_strided +from matplotlib import pyplot +from test_ext import decode + +with open(file=r"C:\Users\tomokazu\すぐ消す\friends-4385686.jpg", mode="rb") as f: + ptr, height, width, pitch = decode(f.read()) + +pitch_h = math.ceil(height / 2) * 2 +pitch_w = math.ceil(width / 2) * 2 + +print(height, width, pitch) +y_arr = numpy.frombuffer((ctypes.c_uint8 * (pitch_h * pitch)).from_address(ptr), dtype=numpy.uint8, + count=pitch_h * pitch) +print(f"{y_arr=}") +uv_arr = numpy.frombuffer((ctypes.c_uint8 * (int(pitch_h * 1.5) * pitch)).from_address(ptr), + dtype=numpy.uint8, + count=int(pitch_h / 2) * pitch, offset=pitch_h * pitch) +print(f"{uv_arr=}") + +y_plane = as_strided(y_arr, (pitch_h, pitch_w), (pitch, 1)) +uv_plane = as_strided(uv_arr, (int(pitch_h / 2), int(pitch_w / 2), 2), (pitch, 2, 1)) +yuv_plane = numpy.stack([y_plane, + uv_plane[:, :, 0].repeat(2, axis=0).repeat(2, axis=1), + uv_plane[:, :, 1].repeat(2, axis=0).repeat(2, axis=1)]) +# print(y_plane.shape) +# print(y_plane.strides) +# print(uv_plane.shape) +# print(uv_plane.strides) +# print(uv_plane[:, :, 0].shape) +print(yuv_plane.shape) +print(yuv_plane.strides) +# print(yuv_plane[:, : 4, : 4]) +# print(yuv_plane.transpose(1, 2, 0)[:4, :4, :]) +pyplot.figure(figsize=(20, 20), dpi=150) +pyplot.imshow(yuv_plane[0, :, :]) +pyplot.show() +pyplot.close("all") +pyplot.figure(figsize=(20, 20), dpi=150) +pyplot.imshow(yuv_plane[1, :, :]) +pyplot.show() +pyplot.close("all") +pyplot.figure(figsize=(20, 20), dpi=150) +pyplot.imshow(yuv_plane[2, :, :]) +pyplot.show() +pyplot.close("all") +ycbcr_mat = yuv_plane.transpose((1, 2, 0)).reshape((-1, 3)) - [0, 128, 128] +# print(ycbcr_mat) +transform_matrix = numpy.array([ + [1.0, 0.0, 1.5748], + [1.0, -0.1873, -0.4681], + [1.0, 1.8556, 0.0] +]) +rgb_plane = (numpy.clip(numpy.dot(ycbcr_mat, transform_matrix.T), 0, 255) + .reshape(pitch_h, pitch_w, 3).astype(numpy.uint8)) +pyplot.figure(figsize=(20, 20), dpi=150) +pyplot.imshow(rgb_plane) +pyplot.show() +pyplot.close("all") diff --git a/test_script/qsv_jpeg_decode.py b/test_script/qsv_jpeg_decode.py new file mode 100644 index 0000000..39bcc28 --- /dev/null +++ b/test_script/qsv_jpeg_decode.py @@ -0,0 +1,248 @@ +import json +import os +import warnings + +warnings.filterwarnings("ignore", lineno=6, category=UserWarning) +from concurrent.futures.process import ProcessPoolExecutor +from itertools import chain +from multiprocessing import shared_memory +from io import BytesIO +from os import listdir, path, pathsep, makedirs +from pprint import pprint +import more_itertools +import msgspec +import pandas.io.json +import tqdm +from PIL import Image +from uuid import uuid4 +from onnxruntime import InferenceSession, SessionOptions, GraphOptimizationLevel +from torch import tensor +import aiofiles +import numpy +import torch +from torchvision.io import decode_jpeg +from asyncio import run, gather, Semaphore +from site import getsitepackages +from rust_retinaface_post_processor import resnet_post_process + +USE_OPENVINO = True +if USE_OPENVINO: + import openvino + + ov_core = openvino.Core() +os.environ["Path"] = path.join(getsitepackages()[-1], "tensorrt_libs") + pathsep + os.environ["Path"] +root_dir = r"E:\helloproject-ai-data\blog_images" +model_path = r"C:\Users\tomokazu\build\retinaface\retinaface_only_nn_fp16.onnx" +# makedirs("memmap", exist_ok=True) + +files = [] +files_data: dict[str, numpy.ndarray | None] = {} +chunk_size = 16 +image_size = 640 +device = torch.device("cpu") if torch.xpu.is_available() else exit(-1) + + +async def async_read(path: str, semaphore: Semaphore): + async with semaphore: + async with aiofiles.open(file=path, mode="rb") as fp: + return await fp.read() + + +async def gather_runner(l: list, fn): + sem = Semaphore(2048) + return await gather(*[fn(p, sem) for p in l]) + + +# def post_processor(outputs, batch_size, image_size): +# # print("aaa", flush=True) +# outputs = [numpy.ascontiguousarray(output.astype(numpy.float32)) for output in outputs] +# res = resnet_post_process([output.__array_interface__["data"][0] for output in outputs], batch_size, image_size) +# return res +# +# +# def post_processor_memmap(tmp_filename, sizes, batch_size, image_size): # print("aaa", flush=True) outputs = [ +# numpy.memmap(filename=path.join("memmap", tmp_filename + str(order)), dtype=numpy.float16, mode="r", shape=size) +# for order, size in enumerate(sizes)] outputs = [numpy.ascontiguousarray(output.astype(numpy.float32)) for output in +# outputs] res = resnet_post_process([output.__array_interface__["data"][0] for output in outputs], batch_size, +# image_size) return res + + +def post_processor_shm(shm_name, sizes, batch_size, image_size): + shms = [shared_memory.SharedMemory(name=shm_name + "_" + str(i)) for i in range(3)] + outputs = \ + [numpy.ascontiguousarray(numpy.ndarray(shape=size, dtype=numpy.float16, buffer=shm.buf).astype(numpy.float32)) + for size, shm in zip(sizes, shms)] + res = resnet_post_process([output.__array_interface__["data"][0] for output in outputs], batch_size, image_size) + # print(res) + return res + + +def dec_jpg(f, fn): + _decoded_image = tensor(numpy.array(Image.open(BytesIO(f.tobytes()))).transpose([2, 0, 1])) + _decoded_image = _decoded_image.to(device, torch.float16) / 255 + _decoded_image = fn[2](_decoded_image) + _decoded_image_resized = fn[0](_decoded_image) + return fn[1](_decoded_image_resized) + + +if __name__ == '__main__': + from kornia.augmentation import LongestMaxSize, PadTo, Normalize + from kornia.constants import Resample + + longest_max_size = LongestMaxSize(max_size=640, resample=Resample.NEAREST) + pad_to = PadTo(size=(640, 640), pad_value=1.) + normalize = Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) + if USE_OPENVINO: + onnx_model = ov_core.read_model(model_path) + onnx_model.reshape([chunk_size, 3, image_size, image_size]) + onnx_model = ov_core.compile_model(onnx_model, device_name='GPU') + + else: + session_options = SessionOptions() + session_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL + # session_options.optimized_model_filepath = GraphOptimizationLevel = "onnx_cache" + session = InferenceSession( + path_or_bytes=model_path, + providers=[ + ('TensorrtExecutionProvider', { + 'trt_engine_cache_enable': True, + 'trt_engine_cache_path': 'trt_cache', + 'trt_fp16_enable': True, + 'trt_profile_min_shapes': f'input:1x3x{image_size}x{image_size}', + 'trt_profile_max_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}', + 'trt_profile_opt_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}', + }), + ('OpenVINOExecutionProvider', { + 'device_type': 'GPU.0', + 'precision': 'FP16', + 'cache_dir': 'openvino_cache' + }), + 'CUDAExecutionProvider', + 'CPUExecutionProvider' + ], + sess_options=session_options + ) + if os.path.exists("faces.jsonl"): + with open(file="faces.jsonl", mode="r", encoding="utf-8") as fp: + already = {list(msgspec.json.decode(line).keys())[0] for line in fp.read().removesuffix("\n").split("\n")} + else: + already = set() + pbar = tqdm.tqdm( + total=(set().union(*[listdir(path.join(root_dir, name)) for name in listdir(root_dir)]) - already).__len__()) + + # print(len(already)) + # exit(0) + + for name in listdir(root_dir): + with (ProcessPoolExecutor(max_workers=16) as executor): + pbar.set_description_str(desc=name, refresh=True) + if name != "ブログ": + # continue + pass + file_names = listdir(path.join(root_dir, name)) + file_names_set = set(file_names) - already + file_names = list(file_names_set) + name_files = [path.join(root_dir, name, file_name) for file_name in file_names] + files_data = {file_name: numpy.frombuffer(dat, dtype=numpy.uint8) for file_name, dat in + zip(file_names, run(gather_runner(name_files, async_read)))} + if files_data.__len__() == 0: + continue + futures = [] + shms = [] + namess = [] + # print(k_1) + for cnk in more_itertools.chunked(files_data.items(), n=chunk_size): + stack = [] + names = [] + if USE_OPENVINO: + fn_pack = [longest_max_size, pad_to, normalize] + submits = [] + for file, dat in cnk: + submits.append(executor.submit(dec_jpg, dat, fn_pack)) + names.append(file) + for submit in submits: + stack.append(submit.result().squeeze()) + else: + for file, dat in cnk: + try: + decoded_image = decode_jpeg(tensor(dat), device=device) + except: + decoded_image = tensor( + numpy.array(Image.open(BytesIO(dat.tobytes()))).transpose([2, 0, 1])) + decoded_image = decoded_image.to(device, torch.float16) / 255 + decoded_image = normalize(decoded_image) + decoded_image_resized = longest_max_size(decoded_image) + decoded_image_padded = pad_to(decoded_image_resized) + stack.append(decoded_image_padded.squeeze()) + names.append(file) + namess.append(names) + [stack.append(torch.zeros(size=[3, 640, 640], dtype=torch.float16, device=device)) for _ in + range(chunk_size - stack.__len__())] + stacked = torch.stack(stack).contiguous() + # print(stacked.shape) + if USE_OPENVINO: + _outputs = onnx_model([stacked]) + # print(_outputs[onnx_model.output(0)]) + outputs = [_outputs[onnx_model.output(i)] for i in range(2, -1, -1)] + # print(outputs) + else: + io_binding = session.io_binding() + io_binding.bind_input( + name="input", + device_type=stacked.device.type, + device_id=stacked.device.index if stacked.device.index is not None else 0, + element_type='float16', + shape=tuple(stacked.shape), + buffer_ptr=stacked.data_ptr() + ) + io_binding.bind_output("landmark") + io_binding.bind_output("confidence") + io_binding.bind_output("bbox") + session.run_with_iobinding(iobinding=io_binding) + outputs: list[numpy.ndarray] = io_binding.copy_outputs_to_cpu() + + # [numpy.memmap(filename=path.join("memmap", tmp_file_name + str(order)), dtype=numpy.float16, + # mode="w+", shape=output.shape) for order, output in enumerate(outputs)] + uuid = uuid4().__str__() + shared_array: list[shared_memory.SharedMemory] = \ + [shared_memory.SharedMemory(name=uuid + "_" + str(order), create=True, size=output.nbytes) + for order, output in enumerate(outputs)] + shared_ndarray = [numpy.ndarray(shape=output.shape, dtype=numpy.float16, buffer=shm.buf) + for shm, output in zip(shared_array, outputs, strict=True)] + for shm, output in zip(shared_ndarray, outputs, strict=True): + shm[:] = output[:] + future = executor.submit(post_processor_shm, uuid, [output.shape for output in outputs], + chunk_size, [image_size, image_size]) + futures.append(future) + shms.extend(shared_array) + # exit(0) + pbar.update(n=cnk.__len__()) + # result_dict = dict() + with open("faces.jsonl", mode="a", encoding="utf-8") as fp: + futures_results = [future.result() for future in futures] + # pprint(futures_results) + for names, futures_result in zip(namess, futures_results): + + for name, results in zip(names, futures_result): + results_list = [] + if results: + # print(name) + for result in results: + # [print(int(a), end=" ") for a in result[0]] + # print(*result[1], end=" ") + # [print(int(a), end=" ") for a in result[2]] + # print() + # results_list.append(list(chain.from_iterable([result]))) + fp.write( + pandas.io.json.ujson_dumps({name: [result[0], result[1][0], result[2]]}, + ensure_ascii=False, double_precision=5) + "\n") + pass + else: + fp.write( + pandas.io.json.ujson_dumps({name: None}, ensure_ascii=False) + "\n") + + # print(name, []) + pass + # result_dict[name] = results_list + # pprint(result_dict) + [shm.close() for shm in shms] diff --git a/test_script/torchivision_jpeg_decode.py b/test_script/torchivision_jpeg_decode.py index b922361..39bcc28 100644 --- a/test_script/torchivision_jpeg_decode.py +++ b/test_script/torchivision_jpeg_decode.py @@ -1,5 +1,8 @@ import json import os +import warnings + +warnings.filterwarnings("ignore", lineno=6, category=UserWarning) from concurrent.futures.process import ProcessPoolExecutor from itertools import chain from multiprocessing import shared_memory @@ -12,7 +15,7 @@ import pandas.io.json import tqdm from PIL import Image from uuid import uuid4 -from onnxruntime import InferenceSession +from onnxruntime import InferenceSession, SessionOptions, GraphOptimizationLevel from torch import tensor import aiofiles import numpy @@ -22,16 +25,21 @@ from asyncio import run, gather, Semaphore from site import getsitepackages from rust_retinaface_post_processor import resnet_post_process +USE_OPENVINO = True +if USE_OPENVINO: + import openvino + + ov_core = openvino.Core() os.environ["Path"] = path.join(getsitepackages()[-1], "tensorrt_libs") + pathsep + os.environ["Path"] -root_dir = r"D:\helloproject-ai-data\blog_images" +root_dir = r"E:\helloproject-ai-data\blog_images" model_path = r"C:\Users\tomokazu\build\retinaface\retinaface_only_nn_fp16.onnx" # makedirs("memmap", exist_ok=True) files = [] files_data: dict[str, numpy.ndarray | None] = {} -chunk_size = 32 +chunk_size = 16 image_size = 640 -device = torch.device("cuda") +device = torch.device("cpu") if torch.xpu.is_available() else exit(-1) async def async_read(path: str, semaphore: Semaphore): @@ -69,6 +77,14 @@ def post_processor_shm(shm_name, sizes, batch_size, image_size): return res +def dec_jpg(f, fn): + _decoded_image = tensor(numpy.array(Image.open(BytesIO(f.tobytes()))).transpose([2, 0, 1])) + _decoded_image = _decoded_image.to(device, torch.float16) / 255 + _decoded_image = fn[2](_decoded_image) + _decoded_image_resized = fn[0](_decoded_image) + return fn[1](_decoded_image_resized) + + if __name__ == '__main__': from kornia.augmentation import LongestMaxSize, PadTo, Normalize from kornia.constants import Resample @@ -76,24 +92,41 @@ if __name__ == '__main__': longest_max_size = LongestMaxSize(max_size=640, resample=Resample.NEAREST) pad_to = PadTo(size=(640, 640), pad_value=1.) normalize = Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) + if USE_OPENVINO: + onnx_model = ov_core.read_model(model_path) + onnx_model.reshape([chunk_size, 3, image_size, image_size]) + onnx_model = ov_core.compile_model(onnx_model, device_name='GPU') - session = InferenceSession( - path_or_bytes=model_path, - providers=[ - ('TensorrtExecutionProvider', { - 'trt_engine_cache_enable': True, - 'trt_engine_cache_path': 'trt_cache', - 'trt_fp16_enable': True, - 'trt_profile_min_shapes': f'input:1x3x{image_size}x{image_size}', - 'trt_profile_max_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}', - 'trt_profile_opt_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}', - }), - 'CUDAExecutionProvider', - 'CPUExecutionProvider' - ] - ) - with open(file="faces.jsonl", mode="r", encoding="utf-8") as fp: - already = {list(msgspec.json.decode(line).keys())[0] for line in fp.read().removesuffix("\n").split("\n")} + else: + session_options = SessionOptions() + session_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL + # session_options.optimized_model_filepath = GraphOptimizationLevel = "onnx_cache" + session = InferenceSession( + path_or_bytes=model_path, + providers=[ + ('TensorrtExecutionProvider', { + 'trt_engine_cache_enable': True, + 'trt_engine_cache_path': 'trt_cache', + 'trt_fp16_enable': True, + 'trt_profile_min_shapes': f'input:1x3x{image_size}x{image_size}', + 'trt_profile_max_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}', + 'trt_profile_opt_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}', + }), + ('OpenVINOExecutionProvider', { + 'device_type': 'GPU.0', + 'precision': 'FP16', + 'cache_dir': 'openvino_cache' + }), + 'CUDAExecutionProvider', + 'CPUExecutionProvider' + ], + sess_options=session_options + ) + if os.path.exists("faces.jsonl"): + with open(file="faces.jsonl", mode="r", encoding="utf-8") as fp: + already = {list(msgspec.json.decode(line).keys())[0] for line in fp.read().removesuffix("\n").split("\n")} + else: + already = set() pbar = tqdm.tqdm( total=(set().union(*[listdir(path.join(root_dir, name)) for name in listdir(root_dir)]) - already).__len__()) @@ -101,7 +134,7 @@ if __name__ == '__main__': # exit(0) for name in listdir(root_dir): - with (ProcessPoolExecutor(max_workers=4) as executor): + with (ProcessPoolExecutor(max_workers=16) as executor): pbar.set_description_str(desc=name, refresh=True) if name != "ブログ": # continue @@ -121,38 +154,53 @@ if __name__ == '__main__': for cnk in more_itertools.chunked(files_data.items(), n=chunk_size): stack = [] names = [] - for file, dat in cnk: - try: - decoded_image = decode_jpeg(tensor(dat), device=device) - except: - decoded_image = tensor( - numpy.array(Image.open(BytesIO(dat.tobytes()))).transpose([2, 0, 1])).to( - device) - decoded_image = decoded_image.to(torch.float16) / 255 - decoded_image = normalize(decoded_image) - decoded_image_resized = longest_max_size(decoded_image) - decoded_image_padded = pad_to(decoded_image_resized) - stack.append(decoded_image_padded.squeeze()) - names.append(file) + if USE_OPENVINO: + fn_pack = [longest_max_size, pad_to, normalize] + submits = [] + for file, dat in cnk: + submits.append(executor.submit(dec_jpg, dat, fn_pack)) + names.append(file) + for submit in submits: + stack.append(submit.result().squeeze()) + else: + for file, dat in cnk: + try: + decoded_image = decode_jpeg(tensor(dat), device=device) + except: + decoded_image = tensor( + numpy.array(Image.open(BytesIO(dat.tobytes()))).transpose([2, 0, 1])) + decoded_image = decoded_image.to(device, torch.float16) / 255 + decoded_image = normalize(decoded_image) + decoded_image_resized = longest_max_size(decoded_image) + decoded_image_padded = pad_to(decoded_image_resized) + stack.append(decoded_image_padded.squeeze()) + names.append(file) namess.append(names) [stack.append(torch.zeros(size=[3, 640, 640], dtype=torch.float16, device=device)) for _ in range(chunk_size - stack.__len__())] stacked = torch.stack(stack).contiguous() # print(stacked.shape) - io_binding = session.io_binding() - io_binding.bind_input( - name="input", - device_type=stacked.device.type, - device_id=stacked.device.index, - element_type='float16', - shape=tuple(stacked.shape), - buffer_ptr=stacked.data_ptr() - ) - io_binding.bind_output("landmark") - io_binding.bind_output("confidence") - io_binding.bind_output("bbox") - session.run_with_iobinding(iobinding=io_binding) - outputs: list[numpy.ndarray] = io_binding.copy_outputs_to_cpu() + if USE_OPENVINO: + _outputs = onnx_model([stacked]) + # print(_outputs[onnx_model.output(0)]) + outputs = [_outputs[onnx_model.output(i)] for i in range(2, -1, -1)] + # print(outputs) + else: + io_binding = session.io_binding() + io_binding.bind_input( + name="input", + device_type=stacked.device.type, + device_id=stacked.device.index if stacked.device.index is not None else 0, + element_type='float16', + shape=tuple(stacked.shape), + buffer_ptr=stacked.data_ptr() + ) + io_binding.bind_output("landmark") + io_binding.bind_output("confidence") + io_binding.bind_output("bbox") + session.run_with_iobinding(iobinding=io_binding) + outputs: list[numpy.ndarray] = io_binding.copy_outputs_to_cpu() + # [numpy.memmap(filename=path.join("memmap", tmp_file_name + str(order)), dtype=numpy.float16, # mode="w+", shape=output.shape) for order, output in enumerate(outputs)] uuid = uuid4().__str__()