Compare commits

..

No commits in common. "master" and "2024-09-11" have entirely different histories.

5 changed files with 53 additions and 456 deletions

View File

@ -14,7 +14,7 @@ from concurrent.futures import ThreadPoolExecutor
from pandas import DataFrame from pandas import DataFrame
from seaborn import heatmap, color_palette, set_palette from seaborn import heatmap, color_palette, set_palette
from matplotlib import pyplot from matplotlib import pyplot
from matplotlib_fontja import japanize from japanize_matplotlib import japanize
device = device('cuda' if is_available() else 'cpu') device = device('cuda' if is_available() else 'cpu')
# device = 'cpu' # device = 'cpu'

View File

@ -1,16 +1,10 @@
# import cv2 # import cv2
import os
# print(os.environ)
for p in os.environ['Path'].split(os.pathsep):
if os.path.isdir(p) and p != ".":
print(p)
os.add_dll_directory(p)
import msgspec import msgspec
from torch import tensor from torch import tensor
import torch import torch
from torchvision.transforms import functional, InterpolationMode from torchvision.transforms import functional, InterpolationMode
from torchvision.io import decode_jpeg from torchvision.io import decode_jpeg
import os
# import shutil # import shutil
import numpy import numpy
from PIL import Image from PIL import Image
@ -19,13 +13,12 @@ from more_itertools import chunked
from tqdm import tqdm from tqdm import tqdm
import math import math
ROOT_DIR = r"E:\helloproject-ai-data\blog_images" ROOT_DIR = r"D:\helloproject-ai-data\blog_images"
CROPPED_DIR = r"E:\helloproject-ai-data\face_cropped" CROPPED_DIR = r"D:\helloproject-ai-data\face_cropped"
CROP_THRESHOLD = 0.8 CROP_THRESHOLD = 0.8
inference_size = 640 inference_size = 640
device = torch.device("cuda") device = torch.device("cuda")
device = torch.device("xpu") if torch.xpu.is_available() else exit(-1)
def calc_rotate(landmark: list[list[float]]) -> tuple[tuple[int, int], float]: def calc_rotate(landmark: list[list[float]]) -> tuple[tuple[int, int], float]:

View File

@ -1,60 +0,0 @@
import ctypes
import math
import numpy
from numpy.lib._stride_tricks_impl import as_strided
from matplotlib import pyplot
from test_ext import decode
# Manual smoke test for the `test_ext` decoder: decode one JPEG, rebuild the
# Y and interleaved UV planes from the returned raw buffer, show each plane,
# then convert to RGB and show the final picture.


def _show(image):
    """Display *image* in its own large figure, then free the figure."""
    pyplot.figure(figsize=(20, 20), dpi=150)
    pyplot.imshow(image)
    pyplot.show()
    pyplot.close("all")


with open(file=r"C:\Users\tomokazu\すぐ消す\friends-4385686.jpg", mode="rb") as f:
    ptr, height, width, pitch = decode(f.read())

# Both dimensions are rounded up to the next even number before indexing the
# buffer (presumably because chroma is subsampled 2x2 -- TODO confirm against
# the decoder's surface layout).
pitch_h = math.ceil(height / 2) * 2
pitch_w = math.ceil(width / 2) * 2
print(height, width, pitch)

luma_bytes = pitch_h * pitch
chroma_rows = int(pitch_h / 2)

# NOTE: the variable names y_arr / uv_arr are part of the printed output
# (f-string "=" specifier), so they must not be renamed.
y_arr = numpy.frombuffer(
    (ctypes.c_uint8 * luma_bytes).from_address(ptr),
    dtype=numpy.uint8,
    count=luma_bytes,
)
print(f"{y_arr=}")
uv_arr = numpy.frombuffer(
    (ctypes.c_uint8 * (int(pitch_h * 1.5) * pitch)).from_address(ptr),
    dtype=numpy.uint8,
    count=chroma_rows * pitch,
    offset=luma_bytes,
)
print(f"{uv_arr=}")

# Re-interpret the flat buffers as 2-D planes; rows are `pitch` bytes apart.
y_plane = as_strided(y_arr, (pitch_h, pitch_w), (pitch, 1))
uv_plane = as_strided(uv_arr, (chroma_rows, int(pitch_w / 2), 2), (pitch, 2, 1))

# Upsample each chroma channel 2x in both directions so all planes match.
chroma_planes = [
    uv_plane[:, :, channel].repeat(2, axis=0).repeat(2, axis=1)
    for channel in (0, 1)
]
yuv_plane = numpy.stack([y_plane, *chroma_planes])
print(yuv_plane.shape)
print(yuv_plane.strides)

for plane_index in range(3):
    _show(yuv_plane[plane_index, :, :])

# YCbCr -> RGB using the JFIF conversion coefficients; chroma is centred at 128.
ycbcr_mat = yuv_plane.transpose((1, 2, 0)) - [0, 128, 128]
transform_matrix = numpy.array([
    [1, 0, 1.402],
    [1, -0.344136, -0.714136],
    [1, 1.772, 0],
])
rgb_plane = numpy.clip(numpy.dot(ycbcr_mat, transform_matrix.T), 0, 255).astype(numpy.uint8)
_show(rgb_plane)

View File

@ -1,288 +0,0 @@
import ctypes
import inspect
import json
import math
import os
import warnings
from numpy.f2py.auxfuncs import throw_error
warnings.filterwarnings("ignore", lineno=6, category=UserWarning)
from concurrent.futures.process import ProcessPoolExecutor
from itertools import chain
from multiprocessing import shared_memory
from io import BytesIO
from os import listdir, path, pathsep, makedirs
from pprint import pprint
import more_itertools
import msgspec
import pandas.io.json
import tqdm
from PIL import Image
from uuid import uuid4
from onnxruntime import InferenceSession, SessionOptions, GraphOptimizationLevel
from torch import tensor, as_strided
import aiofiles
import numpy
import torch
from torchvision.io import decode_jpeg
from asyncio import run, gather, Semaphore
from site import getsitepackages
from rust_retinaface_post_processor import resnet_post_process
from test_ext import decode as qsv_decode
# --- Module-level configuration -------------------------------------------
# When True, inference goes through the OpenVINO runtime directly; otherwise
# the script falls back to an onnxruntime InferenceSession.
USE_OPENVINO = True
if USE_OPENVINO:
    import openvino

    ov_core = openvino.Core()

# Put the pip-installed "tensorrt_libs" directory at the front of the DLL
# search path.
os.environ["Path"] = pathsep.join(
    (path.join(getsitepackages()[-1], "tensorrt_libs"), os.environ["Path"])
)

# Input image tree (one sub-directory per name) and the detector model file.
root_dir = r"E:\helloproject-ai-data\blog_images"
model_path = r"C:\Users\tomokazu\build\retinaface\retinaface_only_nn_fp16.onnx"
# makedirs("memmap", exist_ok=True)
files = []
files_data: dict[str, numpy.ndarray | None] = {}
chunk_size = 16  # images per inference batch
image_size = 640  # square model input edge, in pixels

# The script requires an Intel XPU device for tensor preprocessing and
# terminates with exit status -1 when none is available.
if torch.xpu.is_available():
    device = torch.device("xpu")
else:
    device = exit(-1)
async def async_read(path: str, semaphore: Semaphore):
    """Return the full binary contents of the file at *path*.

    The read only starts once *semaphore* has been acquired, so the caller can
    bound how many files are open at the same time.
    """
    async with semaphore, aiofiles.open(file=path, mode="rb") as handle:
        contents = await handle.read()
    return contents
async def gather_runner(l: list, fn, limit: int = 2048):
    """Run ``fn(item, semaphore)`` concurrently for every item of *l*.

    Args:
        l: Items to process (file paths in this script).
        fn: Coroutine function taking ``(item, semaphore)``; it is expected
            to acquire the semaphore around its own work.
        limit: Initial value of the shared semaphore, i.e. the maximum number
            of calls allowed past the semaphore at once. Defaults to 2048,
            the value that used to be hard-coded.

    Returns:
        A list with one result per item, in the same order as *l*.
    """
    sem = Semaphore(limit)
    return await gather(*(fn(item, sem) for item in l))
# def post_processor(outputs, batch_size, image_size):
# # print("aaa", flush=True)
# outputs = [numpy.ascontiguousarray(output.astype(numpy.float32)) for output in outputs]
# res = resnet_post_process([output.__array_interface__["data"][0] for output in outputs], batch_size, image_size)
# return res
#
#
# def post_processor_memmap(tmp_filename, sizes, batch_size, image_size): # print("aaa", flush=True) outputs = [
# numpy.memmap(filename=path.join("memmap", tmp_filename + str(order)), dtype=numpy.float16, mode="r", shape=size)
# for order, size in enumerate(sizes)] outputs = [numpy.ascontiguousarray(output.astype(numpy.float32)) for output in
# outputs] res = resnet_post_process([output.__array_interface__["data"][0] for output in outputs], batch_size,
# image_size) return res
def post_processor_shm(shm_name, sizes, batch_size, image_size):
    """Post-process raw detector outputs that were handed over via shared memory.

    Runs in a worker process. The parent stores each float16 output tensor in
    a shared-memory segment named ``f"{shm_name}_{index}"``.

    Args:
        shm_name: Common name prefix of the shared-memory segments.
        sizes: Shape of each output tensor; ``sizes[i]`` belongs to segment
            ``i``. One segment is opened per entry.
        batch_size: Forwarded unchanged to ``resnet_post_process``.
        image_size: Forwarded unchanged to ``resnet_post_process``.

    Returns:
        Whatever ``resnet_post_process`` returns for this batch.
    """
    shms = []
    try:
        for index in range(len(sizes)):
            shms.append(shared_memory.SharedMemory(name=shm_name + "_" + str(index)))
        # astype() copies the data out of shared memory into private,
        # contiguous float32 arrays; the native post-processor is given the
        # raw data addresses of those copies, so `outputs` must stay alive
        # until the call returns.
        outputs = [
            numpy.ascontiguousarray(
                numpy.ndarray(shape=size, dtype=numpy.float16, buffer=shm.buf).astype(numpy.float32)
            )
            for size, shm in zip(sizes, shms)
        ]
        return resnet_post_process(
            [output.__array_interface__["data"][0] for output in outputs],
            batch_size,
            image_size,
        )
    finally:
        # Detach from the segments explicitly instead of waiting for garbage
        # collection; the creating process remains responsible for removal.
        for shm in shms:
            shm.close()
def dec_jpg(f, fn):
    """Decode an encoded image with Pillow and run the preprocessing callables.

    Args:
        f: Object exposing ``tobytes()`` that yields the encoded image bytes
            (the caller in this file passes a uint8 NumPy array).
        fn: Indexable of three callables, applied in the order ``fn[2]``,
            ``fn[0]``, ``fn[1]`` (the caller in this file passes
            ``[longest_max_size, pad_to, normalize]``).

    Returns:
        The result of ``fn[1]`` applied to the processed image.
    """
    # Force three channels: without this a grayscale image decodes to a 2-D
    # array (so the channel transpose raises) and CMYK/RGBA images produce a
    # channel count other than three.
    rgb_image = Image.open(BytesIO(f.tobytes())).convert("RGB")
    # HWC uint8 -> CHW tensor on the target device, scaled to [0, 1].
    _decoded_image = tensor(numpy.array(rgb_image).transpose([2, 0, 1]))
    _decoded_image = _decoded_image.to(device, torch.float16) / 255
    _decoded_image = fn[2](_decoded_image)
    _decoded_image_resized = fn[0](_decoded_image)
    return fn[1](_decoded_image_resized)
def dec_jpg_qsv(f, fn):
    """Decode a JPEG via ``qsv_decode`` and run the preprocessing callables.

    ``qsv_decode`` returns ``(ptr, height, width, pitch)``. The buffer at
    ``ptr`` is read here as a luma plane followed by an interleaved two-channel
    chroma plane with half as many rows, each row ``pitch`` bytes apart
    (looks like an NV12-style surface -- TODO confirm with the extension).

    Args:
        f: Object exposing ``tobytes()`` that yields the encoded JPEG bytes.
        fn: Indexable of three callables, applied in the order ``fn[2]``,
            ``fn[0]``, ``fn[1]``.

    Returns:
        The result of ``fn[1]`` applied to the processed image.
    """
    ptr, height, width, pitch = qsv_decode(f.tobytes())

    # Round both dimensions up to the next even number.
    pitch_h = math.ceil(height / 2) * 2
    pitch_w = math.ceil(width / 2) * 2
    luma_bytes = pitch_h * pitch
    chroma_rows = int(pitch_h / 2)
    chroma_cols = int(pitch_w / 2)

    # Wrap the raw native memory without copying, then move it to the device.
    # NOTE(review): nothing here releases the buffer behind `ptr`; ownership is
    # not visible from this file -- verify against the extension.
    luma_source = (ctypes.c_uint8 * luma_bytes).from_address(ptr)
    y_arr = torch.frombuffer(luma_source, dtype=torch.uint8, count=luma_bytes).to(device)
    full_source = (ctypes.c_uint8 * (int(pitch_h * 1.5) * pitch)).from_address(ptr)
    uv_arr = torch.frombuffer(
        full_source,
        dtype=torch.uint8,
        count=chroma_rows * pitch,
        offset=luma_bytes,
    ).to(device)

    y_plane = as_strided(y_arr, (pitch_h, pitch_w), (pitch, 1))
    uv_plane = as_strided(uv_arr, (chroma_rows, chroma_cols, 2), (pitch, 2, 1))

    # Duplicate every chroma sample 2x2 so all three planes share one size.
    upsampled_chroma = [
        uv_plane[:, :, channel].repeat_interleave(2, dim=0).repeat_interleave(2, dim=1)
        for channel in (0, 1)
    ]
    yuv_plane = torch.stack([y_plane, upsampled_chroma[0], upsampled_chroma[1]])

    # YCbCr -> RGB with the JFIF coefficients; chroma is centred at 128.
    chroma_offset = torch.Tensor([0, 128, 128]).to(device)
    ycbcr_mat = yuv_plane.permute((1, 2, 0)) - chroma_offset
    transform_matrix = torch.Tensor([
        [1, 0, 1.402],
        [1, -0.344136, -0.714136],
        [1, 1.772, 0],
    ]).to(device)
    rgb_plane = torch.clip(torch.matmul(ycbcr_mat, transform_matrix.T), 0, 255).to(device, torch.uint8) / 255

    # HWC -> CHW, then the three preprocessing steps.
    _decoded_image = fn[2](rgb_plane.permute((2, 0, 1)))
    _decoded_image_resized = fn[0](_decoded_image)
    return fn[1](_decoded_image_resized)
if __name__ == '__main__':
    # Batch face-detection driver.
    #
    # For every sub-directory of `root_dir`: read all not-yet-processed files,
    # preprocess them into fixed-size batches, run the detector, pass the raw
    # outputs to worker processes through shared memory for post-processing,
    # and append one JSON line per detection (or `null` when nothing was
    # found) to faces_qsv.jsonl. File names already present in that file are
    # skipped, so an interrupted run can be resumed.
    from kornia.augmentation import LongestMaxSize, PadTo, Normalize
    from kornia.constants import Resample

    longest_max_size = LongestMaxSize(max_size=640, resample=Resample.NEAREST)
    pad_to = PadTo(size=(640, 640), pad_value=1.)
    normalize = Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))

    if USE_OPENVINO:
        for ov_device in ov_core.get_available_devices():
            device_name = ov_core.get_property(ov_device, "FULL_DEVICE_NAME")
            print(f"{ov_device}: {device_name}")
        onnx_model = ov_core.read_model(model_path)
        # Fix the batch dimension so the compiled model has a static shape.
        onnx_model.reshape([chunk_size, 3, image_size, image_size])
        onnx_model = ov_core.compile_model(onnx_model, device_name='GPU')
    else:
        session_options = SessionOptions()
        session_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        session = InferenceSession(
            path_or_bytes=model_path,
            providers=[
                ('OpenVINOExecutionProvider', {
                    'device_type': 'GPU.0',
                    'precision': 'FP16',
                    'cache_dir': 'openvino_cache'
                }),
                'CPUExecutionProvider'
            ],
            sess_options=session_options
        )

    # Collect the file names that already have a result line. Blank lines
    # (including an entirely empty file) are skipped instead of being handed
    # to the JSON decoder.
    already = set()
    if os.path.exists("faces_qsv.jsonl"):
        with open(file="faces_qsv.jsonl", mode="r", encoding="utf-8") as fp:
            for line in fp:
                line = line.strip()
                if line:
                    already.add(next(iter(msgspec.json.decode(line))))

    all_file_names = set().union(*[listdir(path.join(root_dir, name)) for name in listdir(root_dir)])
    pbar = tqdm.tqdm(total=len(all_file_names - already))

    # The hardware (QSV) decode path is currently switched off; every image
    # goes through the Pillow decoder. Set to True to try QSV first and fall
    # back to Pillow when it fails.
    use_qsv_decode = False
    fn_pack = [longest_max_size, pad_to, normalize]

    for name in listdir(root_dir):
        with ProcessPoolExecutor(max_workers=12) as executor:
            pbar.set_description_str(desc=name, refresh=True)
            file_names = list(set(listdir(path.join(root_dir, name))) - already)
            name_files = [path.join(root_dir, name, file_name) for file_name in file_names]
            # Read every pending file of this directory concurrently.
            files_data = {file_name: numpy.frombuffer(dat, dtype=numpy.uint8) for file_name, dat in
                          zip(file_names, run(gather_runner(name_files, async_read)))}
            if not files_data:
                continue
            futures = []
            shms = []
            namess = []
            try:
                for cnk in more_itertools.chunked(files_data.items(), n=chunk_size):
                    stack = []
                    names = []
                    if USE_OPENVINO:
                        for file, dat in cnk:
                            decoded = None
                            if use_qsv_decode:
                                try:
                                    decoded = dec_jpg_qsv(dat, fn_pack)
                                except Exception:
                                    decoded = None
                            if decoded is None:
                                decoded = dec_jpg(dat, fn_pack)
                            stack.append(decoded.squeeze())
                            names.append(file)
                    else:
                        print("fallback", inspect.currentframe().f_lineno)
                        for file, dat in cnk:
                            try:
                                decoded_image = decode_jpeg(tensor(dat), device=device)
                            except Exception:
                                # Pillow fallback; force three channels so the
                                # channel transpose below is always valid.
                                decoded_image = tensor(
                                    numpy.array(Image.open(BytesIO(dat.tobytes())).convert("RGB"))
                                    .transpose([2, 0, 1]))
                            decoded_image = decoded_image.to(device, torch.float16) / 255
                            decoded_image = normalize(decoded_image)
                            decoded_image_resized = longest_max_size(decoded_image)
                            decoded_image_padded = pad_to(decoded_image_resized)
                            stack.append(decoded_image_padded.squeeze())
                            names.append(file)
                    namess.append(names)

                    # Pad a short final chunk with blank images: the model was
                    # reshaped to a fixed batch of `chunk_size`.
                    stack.extend(
                        torch.zeros(size=[3, 640, 640], dtype=torch.float16, device=device)
                        for _ in range(chunk_size - len(stack)))
                    stacked = torch.stack(stack).contiguous()

                    if USE_OPENVINO:
                        _outputs = onnx_model([stacked.cpu()])
                        # Outputs are collected in reverse index order (2, 1, 0).
                        outputs = [_outputs[onnx_model.output(i)] for i in range(2, -1, -1)]
                    else:
                        io_binding = session.io_binding()
                        io_binding.bind_input(
                            name="input",
                            device_type=stacked.device.type,
                            device_id=stacked.device.index if stacked.device.index is not None else 0,
                            element_type='float16',
                            shape=tuple(stacked.shape),
                            buffer_ptr=stacked.data_ptr()
                        )
                        io_binding.bind_output("landmark")
                        io_binding.bind_output("confidence")
                        io_binding.bind_output("bbox")
                        session.run_with_iobinding(iobinding=io_binding)
                        outputs: list[numpy.ndarray] = io_binding.copy_outputs_to_cpu()
                        print("fallback", inspect.currentframe().f_lineno)

                    # Publish the raw outputs in shared memory (one segment
                    # per output, named "<uuid>_<index>") so that only the
                    # name and the shapes have to be sent to the worker.
                    uuid = str(uuid4())
                    shared_array: list[shared_memory.SharedMemory] = [
                        shared_memory.SharedMemory(name=uuid + "_" + str(order), create=True, size=output.nbytes)
                        for order, output in enumerate(outputs)]
                    # Register the segments right away so they are released
                    # even if a later step of this chunk fails.
                    shms.extend(shared_array)
                    for segment, output in zip(shared_array, outputs, strict=True):
                        shared_view = numpy.ndarray(shape=output.shape, dtype=numpy.float16, buffer=segment.buf)
                        shared_view[:] = output[:]
                    futures.append(
                        executor.submit(post_processor_shm, uuid, [output.shape for output in outputs],
                                        chunk_size, [image_size, image_size]))
                    pbar.update(n=len(cnk))

                futures_results = [future.result() for future in futures]
                with open("faces_qsv.jsonl", mode="a", encoding="utf-8") as fp:
                    for names, futures_result in zip(namess, futures_results):
                        # zip() stops at the real file names, which drops the
                        # results that belong to the blank padding images.
                        for file_name, results in zip(names, futures_result):
                            if results:
                                for result in results:
                                    fp.write(
                                        pandas.io.json.ujson_dumps(
                                            {file_name: [result[0], result[1][0], result[2]]},
                                            ensure_ascii=False, double_precision=5) + "\n")
                            else:
                                fp.write(
                                    pandas.io.json.ujson_dumps({file_name: None}, ensure_ascii=False) + "\n")
            finally:
                # This process created the segments, so it must both detach
                # from them and remove them; close() alone leaves the segments
                # allocated on platforms where unlink() is not a no-op.
                for shm in shms:
                    shm.close()
                    shm.unlink()

View File

@ -1,8 +1,5 @@
import json import json
import os import os
import warnings
warnings.filterwarnings("ignore", lineno=6, category=UserWarning)
from concurrent.futures.process import ProcessPoolExecutor from concurrent.futures.process import ProcessPoolExecutor
from itertools import chain from itertools import chain
from multiprocessing import shared_memory from multiprocessing import shared_memory
@ -15,7 +12,7 @@ import pandas.io.json
import tqdm import tqdm
from PIL import Image from PIL import Image
from uuid import uuid4 from uuid import uuid4
from onnxruntime import InferenceSession, SessionOptions, GraphOptimizationLevel from onnxruntime import InferenceSession
from torch import tensor from torch import tensor
import aiofiles import aiofiles
import numpy import numpy
@ -25,21 +22,16 @@ from asyncio import run, gather, Semaphore
from site import getsitepackages from site import getsitepackages
from rust_retinaface_post_processor import resnet_post_process from rust_retinaface_post_processor import resnet_post_process
USE_OPENVINO = True
if USE_OPENVINO:
import openvino
ov_core = openvino.Core()
os.environ["Path"] = path.join(getsitepackages()[-1], "tensorrt_libs") + pathsep + os.environ["Path"] os.environ["Path"] = path.join(getsitepackages()[-1], "tensorrt_libs") + pathsep + os.environ["Path"]
root_dir = r"E:\helloproject-ai-data\blog_images" root_dir = r"D:\helloproject-ai-data\blog_images"
model_path = r"C:\Users\tomokazu\build\retinaface\retinaface_only_nn_fp16.onnx" model_path = r"C:\Users\tomokazu\build\retinaface\retinaface_only_nn_fp16.onnx"
# makedirs("memmap", exist_ok=True) # makedirs("memmap", exist_ok=True)
files = [] files = []
files_data: dict[str, numpy.ndarray | None] = {} files_data: dict[str, numpy.ndarray | None] = {}
chunk_size = 16 chunk_size = 32
image_size = 640 image_size = 640
device = torch.device("cpu") if torch.xpu.is_available() else exit(-1) device = torch.device("cuda")
async def async_read(path: str, semaphore: Semaphore): async def async_read(path: str, semaphore: Semaphore):
@ -77,14 +69,6 @@ def post_processor_shm(shm_name, sizes, batch_size, image_size):
return res return res
def dec_jpg(f, fn):
_decoded_image = tensor(numpy.array(Image.open(BytesIO(f.tobytes()))).transpose([2, 0, 1]))
_decoded_image = _decoded_image.to(device, torch.float16) / 255
_decoded_image = fn[2](_decoded_image)
_decoded_image_resized = fn[0](_decoded_image)
return fn[1](_decoded_image_resized)
if __name__ == '__main__': if __name__ == '__main__':
from kornia.augmentation import LongestMaxSize, PadTo, Normalize from kornia.augmentation import LongestMaxSize, PadTo, Normalize
from kornia.constants import Resample from kornia.constants import Resample
@ -92,41 +76,24 @@ if __name__ == '__main__':
longest_max_size = LongestMaxSize(max_size=640, resample=Resample.NEAREST) longest_max_size = LongestMaxSize(max_size=640, resample=Resample.NEAREST)
pad_to = PadTo(size=(640, 640), pad_value=1.) pad_to = PadTo(size=(640, 640), pad_value=1.)
normalize = Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) normalize = Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
if USE_OPENVINO:
onnx_model = ov_core.read_model(model_path)
onnx_model.reshape([chunk_size, 3, image_size, image_size])
onnx_model = ov_core.compile_model(onnx_model, device_name='GPU')
else: session = InferenceSession(
session_options = SessionOptions() path_or_bytes=model_path,
session_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL providers=[
# session_options.optimized_model_filepath = GraphOptimizationLevel = "onnx_cache" ('TensorrtExecutionProvider', {
session = InferenceSession( 'trt_engine_cache_enable': True,
path_or_bytes=model_path, 'trt_engine_cache_path': 'trt_cache',
providers=[ 'trt_fp16_enable': True,
('TensorrtExecutionProvider', { 'trt_profile_min_shapes': f'input:1x3x{image_size}x{image_size}',
'trt_engine_cache_enable': True, 'trt_profile_max_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}',
'trt_engine_cache_path': 'trt_cache', 'trt_profile_opt_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}',
'trt_fp16_enable': True, }),
'trt_profile_min_shapes': f'input:1x3x{image_size}x{image_size}', 'CUDAExecutionProvider',
'trt_profile_max_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}', 'CPUExecutionProvider'
'trt_profile_opt_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}', ]
}), )
('OpenVINOExecutionProvider', { with open(file="faces.jsonl", mode="r", encoding="utf-8") as fp:
'device_type': 'GPU.0', already = {list(msgspec.json.decode(line).keys())[0] for line in fp.read().removesuffix("\n").split("\n")}
'precision': 'FP16',
'cache_dir': 'openvino_cache'
}),
'CUDAExecutionProvider',
'CPUExecutionProvider'
],
sess_options=session_options
)
if os.path.exists("faces.jsonl"):
with open(file="faces.jsonl", mode="r", encoding="utf-8") as fp:
already = {list(msgspec.json.decode(line).keys())[0] for line in fp.read().removesuffix("\n").split("\n")}
else:
already = set()
pbar = tqdm.tqdm( pbar = tqdm.tqdm(
total=(set().union(*[listdir(path.join(root_dir, name)) for name in listdir(root_dir)]) - already).__len__()) total=(set().union(*[listdir(path.join(root_dir, name)) for name in listdir(root_dir)]) - already).__len__())
@ -134,7 +101,7 @@ if __name__ == '__main__':
# exit(0) # exit(0)
for name in listdir(root_dir): for name in listdir(root_dir):
with (ProcessPoolExecutor(max_workers=16) as executor): with (ProcessPoolExecutor(max_workers=4) as executor):
pbar.set_description_str(desc=name, refresh=True) pbar.set_description_str(desc=name, refresh=True)
if name != "ブログ": if name != "ブログ":
# continue # continue
@ -154,53 +121,38 @@ if __name__ == '__main__':
for cnk in more_itertools.chunked(files_data.items(), n=chunk_size): for cnk in more_itertools.chunked(files_data.items(), n=chunk_size):
stack = [] stack = []
names = [] names = []
if USE_OPENVINO: for file, dat in cnk:
fn_pack = [longest_max_size, pad_to, normalize] try:
submits = [] decoded_image = decode_jpeg(tensor(dat), device=device)
for file, dat in cnk: except:
submits.append(executor.submit(dec_jpg, dat, fn_pack)) decoded_image = tensor(
names.append(file) numpy.array(Image.open(BytesIO(dat.tobytes()))).transpose([2, 0, 1])).to(
for submit in submits: device)
stack.append(submit.result().squeeze()) decoded_image = decoded_image.to(torch.float16) / 255
else: decoded_image = normalize(decoded_image)
for file, dat in cnk: decoded_image_resized = longest_max_size(decoded_image)
try: decoded_image_padded = pad_to(decoded_image_resized)
decoded_image = decode_jpeg(tensor(dat), device=device) stack.append(decoded_image_padded.squeeze())
except: names.append(file)
decoded_image = tensor(
numpy.array(Image.open(BytesIO(dat.tobytes()))).transpose([2, 0, 1]))
decoded_image = decoded_image.to(device, torch.float16) / 255
decoded_image = normalize(decoded_image)
decoded_image_resized = longest_max_size(decoded_image)
decoded_image_padded = pad_to(decoded_image_resized)
stack.append(decoded_image_padded.squeeze())
names.append(file)
namess.append(names) namess.append(names)
[stack.append(torch.zeros(size=[3, 640, 640], dtype=torch.float16, device=device)) for _ in [stack.append(torch.zeros(size=[3, 640, 640], dtype=torch.float16, device=device)) for _ in
range(chunk_size - stack.__len__())] range(chunk_size - stack.__len__())]
stacked = torch.stack(stack).contiguous() stacked = torch.stack(stack).contiguous()
# print(stacked.shape) # print(stacked.shape)
if USE_OPENVINO: io_binding = session.io_binding()
_outputs = onnx_model([stacked]) io_binding.bind_input(
# print(_outputs[onnx_model.output(0)]) name="input",
outputs = [_outputs[onnx_model.output(i)] for i in range(2, -1, -1)] device_type=stacked.device.type,
# print(outputs) device_id=stacked.device.index,
else: element_type='float16',
io_binding = session.io_binding() shape=tuple(stacked.shape),
io_binding.bind_input( buffer_ptr=stacked.data_ptr()
name="input", )
device_type=stacked.device.type, io_binding.bind_output("landmark")
device_id=stacked.device.index if stacked.device.index is not None else 0, io_binding.bind_output("confidence")
element_type='float16', io_binding.bind_output("bbox")
shape=tuple(stacked.shape), session.run_with_iobinding(iobinding=io_binding)
buffer_ptr=stacked.data_ptr() outputs: list[numpy.ndarray] = io_binding.copy_outputs_to_cpu()
)
io_binding.bind_output("landmark")
io_binding.bind_output("confidence")
io_binding.bind_output("bbox")
session.run_with_iobinding(iobinding=io_binding)
outputs: list[numpy.ndarray] = io_binding.copy_outputs_to_cpu()
# [numpy.memmap(filename=path.join("memmap", tmp_file_name + str(order)), dtype=numpy.float16, # [numpy.memmap(filename=path.join("memmap", tmp_file_name + str(order)), dtype=numpy.float16,
# mode="w+", shape=output.shape) for order, output in enumerate(outputs)] # mode="w+", shape=output.shape) for order, output in enumerate(outputs)]
uuid = uuid4().__str__() uuid = uuid4().__str__()