diff --git a/test_script/qsv_decode_test.py b/test_script/qsv_decode_test.py
index 7004587..152122b 100644
--- a/test_script/qsv_decode_test.py
+++ b/test_script/qsv_decode_test.py
@@ -46,15 +46,14 @@ pyplot.figure(figsize=(20, 20), dpi=150)
 pyplot.imshow(yuv_plane[2, :, :])
 pyplot.show()
 pyplot.close("all")
-ycbcr_mat = yuv_plane.transpose((1, 2, 0)).reshape((-1, 3)) - [0, 128, 128]
+ycbcr_mat = yuv_plane.transpose((1, 2, 0)) - [0, 128, 128]
 # print(ycbcr_mat)
 transform_matrix = numpy.array([
-    [1.0, 0.0, 1.5748],
-    [1.0, -0.1873, -0.4681],
-    [1.0, 1.8556, 0.0]
+    [1, 0, 1.402],
+    [1, -0.344136, -0.714136],
+    [1, 1.772, 0]
 ])
-rgb_plane = (numpy.clip(numpy.dot(ycbcr_mat, transform_matrix.T), 0, 255)
-             .reshape(pitch_h, pitch_w, 3).astype(numpy.uint8))
+rgb_plane = (numpy.clip(numpy.dot(ycbcr_mat, transform_matrix.T), 0, 255).astype(numpy.uint8))
 pyplot.figure(figsize=(20, 20), dpi=150)
 pyplot.imshow(rgb_plane)
 pyplot.show()
diff --git a/test_script/qsv_jpeg_decode.py b/test_script/qsv_jpeg_decode.py
index 39bcc28..34f85a6 100644
--- a/test_script/qsv_jpeg_decode.py
+++ b/test_script/qsv_jpeg_decode.py
@@ -1,7 +1,12 @@
+import ctypes
+import inspect
 import json
+import math
 import os
 import warnings
 
+from numpy.f2py.auxfuncs import throw_error
+
 warnings.filterwarnings("ignore", lineno=6, category=UserWarning)
 from concurrent.futures.process import ProcessPoolExecutor
 from itertools import chain
@@ -16,7 +21,7 @@ import tqdm
 from PIL import Image
 from uuid import uuid4
 from onnxruntime import InferenceSession, SessionOptions, GraphOptimizationLevel
-from torch import tensor
+from torch import tensor, as_strided
 import aiofiles
 import numpy
 import torch
@@ -24,6 +29,7 @@ from torchvision.io import decode_jpeg
 from asyncio import run, gather, Semaphore
 from site import getsitepackages
 from rust_retinaface_post_processor import resnet_post_process
+from test_ext import decode as qsv_decode
 
 USE_OPENVINO = True
 if USE_OPENVINO:
@@ -39,7 +45,7 @@ files = []
 files_data: dict[str, numpy.ndarray | None] = {}
 chunk_size = 16
 image_size = 640
-device = torch.device("cpu") if torch.xpu.is_available() else exit(-1)
+device = torch.device("xpu") if torch.xpu.is_available() else exit(-1)
 
 
 async def async_read(path: str, semaphore: Semaphore):
@@ -78,6 +84,7 @@ def post_processor_shm(shm_name, sizes, batch_size, image_size):
 
 
 def dec_jpg(f, fn):
+    # print("USE PILLOW")
     _decoded_image = tensor(numpy.array(Image.open(BytesIO(f.tobytes()))).transpose([2, 0, 1]))
     _decoded_image = _decoded_image.to(device, torch.float16) / 255
     _decoded_image = fn[2](_decoded_image)
@@ -85,6 +92,31 @@ def dec_jpg(f, fn):
     return fn[1](_decoded_image_resized)
 
 
+def dec_jpg_qsv(f, fn):
+    ptr, height, width, pitch = qsv_decode(f.tobytes())
+    pitch_h = math.ceil(height / 2) * 2
+    pitch_w = math.ceil(width / 2) * 2
+    y_arr = torch.frombuffer((ctypes.c_uint8 * (pitch_h * pitch)).from_address(ptr), dtype=torch.uint8,
+                             count=pitch_h * pitch).to(device)
+    uv_arr = torch.frombuffer((ctypes.c_uint8 * (int(pitch_h * 1.5) * pitch)).from_address(ptr),
+                              dtype=torch.uint8, count=int(pitch_h / 2) * pitch, offset=pitch_h * pitch).to(device)
+    y_plane = as_strided(y_arr, (pitch_h, pitch_w), (pitch, 1))
+    uv_plane = as_strided(uv_arr, (int(pitch_h / 2), int(pitch_w / 2), 2), (pitch, 2, 1))
+    yuv_plane = torch.stack([y_plane,
+                             uv_plane[:, :, 0].repeat_interleave(2, dim=0).repeat_interleave(2, dim=1),
+                             uv_plane[:, :, 1].repeat_interleave(2, dim=0).repeat_interleave(2, dim=1)])
+    ycbcr_mat = yuv_plane.permute((1, 2, 0)) - torch.Tensor([0, 128, 128]).to(device)
+    transform_matrix = torch.Tensor([
+        [1, 0, 1.402],
+        [1, -0.344136, -0.714136],
+        [1, 1.772, 0]
+    ]).to(device)
+    rgb_plane = torch.clip(torch.matmul(ycbcr_mat, transform_matrix.T), 0, 255).to(device, torch.uint8) / 255
+    _decoded_image = fn[2](rgb_plane.permute((2, 0, 1)))
+    _decoded_image_resized = fn[0](_decoded_image)
+    return fn[1](_decoded_image_resized)
+
+
 if __name__ == '__main__':
     from kornia.augmentation import LongestMaxSize, PadTo, Normalize
     from kornia.constants import Resample
@@ -93,10 +125,14 @@ if __name__ == '__main__':
     pad_to = PadTo(size=(640, 640), pad_value=1.)
     normalize = Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
     if USE_OPENVINO:
+
+        for ov_device in ov_core.get_available_devices():
+            device_name = ov_core.get_property(ov_device, "FULL_DEVICE_NAME")
+            print(f"{ov_device}: {device_name}")
         onnx_model = ov_core.read_model(model_path)
         onnx_model.reshape([chunk_size, 3, image_size, image_size])
         onnx_model = ov_core.compile_model(onnx_model, device_name='GPU')
-
+        # print(onnx_model)
     else:
         session_options = SessionOptions()
         session_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
@@ -104,26 +140,17 @@ if __name__ == '__main__':
         session = InferenceSession(
             path_or_bytes=model_path,
             providers=[
-                ('TensorrtExecutionProvider', {
-                    'trt_engine_cache_enable': True,
-                    'trt_engine_cache_path': 'trt_cache',
-                    'trt_fp16_enable': True,
-                    'trt_profile_min_shapes': f'input:1x3x{image_size}x{image_size}',
-                    'trt_profile_max_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}',
-                    'trt_profile_opt_shapes': f'input:{chunk_size}x3x{image_size}x{image_size}',
-                }),
                 ('OpenVINOExecutionProvider', {
                     'device_type': 'GPU.0',
                     'precision': 'FP16',
                     'cache_dir': 'openvino_cache'
                 }),
-                'CUDAExecutionProvider',
                 'CPUExecutionProvider'
             ],
             sess_options=session_options
         )
-    if os.path.exists("faces.jsonl"):
-        with open(file="faces.jsonl", mode="r", encoding="utf-8") as fp:
+    if os.path.exists("faces_qsv.jsonl"):
+        with open(file="faces_qsv.jsonl", mode="r", encoding="utf-8") as fp:
             already = {list(msgspec.json.decode(line).keys())[0] for line in fp.read().removesuffix("\n").split("\n")}
     else:
         already = set()
@@ -134,7 +161,7 @@ if __name__ == '__main__':
     # exit(0)
 
     for name in listdir(root_dir):
-        with (ProcessPoolExecutor(max_workers=16) as executor):
+        with (ProcessPoolExecutor(max_workers=12) as executor):
             pbar.set_description_str(desc=name, refresh=True)
             if name != "ブログ":
                 # continue
@@ -157,12 +184,26 @@ if __name__ == '__main__':
                 if USE_OPENVINO:
                     fn_pack = [longest_max_size, pad_to, normalize]
                     submits = []
+                    # for file, dat in cnk:
+                    #     submits.append(executor.submit(dec_jpg_qsv, dat, fn_pack))
+                    #     names.append(file)
+                    # for submit in submits:
+                    #     try:
+                    #         stack.append(submit.result().to(device).squeeze())
+                    #     except Exception as e:
+                    #         print(e)
+                    #         stack.append(dec_jpg(dat, fn_pack).squeeze())
                     for file, dat in cnk:
-                        submits.append(executor.submit(dec_jpg, dat, fn_pack))
+                        try:
+                            raise Exception
+                            stack.append(dec_jpg_qsv(dat, fn_pack).squeeze())
+                        except Exception as e:
+                            # print(e)
+                            stack.append(dec_jpg(dat, fn_pack).squeeze())
                         names.append(file)
-                    for submit in submits:
-                        stack.append(submit.result().squeeze())
+
                 else:
+                    print("fallback", inspect.currentframe().f_lineno)
                     for file, dat in cnk:
                         try:
                             decoded_image = decode_jpeg(tensor(dat), device=device)
@@ -181,10 +222,9 @@ if __name__ == '__main__':
                 stacked = torch.stack(stack).contiguous()
                 # print(stacked.shape)
                 if USE_OPENVINO:
-                    _outputs = onnx_model([stacked])
+                    _outputs = onnx_model([stacked.cpu()])
                     # print(_outputs[onnx_model.output(0)])
                     outputs = [_outputs[onnx_model.output(i)] for i in range(2, -1, -1)]
-                    # print(outputs)
                 else:
                     io_binding = session.io_binding()
                     io_binding.bind_input(
@@ -200,7 +240,7 @@ if __name__ == '__main__':
                     io_binding.bind_output("bbox")
                     session.run_with_iobinding(iobinding=io_binding)
                     outputs: list[numpy.ndarray] = io_binding.copy_outputs_to_cpu()
-
+                    print("fallback", inspect.currentframe().f_lineno)
                 # [numpy.memmap(filename=path.join("memmap", tmp_file_name + str(order)), dtype=numpy.float16,
                 #               mode="w+", shape=output.shape) for order, output in enumerate(outputs)]
                 uuid = uuid4().__str__()
@@ -218,7 +258,7 @@ if __name__ == '__main__':
                 # exit(0)
                 pbar.update(n=cnk.__len__())
             # result_dict = dict()
-            with open("faces.jsonl", mode="a", encoding="utf-8") as fp:
+            with open("faces_qsv.jsonl", mode="a", encoding="utf-8") as fp:
                 futures_results = [future.result() for future in futures]
                 # pprint(futures_results)
                 for names, futures_result in zip(namess, futures_results):