diff --git a/test_script/retinaface_pure_impl.py b/test_script/retinaface_pure_impl.py index 71b533d..dbb2c2c 100644 --- a/test_script/retinaface_pure_impl.py +++ b/test_script/retinaface_pure_impl.py @@ -3,11 +3,11 @@ from math import ceil import torch from PIL import Image -from numpy import array +from numpy import array, transpose from retinaface.pre_trained_models import get_model from retinaface.predict_single import Model from retinaface.network import RetinaFace -from torch import jit, randn, no_grad, Tensor, int64, tensor, onnx +from torch import jit, randn, no_grad, Tensor, int64, tensor, onnx, from_numpy import albumentations as A from torchinfo import summary import numpy as np @@ -21,13 +21,12 @@ from torchvision.utils import _log_api_usage_once # model.eval() -image = Image.open( - fp="/home/tomokazu/PycharmProjects/helloproject-ai/data/blog_images" - "/稲場愛香/稲場愛香=juicejuice-official=12737097989-2.jpg").convert(mode="RGB") -image_arr = array(image) +image = Image.open(r"C:\Users\tomokazu\すぐ消す\野中美希.jpg").convert(mode="RGB") +image_arr = from_numpy(np.array(object=image, dtype=np.float32)).unsqueeze(0).permute(0, 3, 1, 2) + max_size = 512 -example_input = randn(size=[1, 3, 256, 256]).float().cuda() +example_input = randn(size=[10, 3, 256, 256]).float() retina_model = RetinaFace( name="Resnet50", @@ -35,267 +34,31 @@ retina_model = RetinaFace( return_layers={"layer2": 1, "layer3": 2, "layer4": 3}, in_channels=256, out_channels=256, -).cuda() - - -# onnx.export( -# model=retina_model, args=example_input, export_params=True, verbose=False, input_names=["input"], -# output_names=["bbox", "confidence", "landmark"], -# dynamic_axes={"input": { -# 0: "batch_size", -# 2: "height", -# 3: "width" -# }, "bbox": {1: "bbox"}, "confidence": {1: "confidence"}, "landmark": {1: "landmark"}}, opset_version=16, -# f="retinaface.onnx" -# ) - -def pad_to_size( - target_size: Tuple[int, int], - image: np.array, - bboxes: Optional[np.ndarray] = None, - keypoints: Optional[np.ndarray] = None, -) -> Dict[str, Union[np.ndarray, Tuple[int, int, int, int]]]: - target_height, target_width = target_size - - image_height, image_width = image.shape[:2] - - if target_width < image_width: - raise ValueError(f"Target width should bigger than image_width" f"We got {target_width} {image_width}") - - if target_height < image_height: - raise ValueError(f"Target height should bigger than image_height" f"We got {target_height} {image_height}") - - if image_height == target_height: - y_min_pad = 0 - y_max_pad = 0 - else: - y_pad = target_height - image_height - y_min_pad = y_pad // 2 - y_max_pad = y_pad - y_min_pad - - if image_width == target_width: - x_min_pad = 0 - x_max_pad = 0 - else: - x_pad = target_width - image_width - x_min_pad = x_pad // 2 - x_max_pad = x_pad - x_min_pad - - result = { - "pads": (x_min_pad, y_min_pad, x_max_pad, y_max_pad), - "image": cv2.copyMakeBorder(image, y_min_pad, y_max_pad, x_min_pad, x_max_pad, cv2.BORDER_CONSTANT), - } - - if bboxes is not None: - bboxes[:, 0] += x_min_pad - bboxes[:, 1] += y_min_pad - bboxes[:, 2] += x_min_pad - bboxes[:, 3] += y_min_pad - - result["bboxes"] = bboxes - - if keypoints is not None: - keypoints[:, 0] += x_min_pad - keypoints[:, 1] += y_min_pad - - result["keypoints"] = keypoints - - return result - - -def tensor_from_rgb_image(image: np.ndarray) -> torch.Tensor: - image = np.transpose(image, (2, 0, 1)) - return torch.from_numpy(image) - - -def priorbox(min_sizes, steps, clip, image_size): - feature_maps = [[ceil(image_size[0] / step), ceil(image_size[1] / step)] for step in steps] - - anchors = [] - for k, f in enumerate(feature_maps): - t_min_sizes = min_sizes[k] - for i, j in product(range(f[0]), range(f[1])): - for min_size in t_min_sizes: - s_kx = min_size / image_size[1] - s_ky = min_size / image_size[0] - dense_cx = [x * steps[k] / image_size[1] for x in [j + 0.5]] - dense_cy = [y * steps[k] / image_size[0] for y in [i + 0.5]] - for cy, cx in product(dense_cy, dense_cx): - anchors += [cx, cy, s_kx, s_ky] - - # back to torch land - output = torch.Tensor(anchors).view(-1, 4) - if clip: - output.clamp_(max=1, min=0) - return output - - -def decode( - loc: torch.Tensor, priors: torch.Tensor, variances: Union[List[float], Tuple[float, float]] -) -> torch.Tensor: - boxes = torch.cat( - ( - priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], - priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1]), - ), - 1, - ) - boxes[:, :2] -= boxes[:, 2:] / 2 - boxes[:, 2:] += boxes[:, :2] - return boxes - - -def decode_landm( - pre: torch.Tensor, priors: torch.Tensor, variances: Union[List[float], Tuple[float, float]] -) -> torch.Tensor: - return torch.cat( - ( - priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:], - priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:], - priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:], - priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:], - priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:], - ), - dim=1, - ) - - -def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor: - if not torch.jit.is_scripting() and not torch.jit.is_tracing(): - _log_api_usage_once(nms) - _assert_has_ops() - return torch.ops.torchvision.nms(boxes, scores, iou_threshold) - - -def unpad_from_size( - pads: Tuple[int, int, int, int], - image: Optional[np.array] = None, - bboxes: Optional[np.ndarray] = None, - keypoints: Optional[np.ndarray] = None, -) -> Dict[str, np.ndarray]: - x_min_pad, y_min_pad, x_max_pad, y_max_pad = pads - - result = {} - - if image is not None: - height, width = image.shape[:2] - result["image"] = image[y_min_pad: height - y_max_pad, x_min_pad: width - x_max_pad] - - if bboxes is not None: - bboxes[:, 0] -= x_min_pad - bboxes[:, 1] -= y_min_pad - bboxes[:, 2] -= x_min_pad - bboxes[:, 3] -= y_min_pad - - result["bboxes"] = bboxes - - if keypoints is not None: - keypoints[:, 0] -= x_min_pad - keypoints[:, 1] -= y_min_pad - - result["keypoints"] = keypoints - - return result - - -device = "cuda" -transform = A.Compose([A.LongestMaxSize(max_size=max_size, p=1), A.Normalize(p=1)]) -variance = [0.1, 0.2] -nms_threshold = .4 -confidence_threshold = .7 -_priorbox = priorbox( - min_sizes=[[16, 32], [64, 128], [256, 512]], - steps=[8, 16, 32], - clip=False, - image_size=(max_size, max_size), -).to(device) -original_height, original_width = image_arr.shape[:2] - -scale_landmarks = torch.from_numpy(np.tile([max_size, max_size], 5)).to(device) -scale_bboxes = torch.from_numpy(np.tile([max_size, max_size], 2)).to(device) - -transformed_image = transform(image=image_arr)["image"] - -paded = pad_to_size(target_size=(max_size, max_size), image=transformed_image) - -pads = paded["pads"] - -torched_image = tensor_from_rgb_image(paded["image"]).to(device) - - -# loc, conf, land = retina_model(torched_image.unsqueeze(0)) - - -def infer(loc, conf, land): - conf = F.softmax(conf, dim=-1) - - annotations = [] - - boxes = decode(loc.data[0], _priorbox, variance) - - boxes *= scale_bboxes - scores = conf[0][:, 1] - - landmarks = decode_landm(land.data[0], _priorbox, variance) - landmarks *= scale_landmarks - - # ignore low scores - valid_index = torch.where(scores > confidence_threshold)[0] - boxes = boxes[valid_index] - landmarks = landmarks[valid_index] - scores = scores[valid_index] - - # Sort from high to low - order = scores.argsort(descending=True) - boxes = boxes[order] - landmarks = landmarks[order] - scores = scores[order] - - # do NMS - keep = nms(boxes, scores, nms_threshold) - boxes = boxes[keep, :].int() - - if boxes.shape[0] == 0: - return [{"bbox": [], "score": -1, "landmarks": []}] - - landmarks = landmarks[keep] - - scores = scores[keep].cpu().detach().numpy().astype(np.float64) - boxes = boxes.cpu().numpy() - landmarks = landmarks.cpu().numpy() - landmarks = landmarks.reshape([-1, 2]) - - unpadded = unpad_from_size(pads, bboxes=boxes, keypoints=landmarks) - - resize_coeff = max(original_height, original_width) / max_size - - boxes = (unpadded["bboxes"] * resize_coeff).astype(int) - landmarks = (unpadded["keypoints"].reshape(-1, 10) * resize_coeff).astype(int) - - for box_id, bbox in enumerate(boxes): - x_min, y_min, x_max, y_max = bbox - - x_min = np.clip(x_min, 0, original_width - 1) - x_max = np.clip(x_max, x_min + 1, original_width - 1) - - if x_min >= x_max: - continue - - y_min = np.clip(y_min, 0, original_height - 1) - y_max = np.clip(y_max, y_min + 1, original_height - 1) - - if y_min >= y_max: - continue - - annotations += [ - { - "bbox": bbox.tolist(), - "score": scores[box_id], - "landmarks": landmarks[box_id].reshape(-1, 2).tolist(), - } - ] - return annotations - - -ans = infer(*retina_model(torched_image.unsqueeze(0))) -print(ans) +).eval() + +print(image_arr.size()) + +torch.onnx.export( + model=retina_model, + args=example_input, + f="retinaface.onnx", + input_names=["input"], + output_names=["bbox", "confidence", "landmark"], + dynamic_axes={"input": { + 0: "batch_size", + 2: "height", + 3: "width" + }, + "bbox": {0: "batch_size", 1: "length"}, + "confidence": {0: "batch_size", 1: "length"}, + "landmark": {0: "batch_size", 1: "length"}, + }, + +) +with no_grad(): + bbox_regressions, classifications, ldm_regressions = retina_model(image_arr) + print(bbox_regressions) + print(bbox_regressions.data[0][:, :2]) + print(bbox_regressions.size()) + print(classifications.size()) + print(ldm_regressions.size())