commit e5acdf8f8de17afb8fa332d3ac75572361437bc5 Author: yayoimizuha Date: Sun Apr 16 11:25:23 2023 +0900 Initial Commit diff --git a/.face_crop_dbface.py b/.face_crop_dbface.py new file mode 100755 index 0000000..f6b06e7 --- /dev/null +++ b/.face_crop_dbface.py @@ -0,0 +1,334 @@ +import torch.cuda +from numpy import ndarray +import numpy as np +from DBFace_without_OpenCV import DBFace +import settings +from os import makedirs, listdir, stat, utime +from os.path import join, exists +from tqdm import tqdm +from PIL import Image, ImageDraw +from numpy import array, arctan2, pi, zeros, uint8, float32 +from aiofiles import open as a_open +from asyncio import gather, run +from multiprocessing import Queue, Process, get_start_method, set_start_method +from time import time, sleep +from io import BytesIO +from math import ceil, sqrt +from torch import from_numpy, cuda, Tensor, inference_mode, nn +import atexit + +face_dir = join(settings.datadir(), 'face_cropped') +blog_images = join(settings.datadir(), 'blog_images') + +if not exists(face_dir): + makedirs(face_dir) + + +def truncate(landmark: list[tuple[float]]) -> tuple[tuple[int, int], float]: + left_eye, right_eye, nose, left_mouth, right_mouth = landmark + center_x = sum((left_eye[0], right_eye[0], left_mouth[0], right_mouth[0])) / 4 + center_y = sum((left_eye[1], right_eye[1], left_mouth[1], right_mouth[1])) / 4 + eye_center = (right_eye[0] + left_eye[0]) / 2, (right_eye[1] + left_eye[1]) / 2 + mouth_center = (right_mouth[0] + left_mouth[0]) / 2, (right_mouth[1] + left_mouth[1]) / 2 + return (int(center_x), int(center_y)), arctan2(eye_center[0] - mouth_center[0], mouth_center[1] - eye_center[1]) + + +def load_image(basedir: str, queue: Queue) -> None: + def list_up(): + for name in listdir(basedir): + for image_file in listdir(join(basedir, name)): + yield name, image_file + + async def single_read(path: tuple[str, str]): + async with a_open(join(basedir, *path), mode='rb') as f: + return await f.read(), path + + async def parallel_read(paths: list[tuple[str, str]]): + return await gather(*[single_read(path) for path in paths]) + + file_list = [i for i in list_up()] + bar = tqdm(total=file_list.__len__()) + for i in range(0, file_list.__len__(), 20): + + while queue.qsize() > 300: + sleep(1e-3) + + chunk = file_list[i:i + 20] + + img_bins = run(parallel_read(chunk)) + for img_bin, p in img_bins: + queue.put((Image.open(BytesIO(img_bin)), p)) + bar.update(1) + return + + +def pre_process(q1: Queue, q2: Queue): + mean = [0.408, 0.447, 0.47] + std = [0.289, 0.274, 0.278] + while True: + while q2.qsize() > 4: + sleep(1e-4) + image, path = q1.get() + width, height = image.size + if width * height > 400_0000: + image = image.resize(size=(width // 2, height // 2)) + # print(path, image.size) + image = image.crop((0, 0, ceil(width / 32) * 32, ceil(height / 32) * 32)) # padding + img_arr = array(image) + img_arr = ((img_arr / 255.0 - mean) / std).astype(float32).transpose(2, 0, 1) + torch_image = from_numpy(img_arr)[None] + q2.put((torch_image.cuda(), path)) + pass + + +def predict(q1: Queue, q2: Queue): + model_path = '/home/tomokazu/PycharmProjects/helloproject-ai/DBFace_without_OpenCV/model/dbface.pth' + db_face = DBFace() + db_face.eval() + db_face.cuda() + db_face.load(model_path) + i = 0 + start = time() + while True: + # if i % 5000 == 0: + # cuda.empty_cache() + # torch_image:Tensor=torch_image + # print(i, path, torch_image.size()) + i += 1 + try: + torch_image, path = q1.get() + with inference_mode(): + q2.put((db_face(torch_image), path)) + except Exception as e: + print(e) + del db_face + cuda.empty_cache() + db_face = DBFace() + db_face.eval() + db_face.cuda() + db_face.load(model_path) + + +def exp(v): + if isinstance(v, tuple) or isinstance(v, list): + return [exp(item) for item in v] + elif isinstance(v, ndarray): + return np.array([exp(item) for item in v], v.dtype) + + gate = 1 + base = np.exp(1) + if abs(v) < gate: + return v * base + + if v > 0: + return np.exp(v) + else: + return -np.exp(-v) + + +def nms(objs, iou=0.5): + if objs is None or len(objs) <= 1: + return objs + + objs = sorted(objs, key=lambda obj: obj.score, reverse=True) + keep = [] + flags = [0] * len(objs) + for index, obj in enumerate(objs): + + if flags[index] != 0: + continue + + keep.append(obj) + for j in range(index + 1, len(objs)): + if flags[j] == 0 and obj.iou(objs[j]) > iou: + flags[j] = 1 + return keep + + +class BBox: + + def __init__(self, label, xyrb, score=0, landmark=None, rotate=False): + self.label = label + self.score = score + self.landmark = landmark + self.x, self.y, self.r, self.b = xyrb + self.rotate = rotate + + minx = min(self.x, self.r) + maxx = max(self.x, self.r) + miny = min(self.y, self.b) + maxy = max(self.y, self.b) + self.x, self.y, self.r, self.b = minx, miny, maxx, maxy + + def __repr__(self): + landmark_formated = ",".join( + [str(item[:2]) for item in self.landmark]) if self.landmark is not None else "empty" + return f"(BBox[{self.label}]: x={self.x:.2f}, y={self.y:.2f}, r={self.r:.2f}, " + \ + f"b={self.b:.2f}, width={self.width:.2f}, height={self.height:.2f}, landmark={landmark_formated})" + + @property + def width(self): + return self.r - self.x + 1 + + @property + def height(self): + return self.b - self.y + 1 + + @property + def area(self): + return self.width * self.height + + @property + def haslandmark(self): + return self.landmark is not None + + @property + def xxxxxyyyyy_cat_landmark(self): + x, y = zip(*self.landmark) + return x + y + + @property + def box(self): + return [self.x, self.y, self.r, self.b] + + @box.setter + def box(self, newvalue): + self.x, self.y, self.r, self.b = newvalue + + @property + def xywh(self): + return [self.x, self.y, self.width, self.height] + + @property + def center(self): + return [(self.x + self.r) * 0.5, (self.y + self.b) * 0.5] + + # return cx, cy, cx.diff, cy.diff + def safe_scale_center_and_diff(self, scale, limit_x, limit_y): + cx = clip_value((self.x + self.r) * 0.5 * scale, limit_x - 1) + cy = clip_value((self.y + self.b) * 0.5 * scale, limit_y - 1) + return [int(cx), int(cy), cx - int(cx), cy - int(cy)] + + def safe_scale_center(self, scale, limit_x, limit_y): + cx = int(clip_value((self.x + self.r) * 0.5 * scale, limit_x - 1)) + cy = int(clip_value((self.y + self.b) * 0.5 * scale, limit_y - 1)) + return [cx, cy] + + def clip(self, width, height): + self.x = clip_value(self.x, width - 1) + self.y = clip_value(self.y, height - 1) + self.r = clip_value(self.r, width - 1) + self.b = clip_value(self.b, height - 1) + return self + + def iou(self, other): + return computeIOU(self.box, other.box) + + +def computeIOU(rec1, rec2): + cx1, cy1, cx2, cy2 = rec1 + gx1, gy1, gx2, gy2 = rec2 + S_rec1 = (cx2 - cx1 + 1) * (cy2 - cy1 + 1) + S_rec2 = (gx2 - gx1 + 1) * (gy2 - gy1 + 1) + x1 = max(cx1, gx1) + y1 = max(cy1, gy1) + x2 = min(cx2, gx2) + y2 = min(cy2, gy2) + + w = max(0, x2 - x1 + 1) + h = max(0, y2 - y1 + 1) + area = w * h + iou = area / (S_rec1 + S_rec2 - area) + return iou + + +def clip_value(value, high, low=0): + return max(min(value, high), low) + + +def post_process(queue: Queue, threshold: float = 0.4, nms_iou: float = 0.5): + while True: + tensor, path = queue.get() + hm, box, landmark = tensor + del tensor + name, file = path + hm_pool = nn.functional.max_pool2d(hm, 3, 1, 1) + t = ((hm == hm_pool).float() * hm).view(1, -1).cpu() + if t.size()[1] < 1000: + continue + scores, indices = t.topk(1000) + hm_height, hm_width = hm.shape[2:] + del hm + scores = scores.squeeze() + indices = indices.squeeze() + ys = list((indices / hm_width).int().data.numpy()) + xs = list((indices % hm_width).int().data.numpy()) + scores = list(scores.data.numpy()) + box = box.cpu().squeeze().data.numpy() + landmark = landmark.cpu().squeeze().data.numpy() + + stride = 4 + objs = [] + for cx, cy, score in zip(xs, ys, scores): + if score < threshold: + break + + x, y, r, b = box[:, cy, cx] + xyrb = (array([cx, cy, cx, cy]) + [-x, -y, r, b]) * stride + x5y5 = landmark[:, cy, cx] + x5y5 = (exp(x5y5 * 4) + ([cx] * 5 + [cy] * 5)) * stride + box_landmark = list(zip(x5y5[:5], x5y5[5:])) + objs.append(BBox(0, xyrb=xyrb, score=score, landmark=box_landmark)) + predicted = nms(objs, iou=nms_iou) + image = Image.open(join(blog_images, *path)) + + width, height = image.size + if width * height > 400_0000: + image = image.resize(size=(width // 2, height // 2)) + + for order, face in enumerate(predicted): + trans = truncate(face.landmark) + rotated = image.rotate(angle=trans[1] * 360 / (2 * pi), center=trans[0]) + image_size = max(face.width, face.height) * sqrt(2) // 2 + if image_size < 100: + continue + cropped = rotated.crop((trans[0][0] - image_size, trans[0][1] - image_size, trans[0][0] + image_size, + trans[0][1] + image_size)) + if not exists(join(face_dir, name)): + makedirs(join(face_dir, name), exist_ok=True) + saved_path = join(face_dir, name, file.replace('.jpg', '-' + str(order + 1) + '.jpg')) + cropped.save(saved_path) + utime(path=saved_path, times=(stat(join(blog_images, *path)).st_atime, + stat(join(blog_images, *path)).st_mtime)) + + +if __name__ == '__main__': + if get_start_method() == 'fork': + set_start_method('spawn', force=True) + + try: + Load_Q, PreProcess_Q, Predict_Q, PostProcess_Q = (Queue() for i in range(4)) + Load_Processes = [Process(target=load_image, args=(blog_images, Load_Q)) + for _ in range(settings.FaceCropProcesses.load)] + PreProcesses = [Process(target=pre_process, args=(Load_Q, PreProcess_Q)) + for _ in range(settings.FaceCropProcesses.pre_process)] + Predict_Process = [Process(target=predict, args=(PreProcess_Q, Predict_Q)) + for _ in range(settings.FaceCropProcesses.predict)] + PostProcesses = [Process(target=post_process, args=(Predict_Q,)) + for _ in range(settings.FaceCropProcesses.post_process)] + [p.start() for p in Load_Processes] + [p.start() for p in PreProcesses] + [p.start() for p in Predict_Process] + [p.start() for p in PostProcesses] + while True: + sleep(5) + # print(Load_Q.qsize(), PreProcess_Q.qsize(), Predict_Q.qsize(), PostProcess_Q.qsize()) + if sum((Load_Q.qsize(), PreProcess_Q.qsize(), Predict_Q.qsize(), PostProcess_Q.qsize())) == 0: + raise KeyboardInterrupt + + except KeyboardInterrupt as e: + print(e) + [p.terminate() for p in Load_Processes] + [p.terminate() for p in PreProcesses] + [p.terminate() for p in Predict_Process] + [p.terminate() for p in PostProcesses] diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1371f10 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +DBFace* +*.jpg \ No newline at end of file diff --git a/NearestNeighbors.py b/NearestNeighbors.py new file mode 100755 index 0000000..9fa8b07 --- /dev/null +++ b/NearestNeighbors.py @@ -0,0 +1,70 @@ +from shutil import copyfile +from time import sleep + +from insightface.app import FaceAnalysis +from sklearn.neighbors import NearestNeighbors +from json import load +from os import listdir, makedirs +from os.path import join, isfile, isdir, basename, exists +from settings import datadir +from PIL import Image +from numpy import array, sqrt + +with open(file=join(datadir(), 'sample_emb.json'), mode='r') as f: + sample_dict: dict = load(f) + +sample_key = [basename(s).split('=')[0] for s in sample_dict.keys()] +sample_emb = list(sample_dict.values()) +del sample_dict + +neighbors = NearestNeighbors(n_neighbors=15, n_jobs=-1, metric='cosine') +# print(sample_emb[0]) +neighbors.fit(sample_emb) + +makedirs(join(datadir(), 'NN_predict'), exist_ok=True) + +face_analysis = FaceAnalysis(allowed_modules=['recognition', 'detection']) +face_analysis.prepare(ctx_id=0, det_size=(160, 160)) + +for name in listdir(join(datadir(), 'face_cropped')): + for file in listdir(join(datadir(), 'face_cropped', name)): + im_path = join(datadir(), 'face_cropped', name, file) + if not isfile(im_path): + continue + image = array(Image.open(im_path))[:, :, [2, 1, 0]] + emb = face_analysis.get(image) + if emb.__len__() == 0: + continue + distances, nears = neighbors.kneighbors([emb[0].embedding], return_distance=True) + # print(nears) + # print(distances) + scores = dict() + for dist, key_id in zip(distances[0], nears[0]): + if key_id != 0.0: + if not sample_key[key_id] in scores.keys(): + scores[sample_key[key_id]] = 0.0 + scores[sample_key[key_id]] += 1 / sqrt(dist) + # print(scores) + print(im_path) + val_sum = sum(scores.values()) + sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True) + for key, reliability in sorted_scores: + print(f'{key:<7}\t{(reliability * 100 / val_sum):05.2f}%') + + if sorted(distances[0], reverse=False)[0] < 0.4: + prediction = sorted_scores[0][0] + elif sorted_scores[0][1] / val_sum > 0.6: + prediction = sorted_scores[0][0] + else: + prediction = 'others' + print(prediction) # , sorted(distances[0], reverse=False)) + # print(sorted_scores) + # print(prediction) + + if not exists(join(datadir(), 'NN_predict', prediction)): + makedirs(join(datadir(), 'NN_predict', prediction)) + copyfile(im_path, join(datadir(), 'NN_predict', prediction, basename(im_path))) + + print('\n\n') + # sleep(1) + # exit() diff --git a/ameblo_download.py b/ameblo_download.py new file mode 100755 index 0000000..93dbfb4 --- /dev/null +++ b/ameblo_download.py @@ -0,0 +1,174 @@ +import settings +import re +import sys +from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning +from aiohttp import ClientSession, ClientConnectorError, ClientTimeout +from itertools import chain +from asyncio import run, Semaphore, sleep +from datetime import datetime +from aiofiles import open +from os import path, utime, stat, cpu_count, makedirs +from tqdm.asyncio import tqdm +from concurrent.futures import as_completed, ProcessPoolExecutor, Future +from ujson import loads +from warnings import filterwarnings + +PARALLEL_LIMIT = 300 + +makedirs(path.join(settings.datadir(), 'blog_images'), exist_ok=True) +makedirs(path.join(settings.datadir(), 'blog_text'), exist_ok=True) + +filterwarnings('ignore', category=MarkupResemblesLocatorWarning, module='bs4') + + +async def run_each(name: str) -> None: + sem: Semaphore = Semaphore(PARALLEL_LIMIT) + session: ClientSession = ClientSession(trust_env=True, headers=settings.request_header, + timeout=ClientTimeout(total=10 * 60)) + + list_pages_count = await parse_list_pages_count(name) + + print(name, list_pages_count) + + url_lists = await tqdm.gather(*[parse_list_page(name, i, sem, session) for i in range(1, list_pages_count + 1)], + desc=name) + + url_list = list(chain.from_iterable(url_lists)) + + for url in url_list: + if 'html' not in url: + print(url) + + executor = ProcessPoolExecutor(max_workers=cpu_count()) + futures = await tqdm.gather(*[parse_blog_post(url, sem, session, executor) for url in url_list], desc='scan blog') + images_list = list() + for future in tqdm(as_completed(futures), desc='waiting processing ' + name, total=len(futures)): + images_list.append(future.result()) + executor.shutdown() + image_link_package = list(chain.from_iterable(images_list)) + + await tqdm.gather( + *[download_image(filename, url, date, sem, session) for filename, url, date in image_link_package], + desc='downloading images') + + await session.close() + + +async def parse_list_pages_count(blog_name: str) -> int: + async with ClientSession(trust_env=True, headers=settings.request_header) as session: + async with session.get(f'https://ameblo.jp/{blog_name}/entrylist.html') as resp: + resp_html = await resp.text() + json_obj = loads(re.findall(r'