diff --git a/ameblo_download.py b/ameblo_download.py index 9e1d7b6..3425d3f 100755 --- a/ameblo_download.py +++ b/ameblo_download.py @@ -1,13 +1,8 @@ -from pprint import pprint -from typing import List, Tuple - -import h5py - import settings import re import sys from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning -from aiohttp import ClientSession, ClientConnectorError, ClientTimeout +from aiohttp import ClientSession, ClientTimeout from itertools import chain from asyncio import run, Semaphore, sleep, Lock from datetime import datetime @@ -15,14 +10,9 @@ from aiofiles import open as a_open import time from os import path, utime, stat, cpu_count, makedirs from tqdm.asyncio import tqdm -from concurrent.futures import as_completed, ProcessPoolExecutor, Future, ThreadPoolExecutor -from ujson import loads +from concurrent.futures import ProcessPoolExecutor, Future from warnings import filterwarnings -from h5py import File, special_dtype, string_dtype -from io import BytesIO -from numpy import void, array import ujson -import orjson import requests PARALLEL_LIMIT = 300 diff --git a/facenet_transfer_learning.py b/facenet_transfer_learning.py index 5edc5b6..8d30c8b 100644 --- a/facenet_transfer_learning.py +++ b/facenet_transfer_learning.py @@ -24,9 +24,11 @@ from torch.cuda import is_available from torch import no_grad, save, Tensor, load, device from datetime import datetime from distutils.util import strtobool +from intel_extension_for_pytorch import optimize CI = bool(strtobool(environ['CI'])) -device = device('cuda' if is_available() else 'cpu') +# device = device('cuda' if is_available() else 'cpu') +device = 'xpu' model_path: str = join(datadir(), 'artifact', 'vggface2_facenet.pth') input_shape: int = 256 @@ -128,6 +130,7 @@ optimizer = Adam(params=[ {'params': model_gpu[1].parameters(), 'lr': 1e-3}, ]) +model, optimizer = optimize(model=model, optimizer=optimizer) scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.9) epochs = 100 diff --git a/get_article_and_comments.py b/get_article_and_comments.py index 436fc51..b6153b1 100644 --- a/get_article_and_comments.py +++ b/get_article_and_comments.py @@ -45,7 +45,7 @@ def parse_article(url: str) -> tuple[str, str, str, str, str]: # print(div) if not div.has_attr('data-src'): entry_body.find('img', class_='PhotoSwipeImage').replaceWith( - '--blog-image-' + str(div["data-image-order"]) + '--\n') + '\n' + '--blog-image-' + str(div["data-image-order"]) + '--') for i in entry_body.find_all('br'): i.replaceWith('\n') data_path = '/'.join([blog_account, str(blog_entry)]) diff --git a/movie_processing/opencv_test.py b/movie_processing/opencv_test.py new file mode 100644 index 0000000..4f53981 --- /dev/null +++ b/movie_processing/opencv_test.py @@ -0,0 +1,11 @@ +from cv2 import VideoCapture, getBuildInformation +from torchvision.models.mobilenetv3 import MobileNetV3 + +print(getBuildInformation()) +sample_video = VideoCapture('/home/tomokazu/PycharmProjects/helloproject-ai/koi_ing.webm') +assert sample_video.isOpened() + +ret = True +while ret: + ret, frame = sample_video.read() + # print(frame) diff --git a/settings.py b/settings.py index e40fddd..949162b 100755 --- a/settings.py +++ b/settings.py @@ -1,6 +1,6 @@ from functools import cache -from os import getcwd -from os.path import join +from os import getcwd, pardir +from os.path import join, abspath, dirname blog_list = ['angerme-ss-shin', 'angerme-amerika', 'angerme-new', 'juicejuice-official', 'tsubaki-factory', 'morningmusume-9ki', 'morningmusume-10ki', 'mm-12ki', 'morningm-13ki', 'morningmusume15ki', @@ -53,7 +53,7 @@ def theme_curator(theme: str, blog_id: str) -> str: @cache def datadir(): - return join(getcwd(), 'data') + return join('/home/tomokazu/PycharmProjects/helloproject-ai/', 'data') request_header = { diff --git a/text_processing/hdf2sql.py b/text_processing/hdf2sql.py new file mode 100644 index 0000000..899e0a4 --- /dev/null +++ b/text_processing/hdf2sql.py @@ -0,0 +1,43 @@ +from h5py import File +from io import BytesIO +from os import getcwd +from os.path import join +from pandas import DataFrame, to_datetime, concat +from sqlite3 import connect +from tqdm import tqdm + +filename = join('/mnt/shm/blog_text.hdf5') + +with open(file=filename, mode='rb') as f: + file_bio = BytesIO(initial_bytes=f.read()) + +article_tables = [] +comment_tables = [] +with (File(name=file_bio, mode='r+') as hdf5): + for blog_group in hdf5.keys(): + print(blog_group) + article_table = DataFrame(columns=['group', 'theme', 'title', 'date', 'article']) + for blog_entry in tqdm(hdf5[blog_group].keys()): + # print(blog_entry) + # print(hdf5[blog_group][blog_entry].keys()) + blog_article = hdf5[blog_group][blog_entry]['article'][()].decode('utf-8') + (_, entry_theme), (_, entry_title), (_, entry_date) = \ + list(hdf5[blog_group][blog_entry]['article'].attrs.items()) + # print(entry_theme, entry_title, entry_date) + article_table.loc[blog_entry] = [blog_group, entry_theme, entry_title, entry_date, blog_article] + comment_table = DataFrame(columns=['blog_id', 'user_id', 'nickname', 'title', 'date', 'article']) + for comment_entry, comment_text in hdf5[blog_group][blog_entry]['comments_dataset'].items(): + comment_article = comment_text[()].decode('utf-8') + comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date = \ + list(comment_text.attrs.values()) + comment_table.loc[comment_entry] = \ + [comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date, comment_article] + comment_tables.append(comment_table.copy(deep=True)) + article_tables.append(article_table.copy(deep=True)) + + # break + +# print(table) +with connect('blog_post.sqlite') as connector: + concat(objs=article_tables).to_sql(name='blog', con=connector) + concat(comment_tables).to_sql(name='comments', con=connector) diff --git a/text_processing/hdf2sql_parallel.py b/text_processing/hdf2sql_parallel.py new file mode 100644 index 0000000..6c79543 --- /dev/null +++ b/text_processing/hdf2sql_parallel.py @@ -0,0 +1,71 @@ +from typing import Any +from h5py import File +from io import BytesIO +from os import cpu_count +from os.path import join, pardir, dirname +from pandas import DataFrame +from sqlite3 import connect +from concurrent.futures import ProcessPoolExecutor as PPE +from multiprocessing import Manager +from sys import path, stdout, stderr +from shutil import move +from time import time + +from tqdm import tqdm + +path.append(pardir) +from settings import datadir + +filename = join('/mnt/shm/blog_text.hdf5') +temporary_dir = '/mnt/shm/' +with open(file=filename, mode='rb') as f: + file_bio = BytesIO(initial_bytes=f.read()) + + +def extract(key: str, file: str) -> tuple[list[list[Any]], list[list[Any]]]: + with File(name=file, mode='r') as hdf: + print(f'start: {key}') + start = time() + article_table = [] + comment_table = [] + for blog_entry in hdf[key].keys(): + blog_article = hdf[key][blog_entry]['article'][()].decode('utf-8') + entry_theme, entry_title, entry_date = list(hdf[key][blog_entry]['article'].attrs.values()) + article_table.append([blog_entry, key, entry_theme, entry_title, entry_date, blog_article]) + for comment_entry, comment_text in hdf[key][blog_entry]['comments_dataset'].items(): + comment_article = comment_text[()].decode('utf-8') + comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date = \ + list(comment_text.attrs.values()) + comment_table.append( + [comment_entry, comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date, + comment_article]) + # break + print(f'end: {key} at {int(time() - start)}s') + return article_table, comment_table + + +results = [] +with PPE(max_workers=cpu_count()) as executor, File(name=file_bio, mode='r') as hdf5: + for order, blog_group in enumerate(hdf5.keys(), start=0): + # print(blog_group) + lock = Manager().Lock() + results.append(executor.submit(extract, blog_group, filename)) + +article_tables = [] +comment_tables = [] +for job in results: + a, b = job.result() + article_tables.extend(a) + comment_tables.extend(b) + +with connect(database=join(temporary_dir, 'tmp.sqlite'), timeout=3600) as connector: + article_dataframe = DataFrame(data=article_tables, columns=['index', 'group', 'theme', 'title', 'date', 'article']) + # article_dataframe.set_index('index') + article_dataframe = article_dataframe.astype({"index":int}) + article_dataframe.to_sql(name='blog', con=connector, if_exists='replace',index=False) + comment_dataframe = DataFrame(data=comment_tables, + columns=['index', 'blog_id', 'user_id', 'nickname', 'title', 'date', 'article']) + comment_dataframe.set_index('index') + comment_dataframe.to_sql(name='comment', con=connector, if_exists='replace') + +move(join(temporary_dir, 'tmp.sqlite'), join(datadir(), 'blog_post.sqlite')) diff --git a/text_processing/sqlite_process.py b/text_processing/sqlite_process.py new file mode 100644 index 0000000..b7ba8f7 --- /dev/null +++ b/text_processing/sqlite_process.py @@ -0,0 +1,52 @@ +from sqlite3 import connect +from shutil import copyfile +from settings import datadir +from os.path import join +from os import getcwd +from numpy import array, tile, int_, rot90, where, ndarray, vectorize, str_ + +# copyfile(src=join(datadir(), 'blog_post.sqlite'), dst=join('/mnt/shm/blog_post.sqlite')) +hash_func = vectorize(hash, otypes=[str]) +with connect(database='/mnt/shm/blog_post.sqlite') as connector: + cursor = connector.cursor() + print(cursor.execute("SELECT t.sql FROM sqlite_master t WHERE name = 'blog'").fetchone()) + if 'article_cleaned' not in cursor.execute("SELECT t.sql FROM sqlite_master t WHERE name = 'blog'").fetchone()[0]: + cursor.execute("ALTER TABLE blog add article_cleaned TEXT") + print(cursor.execute("SELECT t.sql FROM sqlite_master t WHERE name = 'blog'").fetchone()) + + for (theme,) in cursor.execute(f'SELECT DISTINCT theme FROM blog').fetchall(): + print(theme) + blog_contents = cursor.execute( + f'SELECT title,date,article,theme,"index" FROM blog WHERE theme = \'{theme}\' ORDER BY date').fetchall() + # cleaned_list = [None] * (blog_contents.__len__() - 1) + # cleaned_list = [] + for i in range(blog_contents.__len__() - 1): + if blog_contents[i][3] != '八木栞': + pass + a = blog_contents[i] + b = blog_contents[i + 1] + # print(a) + # print(b) + a_list = array(a[2].split('\n'), dtype=object) + b_list = array(b[2].split('\n'), dtype=object) + a_hash_list = hash_func(a_list) + b_hash_list = hash_func(b_list) + # print(a_hash_list.shape) + # print(b_hash_list.shape) + a_ndarray: ndarray = tile(array(object=a_hash_list), reps=(*b_hash_list.shape, 1)) + b_ndarray: ndarray = rot90(tile(array(object=b_hash_list), reps=(*a_hash_list.shape, 1))) + # print(a_ndarray.shape) + # print(b_ndarray.shape) + dup = a_ndarray == b_ndarray + # print(dup) + # print(list(zip(*where(dup)))) + cleaned_text = '\n'.join(a_list[(~dup.any(axis=0))]).replace('\'', '\'\'') + cursor.execute( + f"UPDATE blog SET article_cleaned = \'{cleaned_text}\' WHERE \"index\" = {blog_contents[i][4]}") + # cleaned_list[i] = cleaned_text + # cleaned_list.append(cleaned_text) + print('\t' + blog_contents[i][0]) + # input() + + connector.cursor().close() + connector.commit()