update

2023-10-14 19:39:27 +09:00 · 2023-10-14 19:39:27 +09:00 · cbea8536d8
parent 185646c601
commit cbea8536d8
8 changed files with 187 additions and 17 deletions
--- a/ameblo_download.py
+++ b/ameblo_download.py
@ -1,13 +1,8 @@
-from pprint import pprint
-from typing import List, Tuple
-
-import h5py
-
 import settings
 import re
 import sys
 from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
-from aiohttp import ClientSession, ClientConnectorError, ClientTimeout
+from aiohttp import ClientSession, ClientTimeout
 from itertools import chain
 from asyncio import run, Semaphore, sleep, Lock
 from datetime import datetime
@ -15,14 +10,9 @@ from aiofiles import open as a_open
 import time
 from os import path, utime, stat, cpu_count, makedirs
 from tqdm.asyncio import tqdm
-from concurrent.futures import as_completed, ProcessPoolExecutor, Future, ThreadPoolExecutor
-from ujson import loads
+from concurrent.futures import ProcessPoolExecutor, Future
 from warnings import filterwarnings
-from h5py import File, special_dtype, string_dtype
-from io import BytesIO
-from numpy import void, array
 import ujson
-import orjson
 import requests

 PARALLEL_LIMIT = 300
--- a/facenet_transfer_learning.py
+++ b/facenet_transfer_learning.py
@ -24,9 +24,11 @@ from torch.cuda import is_available
 from torch import no_grad, save, Tensor, load, device
 from datetime import datetime
 from distutils.util import strtobool
+from intel_extension_for_pytorch import optimize

 CI = bool(strtobool(environ['CI']))
-device = device('cuda' if is_available() else 'cpu')
+# device = device('cuda' if is_available() else 'cpu')
+device = 'xpu'

 model_path: str = join(datadir(), 'artifact', 'vggface2_facenet.pth')
 input_shape: int = 256
@ -128,6 +130,7 @@ optimizer = Adam(params=[
    {'params': model_gpu[1].parameters(), 'lr': 1e-3},
 ])

+model, optimizer = optimize(model=model, optimizer=optimizer)
 scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.9)
 epochs = 100

--- a/get_article_and_comments.py
+++ b/get_article_and_comments.py
@ -45,7 +45,7 @@ def parse_article(url: str) -> tuple[str, str, str, str, str]:
        # print(div)
        if not div.has_attr('data-src'):
            entry_body.find('img', class_='PhotoSwipeImage').replaceWith(
-                '--blog-image-' + str(div["data-image-order"]) + '--\n')
+                '\n' + '--blog-image-' + str(div["data-image-order"]) + '--')
    for i in entry_body.find_all('br'):
        i.replaceWith('\n')
    data_path = '/'.join([blog_account, str(blog_entry)])
--- a/movie_processing/opencv_test.py
+++ b/movie_processing/opencv_test.py
@ -0,0 +1,11 @@
+from cv2 import VideoCapture, getBuildInformation
+from torchvision.models.mobilenetv3 import MobileNetV3
+
+print(getBuildInformation())
+sample_video = VideoCapture('/home/tomokazu/PycharmProjects/helloproject-ai/koi_ing.webm')
+assert sample_video.isOpened()
+
+ret = True
+while ret:
+    ret, frame = sample_video.read()
+    # print(frame)
--- a/settings.py
+++ b/settings.py
@ -1,6 +1,6 @@
 from functools import cache
-from os import getcwd
-from os.path import join
+from os import getcwd, pardir
+from os.path import join, abspath, dirname

 blog_list = ['angerme-ss-shin', 'angerme-amerika', 'angerme-new', 'juicejuice-official', 'tsubaki-factory',
             'morningmusume-9ki', 'morningmusume-10ki', 'mm-12ki', 'morningm-13ki', 'morningmusume15ki',
@ -53,7 +53,7 @@ def theme_curator(theme: str, blog_id: str) -> str:

@cache
 def datadir():
-    return join(getcwd(), 'data')
+    return join('/home/tomokazu/PycharmProjects/helloproject-ai/', 'data')


 request_header = {
--- a/text_processing/hdf2sql.py
+++ b/text_processing/hdf2sql.py
@ -0,0 +1,43 @@
+from h5py import File
+from io import BytesIO
+from os import getcwd
+from os.path import join
+from pandas import DataFrame, to_datetime, concat
+from sqlite3 import connect
+from tqdm import tqdm
+
+filename = join('/mnt/shm/blog_text.hdf5')
+
+with open(file=filename, mode='rb') as f:
+    file_bio = BytesIO(initial_bytes=f.read())
+
+article_tables = []
+comment_tables = []
+with (File(name=file_bio, mode='r+') as hdf5):
+    for blog_group in hdf5.keys():
+        print(blog_group)
+        article_table = DataFrame(columns=['group', 'theme', 'title', 'date', 'article'])
+        for blog_entry in tqdm(hdf5[blog_group].keys()):
+            # print(blog_entry)
+            # print(hdf5[blog_group][blog_entry].keys())
+            blog_article = hdf5[blog_group][blog_entry]['article'][()].decode('utf-8')
+            (_, entry_theme), (_, entry_title), (_, entry_date) = \
+                list(hdf5[blog_group][blog_entry]['article'].attrs.items())
+            # print(entry_theme, entry_title, entry_date)
+            article_table.loc[blog_entry] = [blog_group, entry_theme, entry_title, entry_date, blog_article]
+            comment_table = DataFrame(columns=['blog_id', 'user_id', 'nickname', 'title', 'date', 'article'])
+            for comment_entry, comment_text in hdf5[blog_group][blog_entry]['comments_dataset'].items():
+                comment_article = comment_text[()].decode('utf-8')
+                comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date = \
+                    list(comment_text.attrs.values())
+                comment_table.loc[comment_entry] = \
+                    [comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date, comment_article]
+            comment_tables.append(comment_table.copy(deep=True))
+        article_tables.append(article_table.copy(deep=True))
+
+        # break
+
+# print(table)
+with connect('blog_post.sqlite') as connector:
+    concat(objs=article_tables).to_sql(name='blog', con=connector)
+    concat(comment_tables).to_sql(name='comments', con=connector)
--- a/text_processing/hdf2sql_parallel.py
+++ b/text_processing/hdf2sql_parallel.py
@ -0,0 +1,71 @@
+from typing import Any
+from h5py import File
+from io import BytesIO
+from os import cpu_count
+from os.path import join, pardir, dirname
+from pandas import DataFrame
+from sqlite3 import connect
+from concurrent.futures import ProcessPoolExecutor as PPE
+from multiprocessing import Manager
+from sys import path, stdout, stderr
+from shutil import move
+from time import time
+
+from tqdm import tqdm
+
+path.append(pardir)
+from settings import datadir
+
+filename = join('/mnt/shm/blog_text.hdf5')
+temporary_dir = '/mnt/shm/'
+with open(file=filename, mode='rb') as f:
+    file_bio = BytesIO(initial_bytes=f.read())
+
+
+def extract(key: str, file: str) -> tuple[list[list[Any]], list[list[Any]]]:
+    with File(name=file, mode='r') as hdf:
+        print(f'start: {key}')
+        start = time()
+        article_table = []
+        comment_table = []
+        for blog_entry in hdf[key].keys():
+            blog_article = hdf[key][blog_entry]['article'][()].decode('utf-8')
+            entry_theme, entry_title, entry_date = list(hdf[key][blog_entry]['article'].attrs.values())
+            article_table.append([blog_entry, key, entry_theme, entry_title, entry_date, blog_article])
+            for comment_entry, comment_text in hdf[key][blog_entry]['comments_dataset'].items():
+                comment_article = comment_text[()].decode('utf-8')
+                comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date = \
+                    list(comment_text.attrs.values())
+                comment_table.append(
+                    [comment_entry, comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date,
+                     comment_article])
+            # break
+        print(f'end: {key} at {int(time() - start)}s')
+        return article_table, comment_table
+
+
+results = []
+with PPE(max_workers=cpu_count()) as executor, File(name=file_bio, mode='r') as hdf5:
+    for order, blog_group in enumerate(hdf5.keys(), start=0):
+        # print(blog_group)
+        lock = Manager().Lock()
+        results.append(executor.submit(extract, blog_group, filename))
+
+article_tables = []
+comment_tables = []
+for job in results:
+    a, b = job.result()
+    article_tables.extend(a)
+    comment_tables.extend(b)
+
+with connect(database=join(temporary_dir, 'tmp.sqlite'), timeout=3600) as connector:
+    article_dataframe = DataFrame(data=article_tables, columns=['index', 'group', 'theme', 'title', 'date', 'article'])
+    # article_dataframe.set_index('index')
+    article_dataframe = article_dataframe.astype({"index":int})
+    article_dataframe.to_sql(name='blog', con=connector, if_exists='replace',index=False)
+    comment_dataframe = DataFrame(data=comment_tables,
+                                  columns=['index', 'blog_id', 'user_id', 'nickname', 'title', 'date', 'article'])
+    comment_dataframe.set_index('index')
+    comment_dataframe.to_sql(name='comment', con=connector, if_exists='replace')
+
+move(join(temporary_dir, 'tmp.sqlite'), join(datadir(), 'blog_post.sqlite'))
--- a/text_processing/sqlite_process.py
+++ b/text_processing/sqlite_process.py
@ -0,0 +1,52 @@
+from sqlite3 import connect
+from shutil import copyfile
+from settings import datadir
+from os.path import join
+from os import getcwd
+from numpy import array, tile, int_, rot90, where, ndarray, vectorize, str_
+
+# copyfile(src=join(datadir(), 'blog_post.sqlite'), dst=join('/mnt/shm/blog_post.sqlite'))
+hash_func = vectorize(hash, otypes=[str])
+with connect(database='/mnt/shm/blog_post.sqlite') as connector:
+    cursor = connector.cursor()
+    print(cursor.execute("SELECT t.sql FROM sqlite_master t WHERE name = 'blog'").fetchone())
+    if 'article_cleaned' not in cursor.execute("SELECT t.sql FROM sqlite_master t WHERE name = 'blog'").fetchone()[0]:
+        cursor.execute("ALTER TABLE blog add article_cleaned TEXT")
+    print(cursor.execute("SELECT t.sql FROM sqlite_master t WHERE name = 'blog'").fetchone())
+
+    for (theme,) in cursor.execute(f'SELECT DISTINCT theme FROM blog').fetchall():
+        print(theme)
+        blog_contents = cursor.execute(
+            f'SELECT title,date,article,theme,"index" FROM blog WHERE theme = \'{theme}\' ORDER BY date').fetchall()
+        # cleaned_list = [None] * (blog_contents.__len__() - 1)
+        # cleaned_list = []
+        for i in range(blog_contents.__len__() - 1):
+            if blog_contents[i][3] != '八木栞':
+                pass
+            a = blog_contents[i]
+            b = blog_contents[i + 1]
+            # print(a)
+            # print(b)
+            a_list = array(a[2].split('\n'), dtype=object)
+            b_list = array(b[2].split('\n'), dtype=object)
+            a_hash_list = hash_func(a_list)
+            b_hash_list = hash_func(b_list)
+            # print(a_hash_list.shape)
+            # print(b_hash_list.shape)
+            a_ndarray: ndarray = tile(array(object=a_hash_list), reps=(*b_hash_list.shape, 1))
+            b_ndarray: ndarray = rot90(tile(array(object=b_hash_list), reps=(*a_hash_list.shape, 1)))
+            # print(a_ndarray.shape)
+            # print(b_ndarray.shape)
+            dup = a_ndarray == b_ndarray
+            # print(dup)
+            # print(list(zip(*where(dup))))
+            cleaned_text = '\n'.join(a_list[(~dup.any(axis=0))]).replace('\'', '\'\'')
+            cursor.execute(
+                f"UPDATE blog SET article_cleaned = \'{cleaned_text}\' WHERE \"index\" = {blog_contents[i][4]}")
+            # cleaned_list[i] = cleaned_text
+            # cleaned_list.append(cleaned_text)
+            print('\t' + blog_contents[i][0])
+            # input()
+
+    connector.cursor().close()
+    connector.commit()