update

2023-10-14 19:39:27 +09:00 · 2023-10-14 19:39:27 +09:00 · cbea8536d8
parent 185646c601
commit cbea8536d8
8 changed files with 187 additions and 17 deletions
--- a/ameblo_download.py
+++ b/ameblo_download.py
@ -1,13 +1,8 @@
 from pprint import pprint
 from typing import List, Tuple
 import h5py
 import settings
 import re
 import sys
 from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
-from aiohttp import ClientSession, ClientConnectorError, ClientTimeout
+from aiohttp import ClientSession, ClientTimeout
 from itertools import chain
 from asyncio import run, Semaphore, sleep, Lock
 from datetime import datetime
@ -15,14 +10,9 @@ from aiofiles import open as a_open
 import time
 from os import path, utime, stat, cpu_count, makedirs
 from tqdm.asyncio import tqdm
-from concurrent.futures import as_completed, ProcessPoolExecutor, Future, ThreadPoolExecutor
+from concurrent.futures import ProcessPoolExecutor, Future
 from ujson import loads
 from warnings import filterwarnings
 from h5py import File, special_dtype, string_dtype
 from io import BytesIO
 from numpy import void, array
 import ujson
 import orjson
 import requests
 PARALLEL_LIMIT = 300
--- a/facenet_transfer_learning.py
+++ b/facenet_transfer_learning.py
@ -24,9 +24,11 @@ from torch.cuda import is_available
 from torch import no_grad, save, Tensor, load, device
 from datetime import datetime
 from distutils.util import strtobool
 from intel_extension_for_pytorch import optimize
 CI = bool(strtobool(environ['CI']))
-device = device('cuda' if is_available() else 'cpu')
+# device = device('cuda' if is_available() else 'cpu')
 device = 'xpu'
 model_path: str = join(datadir(), 'artifact', 'vggface2_facenet.pth')
 input_shape: int = 256
@ -128,6 +130,7 @@ optimizer = Adam(params=[
    {'params': model_gpu[1].parameters(), 'lr': 1e-3},
 ])
 model, optimizer = optimize(model=model, optimizer=optimizer)
 scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.9)
 epochs = 100
--- a/get_article_and_comments.py
+++ b/get_article_and_comments.py
@ -45,7 +45,7 @@ def parse_article(url: str) -> tuple[str, str, str, str, str]:
        # print(div)
        if not div.has_attr('data-src'):
            entry_body.find('img', class_='PhotoSwipeImage').replaceWith(
-                '--blog-image-' + str(div["data-image-order"]) + '--\n')
+                '\n' + '--blog-image-' + str(div["data-image-order"]) + '--')
    for i in entry_body.find_all('br'):
        i.replaceWith('\n')
    data_path = '/'.join([blog_account, str(blog_entry)])
--- a/movie_processing/opencv_test.py
+++ b/movie_processing/opencv_test.py
@ -0,0 +1,11 @@
 from cv2 import VideoCapture, getBuildInformation
 from torchvision.models.mobilenetv3 import MobileNetV3
 # Print the build configuration of the installed OpenCV.
 print(getBuildInformation())
 # Open the sample clip and stop immediately if it could not be opened.
 sample_video = VideoCapture('/home/tomokazu/PycharmProjects/helloproject-ai/koi_ing.webm')
 assert sample_video.isOpened()
 # Keep reading frames until the first value returned by read() is falsy.
 while True:
    ret, frame = sample_video.read()
    if not ret:
        break
--- a/settings.py
+++ b/settings.py
@ -1,6 +1,6 @@
 from functools import cache
-from os import getcwd
+from os import getcwd, pardir
-from os.path import join
+from os.path import join, abspath, dirname
 blog_list = ['angerme-ss-shin', 'angerme-amerika', 'angerme-new', 'juicejuice-official', 'tsubaki-factory',
             'morningmusume-9ki', 'morningmusume-10ki', 'mm-12ki', 'morningm-13ki', 'morningmusume15ki',
@ -53,7 +53,7 @@ def theme_curator(theme: str, blog_id: str) -> str:
@cache
 def datadir():
-    return join(getcwd(), 'data')
+    return join('/home/tomokazu/PycharmProjects/helloproject-ai/', 'data')
 request_header = {
--- a/text_processing/hdf2sql.py
+++ b/text_processing/hdf2sql.py
@ -0,0 +1,43 @@
 from h5py import File
 from io import BytesIO
 from os import getcwd
 from os.path import join
 from pandas import DataFrame, to_datetime, concat
 from sqlite3 import connect
 from tqdm import tqdm
 # Convert the blog HDF5 dump into two SQLite tables: 'blog' (articles) and 'comments'.
 # join() is called with a single argument, so this is just the literal path.
 filename = join('/mnt/shm/blog_text.hdf5')
 # Read the whole file into memory; the HDF5 handle below is opened on this buffer, not on disk.
 with open(file=filename, mode='rb') as f:
    file_bio = BytesIO(initial_bytes=f.read())
 # One DataFrame per top-level group (articles) and one per blog entry (comments) are collected here.
 article_tables = []
 comment_tables = []
 # NOTE(review): opened in 'r+' although nothing visible in this script writes to the file.
 with (File(name=file_bio, mode='r+') as hdf5):
    for blog_group in hdf5.keys():
        print(blog_group)
        # Rows are added one at a time below, labelled by the blog entry key.
        article_table = DataFrame(columns=['group', 'theme', 'title', 'date', 'article'])
        for blog_entry in tqdm(hdf5[blog_group].keys()):
            # print(blog_entry)
            # print(hdf5[blog_group][blog_entry].keys())
            # The 'article' dataset holds the body as UTF-8 bytes.
            blog_article = hdf5[blog_group][blog_entry]['article'][()].decode('utf-8')
            # Requires exactly three attributes on the dataset; the names are discarded.
            # NOTE(review): assumes they iterate as theme, title, date -- confirm against the writer.
            (_, entry_theme), (_, entry_title), (_, entry_date) = \
                list(hdf5[blog_group][blog_entry]['article'].attrs.items())
            # print(entry_theme, entry_title, entry_date)
            article_table.loc[blog_entry] = [blog_group, entry_theme, entry_title, entry_date, blog_article]
            # A fresh comment table per blog entry, labelled by the comment key.
            comment_table = DataFrame(columns=['blog_id', 'user_id', 'nickname', 'title', 'date', 'article'])
            for comment_entry, comment_text in hdf5[blog_group][blog_entry]['comments_dataset'].items():
                comment_article = comment_text[()].decode('utf-8')
                # Requires exactly five attributes per comment dataset.
                # NOTE(review): assumes the order blog_id, user_id, nickname, title, date -- confirm.
                comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date = \
                    list(comment_text.attrs.values())
                comment_table.loc[comment_entry] = \
                    [comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date, comment_article]
            comment_tables.append(comment_table.copy(deep=True))
        article_tables.append(article_table.copy(deep=True))
        # break
 # print(table)
 # The database path is relative, so the file is created in the current working directory.
 # NOTE(review): to_sql() is called without if_exists, so pandas' default applies -- a second run
 # against an existing database is expected to fail; confirm this is intended.
 with connect('blog_post.sqlite') as connector:
    concat(objs=article_tables).to_sql(name='blog', con=connector)
    concat(comment_tables).to_sql(name='comments', con=connector)
--- a/text_processing/hdf2sql_parallel.py
+++ b/text_processing/hdf2sql_parallel.py
@ -0,0 +1,71 @@
 from typing import Any
 from h5py import File
 from io import BytesIO
 from os import cpu_count
 from os.path import join, pardir, dirname
 from pandas import DataFrame
 from sqlite3 import connect
 from concurrent.futures import ProcessPoolExecutor as PPE
 from multiprocessing import Manager
 from sys import path, stdout, stderr
 from shutil import move
 from time import time
 from tqdm import tqdm
 path.append(pardir)
 from settings import datadir
 # Scratch directory for the SQLite output and path of the HDF5 dump to convert.
 temporary_dir = '/mnt/shm/'
 filename = join('/mnt/shm/blog_text.hdf5')
 # Load the complete HDF5 file into an in-memory buffer.
 with open(filename, 'rb') as f:
    file_bio = BytesIO(f.read())
 def extract(key: str, file: str) -> tuple[list[list[Any]], list[list[Any]]]:
    """Flatten one top-level HDF5 group into article rows and comment rows.

    Opens ``file`` read-only, visits every entry below ``hdf[key]`` and reads its
    ``article`` dataset plus every dataset found under ``comments_dataset``.
    Progress is reported with a ``start``/``end`` line on stdout.

    :param key: name of the top-level group to read.
    :param file: value handed to ``File(name=...)``.
    :return: ``(article_rows, comment_rows)``. Each article row is
        ``[entry key, key, <3 attribute values>, decoded text]``; each comment row is
        ``[comment key, <5 attribute values>, decoded text]``.
    :raises ValueError: if a dataset does not carry exactly the expected number
        of attributes (3 for articles, 5 for comments).
    """
    with File(name=file, mode='r') as hdf:
        print(f'start: {key}')
        start = time()
        article_rows = []
        comment_rows = []
        group = hdf[key]
        for entry_name in group:
            entry = group[entry_name]
            body = entry['article'][()].decode('utf-8')
            # NOTE(review): relies on the attributes iterating as theme, title, date -- confirm.
            theme, title, date = entry['article'].attrs.values()
            article_rows.append([entry_name, key, theme, title, date, body])
            for comment_name, comment in entry['comments_dataset'].items():
                comment_body = comment[()].decode('utf-8')
                # NOTE(review): relies on the order blog_id, user_id, nickname, title, date -- confirm.
                blog_id, user_id, nickname, comment_title, comment_date = comment.attrs.values()
                comment_rows.append(
                    [comment_name, blog_id, user_id, nickname, comment_title, comment_date, comment_body])
        print(f'end: {key} at {int(time() - start)}s')
        return article_rows, comment_rows
 # Fan out one extract() job per top-level HDF5 group. The in-memory copy is only used here to
 # list the group names; every worker re-opens the file by path. No lock is created: the workers
 # only read, and spawning a Manager per group started a helper process that was never used.
 with PPE(max_workers=cpu_count()) as executor, File(name=file_bio, mode='r') as hdf5:
    results = [executor.submit(extract, blog_group, filename) for blog_group in hdf5.keys()]
 # Gather the per-group rows in submission order; result() re-raises any worker exception.
 article_tables = []
 comment_tables = []
 for job in results:
    article_rows, comment_rows = job.result()
    article_tables.extend(article_rows)
    comment_tables.extend(comment_rows)
 article_dataframe = DataFrame(data=article_tables, columns=['index', 'group', 'theme', 'title', 'date', 'article'])
 article_dataframe = article_dataframe.astype({"index": int})
 comment_dataframe = DataFrame(data=comment_tables,
                               columns=['index', 'blog_id', 'user_id', 'nickname', 'title', 'date', 'article'])
 # set_index() returns a new frame; the result must be kept, otherwise the frame's default
 # positional index is written to the table as an extra column next to 'index'.
 comment_dataframe = comment_dataframe.set_index('index')
 # Build the database in the scratch directory first, then move the finished file into place.
 connector = connect(database=join(temporary_dir, 'tmp.sqlite'), timeout=3600)
 try:
    # The connection's context manager only commits / rolls back; it does not close.
    with connector:
        article_dataframe.to_sql(name='blog', con=connector, if_exists='replace', index=False)
        comment_dataframe.to_sql(name='comment', con=connector, if_exists='replace')
 finally:
    # Close explicitly so the file is no longer held open when it is moved.
    connector.close()
 move(join(temporary_dir, 'tmp.sqlite'), join(datadir(), 'blog_post.sqlite'))
--- a/text_processing/sqlite_process.py
+++ b/text_processing/sqlite_process.py
@ -0,0 +1,52 @@
 from sqlite3 import connect
 from shutil import copyfile
 from settings import datadir
 from os.path import join
 from os import getcwd
 from numpy import array, tile, int_, rot90, where, ndarray, vectorize, str_
 # copyfile(src=join(datadir(), 'blog_post.sqlite'), dst=join('/mnt/shm/blog_post.sqlite'))
 # Fill blog.article_cleaned: for every theme, walk its articles in date order and store each
 # article with all lines removed that also occur in the next article of the same theme.
 # The last article of a theme has no successor and is left untouched.
 with connect(database='/mnt/shm/blog_post.sqlite') as connector:
    cursor = connector.cursor()
    schema_query = "SELECT t.sql FROM sqlite_master t WHERE name = 'blog'"
    print(cursor.execute(schema_query).fetchone())
    # Add the target column only once so the script can be re-run on the same database.
    if 'article_cleaned' not in cursor.execute(schema_query).fetchone()[0]:
        cursor.execute("ALTER TABLE blog add article_cleaned TEXT")
    print(cursor.execute(schema_query).fetchone())
    for (theme,) in cursor.execute('SELECT DISTINCT theme FROM blog').fetchall():
        print(theme)
        # Bound parameters instead of string formatting: themes and article text may contain quotes.
        blog_contents = cursor.execute(
            'SELECT title,date,article,theme,"index" FROM blog WHERE theme = ? ORDER BY date',
            (theme,)).fetchall()
        for current, following in zip(blog_contents, blog_contents[1:]):
            # Exact set lookup replaces the pairwise matrix of stringified hashes: the same lines
            # are kept, in the same order, without O(n*m) memory or any hash-collision risk.
            following_lines = set(following[2].split('\n'))
            cleaned_text = '\n'.join(
                line for line in current[2].split('\n') if line not in following_lines)
            cursor.execute(
                'UPDATE blog SET article_cleaned = ? WHERE "index" = ?',
                (cleaned_text, current[4]))
            print('\t' + current[0])
    # Close the cursor that was actually used (previously a brand-new cursor was closed instead).
    cursor.close()
    connector.commit()