update
continuous-integration/drone/push Build was killed Details

This commit is contained in:
yayoimizuha 2023-10-14 19:39:27 +09:00
parent 185646c601
commit cbea8536d8
8 changed files with 187 additions and 17 deletions

View File

@ -1,13 +1,8 @@
from pprint import pprint
from typing import List, Tuple
import h5py
import settings
import re
import sys
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from aiohttp import ClientSession, ClientConnectorError, ClientTimeout
from aiohttp import ClientSession, ClientTimeout
from itertools import chain
from asyncio import run, Semaphore, sleep, Lock
from datetime import datetime
@ -15,14 +10,9 @@ from aiofiles import open as a_open
import time
from os import path, utime, stat, cpu_count, makedirs
from tqdm.asyncio import tqdm
from concurrent.futures import as_completed, ProcessPoolExecutor, Future, ThreadPoolExecutor
from ujson import loads
from concurrent.futures import ProcessPoolExecutor, Future
from warnings import filterwarnings
from h5py import File, special_dtype, string_dtype
from io import BytesIO
from numpy import void, array
import ujson
import orjson
import requests
PARALLEL_LIMIT = 300

View File

@ -24,9 +24,11 @@ from torch.cuda import is_available
from torch import no_grad, save, Tensor, load, device
from datetime import datetime
from distutils.util import strtobool
from intel_extension_for_pytorch import optimize
CI = bool(strtobool(environ['CI']))
device = device('cuda' if is_available() else 'cpu')
# device = device('cuda' if is_available() else 'cpu')
device = 'xpu'
model_path: str = join(datadir(), 'artifact', 'vggface2_facenet.pth')
input_shape: int = 256
@ -128,6 +130,7 @@ optimizer = Adam(params=[
{'params': model_gpu[1].parameters(), 'lr': 1e-3},
])
model, optimizer = optimize(model=model, optimizer=optimizer)
scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.9)
epochs = 100

View File

@ -45,7 +45,7 @@ def parse_article(url: str) -> tuple[str, str, str, str, str]:
# print(div)
if not div.has_attr('data-src'):
entry_body.find('img', class_='PhotoSwipeImage').replaceWith(
'--blog-image-' + str(div["data-image-order"]) + '--\n')
'\n' + '--blog-image-' + str(div["data-image-order"]) + '--')
for i in entry_body.find_all('br'):
i.replaceWith('\n')
data_path = '/'.join([blog_account, str(blog_entry)])

View File

@ -0,0 +1,11 @@
from cv2 import VideoCapture, getBuildInformation
from torchvision.models.mobilenetv3 import MobileNetV3
print(getBuildInformation())
sample_video = VideoCapture('/home/tomokazu/PycharmProjects/helloproject-ai/koi_ing.webm')
assert sample_video.isOpened()
ret = True
while ret:
ret, frame = sample_video.read()
# print(frame)

View File

@ -1,6 +1,6 @@
from functools import cache
from os import getcwd
from os.path import join
from os import getcwd, pardir
from os.path import join, abspath, dirname
blog_list = ['angerme-ss-shin', 'angerme-amerika', 'angerme-new', 'juicejuice-official', 'tsubaki-factory',
'morningmusume-9ki', 'morningmusume-10ki', 'mm-12ki', 'morningm-13ki', 'morningmusume15ki',
@ -53,7 +53,7 @@ def theme_curator(theme: str, blog_id: str) -> str:
@cache
def datadir():
return join(getcwd(), 'data')
return join('/home/tomokazu/PycharmProjects/helloproject-ai/', 'data')
request_header = {

View File

@ -0,0 +1,43 @@
from h5py import File
from io import BytesIO
from os import getcwd
from os.path import join
from pandas import DataFrame, to_datetime, concat
from sqlite3 import connect
from tqdm import tqdm
filename = join('/mnt/shm/blog_text.hdf5')
with open(file=filename, mode='rb') as f:
file_bio = BytesIO(initial_bytes=f.read())
article_tables = []
comment_tables = []
with (File(name=file_bio, mode='r+') as hdf5):
for blog_group in hdf5.keys():
print(blog_group)
article_table = DataFrame(columns=['group', 'theme', 'title', 'date', 'article'])
for blog_entry in tqdm(hdf5[blog_group].keys()):
# print(blog_entry)
# print(hdf5[blog_group][blog_entry].keys())
blog_article = hdf5[blog_group][blog_entry]['article'][()].decode('utf-8')
(_, entry_theme), (_, entry_title), (_, entry_date) = \
list(hdf5[blog_group][blog_entry]['article'].attrs.items())
# print(entry_theme, entry_title, entry_date)
article_table.loc[blog_entry] = [blog_group, entry_theme, entry_title, entry_date, blog_article]
comment_table = DataFrame(columns=['blog_id', 'user_id', 'nickname', 'title', 'date', 'article'])
for comment_entry, comment_text in hdf5[blog_group][blog_entry]['comments_dataset'].items():
comment_article = comment_text[()].decode('utf-8')
comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date = \
list(comment_text.attrs.values())
comment_table.loc[comment_entry] = \
[comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date, comment_article]
comment_tables.append(comment_table.copy(deep=True))
article_tables.append(article_table.copy(deep=True))
# break
# print(table)
with connect('blog_post.sqlite') as connector:
concat(objs=article_tables).to_sql(name='blog', con=connector)
concat(comment_tables).to_sql(name='comments', con=connector)

View File

@ -0,0 +1,71 @@
from typing import Any
from h5py import File
from io import BytesIO
from os import cpu_count
from os.path import join, pardir, dirname
from pandas import DataFrame
from sqlite3 import connect
from concurrent.futures import ProcessPoolExecutor as PPE
from multiprocessing import Manager
from sys import path, stdout, stderr
from shutil import move
from time import time
from tqdm import tqdm
path.append(pardir)
from settings import datadir
filename = join('/mnt/shm/blog_text.hdf5')
temporary_dir = '/mnt/shm/'
with open(file=filename, mode='rb') as f:
file_bio = BytesIO(initial_bytes=f.read())
def extract(key: str, file: str) -> tuple[list[list[Any]], list[list[Any]]]:
with File(name=file, mode='r') as hdf:
print(f'start: {key}')
start = time()
article_table = []
comment_table = []
for blog_entry in hdf[key].keys():
blog_article = hdf[key][blog_entry]['article'][()].decode('utf-8')
entry_theme, entry_title, entry_date = list(hdf[key][blog_entry]['article'].attrs.values())
article_table.append([blog_entry, key, entry_theme, entry_title, entry_date, blog_article])
for comment_entry, comment_text in hdf[key][blog_entry]['comments_dataset'].items():
comment_article = comment_text[()].decode('utf-8')
comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date = \
list(comment_text.attrs.values())
comment_table.append(
[comment_entry, comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date,
comment_article])
# break
print(f'end: {key} at {int(time() - start)}s')
return article_table, comment_table
results = []
with PPE(max_workers=cpu_count()) as executor, File(name=file_bio, mode='r') as hdf5:
for order, blog_group in enumerate(hdf5.keys(), start=0):
# print(blog_group)
lock = Manager().Lock()
results.append(executor.submit(extract, blog_group, filename))
article_tables = []
comment_tables = []
for job in results:
a, b = job.result()
article_tables.extend(a)
comment_tables.extend(b)
with connect(database=join(temporary_dir, 'tmp.sqlite'), timeout=3600) as connector:
article_dataframe = DataFrame(data=article_tables, columns=['index', 'group', 'theme', 'title', 'date', 'article'])
# article_dataframe.set_index('index')
article_dataframe = article_dataframe.astype({"index":int})
article_dataframe.to_sql(name='blog', con=connector, if_exists='replace',index=False)
comment_dataframe = DataFrame(data=comment_tables,
columns=['index', 'blog_id', 'user_id', 'nickname', 'title', 'date', 'article'])
comment_dataframe.set_index('index')
comment_dataframe.to_sql(name='comment', con=connector, if_exists='replace')
move(join(temporary_dir, 'tmp.sqlite'), join(datadir(), 'blog_post.sqlite'))

View File

@ -0,0 +1,52 @@
from sqlite3 import connect
from shutil import copyfile
from settings import datadir
from os.path import join
from os import getcwd
from numpy import array, tile, int_, rot90, where, ndarray, vectorize, str_
# copyfile(src=join(datadir(), 'blog_post.sqlite'), dst=join('/mnt/shm/blog_post.sqlite'))
hash_func = vectorize(hash, otypes=[str])
with connect(database='/mnt/shm/blog_post.sqlite') as connector:
cursor = connector.cursor()
print(cursor.execute("SELECT t.sql FROM sqlite_master t WHERE name = 'blog'").fetchone())
if 'article_cleaned' not in cursor.execute("SELECT t.sql FROM sqlite_master t WHERE name = 'blog'").fetchone()[0]:
cursor.execute("ALTER TABLE blog add article_cleaned TEXT")
print(cursor.execute("SELECT t.sql FROM sqlite_master t WHERE name = 'blog'").fetchone())
for (theme,) in cursor.execute(f'SELECT DISTINCT theme FROM blog').fetchall():
print(theme)
blog_contents = cursor.execute(
f'SELECT title,date,article,theme,"index" FROM blog WHERE theme = \'{theme}\' ORDER BY date').fetchall()
# cleaned_list = [None] * (blog_contents.__len__() - 1)
# cleaned_list = []
for i in range(blog_contents.__len__() - 1):
if blog_contents[i][3] != '八木栞':
pass
a = blog_contents[i]
b = blog_contents[i + 1]
# print(a)
# print(b)
a_list = array(a[2].split('\n'), dtype=object)
b_list = array(b[2].split('\n'), dtype=object)
a_hash_list = hash_func(a_list)
b_hash_list = hash_func(b_list)
# print(a_hash_list.shape)
# print(b_hash_list.shape)
a_ndarray: ndarray = tile(array(object=a_hash_list), reps=(*b_hash_list.shape, 1))
b_ndarray: ndarray = rot90(tile(array(object=b_hash_list), reps=(*a_hash_list.shape, 1)))
# print(a_ndarray.shape)
# print(b_ndarray.shape)
dup = a_ndarray == b_ndarray
# print(dup)
# print(list(zip(*where(dup))))
cleaned_text = '\n'.join(a_list[(~dup.any(axis=0))]).replace('\'', '\'\'')
cursor.execute(
f"UPDATE blog SET article_cleaned = \'{cleaned_text}\' WHERE \"index\" = {blog_contents[i][4]}")
# cleaned_list[i] = cleaned_text
# cleaned_list.append(cleaned_text)
print('\t' + blog_contents[i][0])
# input()
connector.cursor().close()
connector.commit()