update
continuous-integration/drone/push Build was killed
Details
continuous-integration/drone/push Build was killed
Details
This commit is contained in:
parent
185646c601
commit
cbea8536d8
|
|
@ -1,13 +1,8 @@
|
||||||
from pprint import pprint
|
|
||||||
from typing import List, Tuple
|
|
||||||
|
|
||||||
import h5py
|
|
||||||
|
|
||||||
import settings
|
import settings
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
|
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
|
||||||
from aiohttp import ClientSession, ClientConnectorError, ClientTimeout
|
from aiohttp import ClientSession, ClientTimeout
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from asyncio import run, Semaphore, sleep, Lock
|
from asyncio import run, Semaphore, sleep, Lock
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
@ -15,14 +10,9 @@ from aiofiles import open as a_open
|
||||||
import time
|
import time
|
||||||
from os import path, utime, stat, cpu_count, makedirs
|
from os import path, utime, stat, cpu_count, makedirs
|
||||||
from tqdm.asyncio import tqdm
|
from tqdm.asyncio import tqdm
|
||||||
from concurrent.futures import as_completed, ProcessPoolExecutor, Future, ThreadPoolExecutor
|
from concurrent.futures import ProcessPoolExecutor, Future
|
||||||
from ujson import loads
|
|
||||||
from warnings import filterwarnings
|
from warnings import filterwarnings
|
||||||
from h5py import File, special_dtype, string_dtype
|
|
||||||
from io import BytesIO
|
|
||||||
from numpy import void, array
|
|
||||||
import ujson
|
import ujson
|
||||||
import orjson
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
PARALLEL_LIMIT = 300
|
PARALLEL_LIMIT = 300
|
||||||
|
|
|
||||||
|
|
@ -24,9 +24,11 @@ from torch.cuda import is_available
|
||||||
from torch import no_grad, save, Tensor, load, device
|
from torch import no_grad, save, Tensor, load, device
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from distutils.util import strtobool
|
from distutils.util import strtobool
|
||||||
|
from intel_extension_for_pytorch import optimize
|
||||||
|
|
||||||
CI = bool(strtobool(environ['CI']))
|
CI = bool(strtobool(environ['CI']))
|
||||||
device = device('cuda' if is_available() else 'cpu')
|
# device = device('cuda' if is_available() else 'cpu')
|
||||||
|
device = 'xpu'
|
||||||
|
|
||||||
model_path: str = join(datadir(), 'artifact', 'vggface2_facenet.pth')
|
model_path: str = join(datadir(), 'artifact', 'vggface2_facenet.pth')
|
||||||
input_shape: int = 256
|
input_shape: int = 256
|
||||||
|
|
@ -128,6 +130,7 @@ optimizer = Adam(params=[
|
||||||
{'params': model_gpu[1].parameters(), 'lr': 1e-3},
|
{'params': model_gpu[1].parameters(), 'lr': 1e-3},
|
||||||
])
|
])
|
||||||
|
|
||||||
|
model, optimizer = optimize(model=model, optimizer=optimizer)
|
||||||
scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.9)
|
scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.9)
|
||||||
epochs = 100
|
epochs = 100
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -45,7 +45,7 @@ def parse_article(url: str) -> tuple[str, str, str, str, str]:
|
||||||
# print(div)
|
# print(div)
|
||||||
if not div.has_attr('data-src'):
|
if not div.has_attr('data-src'):
|
||||||
entry_body.find('img', class_='PhotoSwipeImage').replaceWith(
|
entry_body.find('img', class_='PhotoSwipeImage').replaceWith(
|
||||||
'--blog-image-' + str(div["data-image-order"]) + '--\n')
|
'\n' + '--blog-image-' + str(div["data-image-order"]) + '--')
|
||||||
for i in entry_body.find_all('br'):
|
for i in entry_body.find_all('br'):
|
||||||
i.replaceWith('\n')
|
i.replaceWith('\n')
|
||||||
data_path = '/'.join([blog_account, str(blog_entry)])
|
data_path = '/'.join([blog_account, str(blog_entry)])
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,11 @@
|
||||||
|
from cv2 import VideoCapture, getBuildInformation
|
||||||
|
from torchvision.models.mobilenetv3 import MobileNetV3
|
||||||
|
|
||||||
|
print(getBuildInformation())
|
||||||
|
sample_video = VideoCapture('/home/tomokazu/PycharmProjects/helloproject-ai/koi_ing.webm')
|
||||||
|
assert sample_video.isOpened()
|
||||||
|
|
||||||
|
ret = True
|
||||||
|
while ret:
|
||||||
|
ret, frame = sample_video.read()
|
||||||
|
# print(frame)
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
from functools import cache
|
from functools import cache
|
||||||
from os import getcwd
|
from os import getcwd, pardir
|
||||||
from os.path import join
|
from os.path import join, abspath, dirname
|
||||||
|
|
||||||
blog_list = ['angerme-ss-shin', 'angerme-amerika', 'angerme-new', 'juicejuice-official', 'tsubaki-factory',
|
blog_list = ['angerme-ss-shin', 'angerme-amerika', 'angerme-new', 'juicejuice-official', 'tsubaki-factory',
|
||||||
'morningmusume-9ki', 'morningmusume-10ki', 'mm-12ki', 'morningm-13ki', 'morningmusume15ki',
|
'morningmusume-9ki', 'morningmusume-10ki', 'mm-12ki', 'morningm-13ki', 'morningmusume15ki',
|
||||||
|
|
@ -53,7 +53,7 @@ def theme_curator(theme: str, blog_id: str) -> str:
|
||||||
|
|
||||||
@cache
|
@cache
|
||||||
def datadir():
|
def datadir():
|
||||||
return join(getcwd(), 'data')
|
return join('/home/tomokazu/PycharmProjects/helloproject-ai/', 'data')
|
||||||
|
|
||||||
|
|
||||||
request_header = {
|
request_header = {
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,43 @@
|
||||||
|
from h5py import File
|
||||||
|
from io import BytesIO
|
||||||
|
from os import getcwd
|
||||||
|
from os.path import join
|
||||||
|
from pandas import DataFrame, to_datetime, concat
|
||||||
|
from sqlite3 import connect
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
filename = join('/mnt/shm/blog_text.hdf5')
|
||||||
|
|
||||||
|
with open(file=filename, mode='rb') as f:
|
||||||
|
file_bio = BytesIO(initial_bytes=f.read())
|
||||||
|
|
||||||
|
article_tables = []
|
||||||
|
comment_tables = []
|
||||||
|
with (File(name=file_bio, mode='r+') as hdf5):
|
||||||
|
for blog_group in hdf5.keys():
|
||||||
|
print(blog_group)
|
||||||
|
article_table = DataFrame(columns=['group', 'theme', 'title', 'date', 'article'])
|
||||||
|
for blog_entry in tqdm(hdf5[blog_group].keys()):
|
||||||
|
# print(blog_entry)
|
||||||
|
# print(hdf5[blog_group][blog_entry].keys())
|
||||||
|
blog_article = hdf5[blog_group][blog_entry]['article'][()].decode('utf-8')
|
||||||
|
(_, entry_theme), (_, entry_title), (_, entry_date) = \
|
||||||
|
list(hdf5[blog_group][blog_entry]['article'].attrs.items())
|
||||||
|
# print(entry_theme, entry_title, entry_date)
|
||||||
|
article_table.loc[blog_entry] = [blog_group, entry_theme, entry_title, entry_date, blog_article]
|
||||||
|
comment_table = DataFrame(columns=['blog_id', 'user_id', 'nickname', 'title', 'date', 'article'])
|
||||||
|
for comment_entry, comment_text in hdf5[blog_group][blog_entry]['comments_dataset'].items():
|
||||||
|
comment_article = comment_text[()].decode('utf-8')
|
||||||
|
comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date = \
|
||||||
|
list(comment_text.attrs.values())
|
||||||
|
comment_table.loc[comment_entry] = \
|
||||||
|
[comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date, comment_article]
|
||||||
|
comment_tables.append(comment_table.copy(deep=True))
|
||||||
|
article_tables.append(article_table.copy(deep=True))
|
||||||
|
|
||||||
|
# break
|
||||||
|
|
||||||
|
# print(table)
|
||||||
|
with connect('blog_post.sqlite') as connector:
|
||||||
|
concat(objs=article_tables).to_sql(name='blog', con=connector)
|
||||||
|
concat(comment_tables).to_sql(name='comments', con=connector)
|
||||||
|
|
@ -0,0 +1,71 @@
|
||||||
|
from typing import Any
|
||||||
|
from h5py import File
|
||||||
|
from io import BytesIO
|
||||||
|
from os import cpu_count
|
||||||
|
from os.path import join, pardir, dirname
|
||||||
|
from pandas import DataFrame
|
||||||
|
from sqlite3 import connect
|
||||||
|
from concurrent.futures import ProcessPoolExecutor as PPE
|
||||||
|
from multiprocessing import Manager
|
||||||
|
from sys import path, stdout, stderr
|
||||||
|
from shutil import move
|
||||||
|
from time import time
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
path.append(pardir)
|
||||||
|
from settings import datadir
|
||||||
|
|
||||||
|
filename = join('/mnt/shm/blog_text.hdf5')
|
||||||
|
temporary_dir = '/mnt/shm/'
|
||||||
|
with open(file=filename, mode='rb') as f:
|
||||||
|
file_bio = BytesIO(initial_bytes=f.read())
|
||||||
|
|
||||||
|
|
||||||
|
def extract(key: str, file: str) -> tuple[list[list[Any]], list[list[Any]]]:
|
||||||
|
with File(name=file, mode='r') as hdf:
|
||||||
|
print(f'start: {key}')
|
||||||
|
start = time()
|
||||||
|
article_table = []
|
||||||
|
comment_table = []
|
||||||
|
for blog_entry in hdf[key].keys():
|
||||||
|
blog_article = hdf[key][blog_entry]['article'][()].decode('utf-8')
|
||||||
|
entry_theme, entry_title, entry_date = list(hdf[key][blog_entry]['article'].attrs.values())
|
||||||
|
article_table.append([blog_entry, key, entry_theme, entry_title, entry_date, blog_article])
|
||||||
|
for comment_entry, comment_text in hdf[key][blog_entry]['comments_dataset'].items():
|
||||||
|
comment_article = comment_text[()].decode('utf-8')
|
||||||
|
comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date = \
|
||||||
|
list(comment_text.attrs.values())
|
||||||
|
comment_table.append(
|
||||||
|
[comment_entry, comment_blog_id, comment_user_id, comment_nickname, comment_title, comment_date,
|
||||||
|
comment_article])
|
||||||
|
# break
|
||||||
|
print(f'end: {key} at {int(time() - start)}s')
|
||||||
|
return article_table, comment_table
|
||||||
|
|
||||||
|
|
||||||
|
results = []
|
||||||
|
with PPE(max_workers=cpu_count()) as executor, File(name=file_bio, mode='r') as hdf5:
|
||||||
|
for order, blog_group in enumerate(hdf5.keys(), start=0):
|
||||||
|
# print(blog_group)
|
||||||
|
lock = Manager().Lock()
|
||||||
|
results.append(executor.submit(extract, blog_group, filename))
|
||||||
|
|
||||||
|
article_tables = []
|
||||||
|
comment_tables = []
|
||||||
|
for job in results:
|
||||||
|
a, b = job.result()
|
||||||
|
article_tables.extend(a)
|
||||||
|
comment_tables.extend(b)
|
||||||
|
|
||||||
|
with connect(database=join(temporary_dir, 'tmp.sqlite'), timeout=3600) as connector:
|
||||||
|
article_dataframe = DataFrame(data=article_tables, columns=['index', 'group', 'theme', 'title', 'date', 'article'])
|
||||||
|
# article_dataframe.set_index('index')
|
||||||
|
article_dataframe = article_dataframe.astype({"index":int})
|
||||||
|
article_dataframe.to_sql(name='blog', con=connector, if_exists='replace',index=False)
|
||||||
|
comment_dataframe = DataFrame(data=comment_tables,
|
||||||
|
columns=['index', 'blog_id', 'user_id', 'nickname', 'title', 'date', 'article'])
|
||||||
|
comment_dataframe.set_index('index')
|
||||||
|
comment_dataframe.to_sql(name='comment', con=connector, if_exists='replace')
|
||||||
|
|
||||||
|
move(join(temporary_dir, 'tmp.sqlite'), join(datadir(), 'blog_post.sqlite'))
|
||||||
|
|
@ -0,0 +1,52 @@
|
||||||
|
from sqlite3 import connect
|
||||||
|
from shutil import copyfile
|
||||||
|
from settings import datadir
|
||||||
|
from os.path import join
|
||||||
|
from os import getcwd
|
||||||
|
from numpy import array, tile, int_, rot90, where, ndarray, vectorize, str_
|
||||||
|
|
||||||
|
# copyfile(src=join(datadir(), 'blog_post.sqlite'), dst=join('/mnt/shm/blog_post.sqlite'))
|
||||||
|
hash_func = vectorize(hash, otypes=[str])
|
||||||
|
with connect(database='/mnt/shm/blog_post.sqlite') as connector:
|
||||||
|
cursor = connector.cursor()
|
||||||
|
print(cursor.execute("SELECT t.sql FROM sqlite_master t WHERE name = 'blog'").fetchone())
|
||||||
|
if 'article_cleaned' not in cursor.execute("SELECT t.sql FROM sqlite_master t WHERE name = 'blog'").fetchone()[0]:
|
||||||
|
cursor.execute("ALTER TABLE blog add article_cleaned TEXT")
|
||||||
|
print(cursor.execute("SELECT t.sql FROM sqlite_master t WHERE name = 'blog'").fetchone())
|
||||||
|
|
||||||
|
for (theme,) in cursor.execute(f'SELECT DISTINCT theme FROM blog').fetchall():
|
||||||
|
print(theme)
|
||||||
|
blog_contents = cursor.execute(
|
||||||
|
f'SELECT title,date,article,theme,"index" FROM blog WHERE theme = \'{theme}\' ORDER BY date').fetchall()
|
||||||
|
# cleaned_list = [None] * (blog_contents.__len__() - 1)
|
||||||
|
# cleaned_list = []
|
||||||
|
for i in range(blog_contents.__len__() - 1):
|
||||||
|
if blog_contents[i][3] != '八木栞':
|
||||||
|
pass
|
||||||
|
a = blog_contents[i]
|
||||||
|
b = blog_contents[i + 1]
|
||||||
|
# print(a)
|
||||||
|
# print(b)
|
||||||
|
a_list = array(a[2].split('\n'), dtype=object)
|
||||||
|
b_list = array(b[2].split('\n'), dtype=object)
|
||||||
|
a_hash_list = hash_func(a_list)
|
||||||
|
b_hash_list = hash_func(b_list)
|
||||||
|
# print(a_hash_list.shape)
|
||||||
|
# print(b_hash_list.shape)
|
||||||
|
a_ndarray: ndarray = tile(array(object=a_hash_list), reps=(*b_hash_list.shape, 1))
|
||||||
|
b_ndarray: ndarray = rot90(tile(array(object=b_hash_list), reps=(*a_hash_list.shape, 1)))
|
||||||
|
# print(a_ndarray.shape)
|
||||||
|
# print(b_ndarray.shape)
|
||||||
|
dup = a_ndarray == b_ndarray
|
||||||
|
# print(dup)
|
||||||
|
# print(list(zip(*where(dup))))
|
||||||
|
cleaned_text = '\n'.join(a_list[(~dup.any(axis=0))]).replace('\'', '\'\'')
|
||||||
|
cursor.execute(
|
||||||
|
f"UPDATE blog SET article_cleaned = \'{cleaned_text}\' WHERE \"index\" = {blog_contents[i][4]}")
|
||||||
|
# cleaned_list[i] = cleaned_text
|
||||||
|
# cleaned_list.append(cleaned_text)
|
||||||
|
print('\t' + blog_contents[i][0])
|
||||||
|
# input()
|
||||||
|
|
||||||
|
connector.cursor().close()
|
||||||
|
connector.commit()
|
||||||
Loading…
Reference in New Issue