update
continuous-integration/drone/push Build encountered an error
Details
continuous-integration/drone/push Build encountered an error
Details
This commit is contained in:
parent
730e910a36
commit
237557e9c7
|
|
@ -1,17 +1,29 @@
|
||||||
|
from pprint import pprint
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
|
import h5py
|
||||||
|
|
||||||
import settings
|
import settings
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
|
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
|
||||||
from aiohttp import ClientSession, ClientConnectorError, ClientTimeout
|
from aiohttp import ClientSession, ClientConnectorError, ClientTimeout
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from asyncio import run, Semaphore, sleep
|
from asyncio import run, Semaphore, sleep, Lock
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from aiofiles import open
|
from aiofiles import open as a_open
|
||||||
|
import time
|
||||||
from os import path, utime, stat, cpu_count, makedirs
|
from os import path, utime, stat, cpu_count, makedirs
|
||||||
from tqdm.asyncio import tqdm
|
from tqdm.asyncio import tqdm
|
||||||
from concurrent.futures import as_completed, ProcessPoolExecutor, Future
|
from concurrent.futures import as_completed, ProcessPoolExecutor, Future, ThreadPoolExecutor
|
||||||
from ujson import loads
|
from ujson import loads
|
||||||
from warnings import filterwarnings
|
from warnings import filterwarnings
|
||||||
|
from h5py import File, special_dtype, string_dtype
|
||||||
|
from io import BytesIO
|
||||||
|
from numpy import void, array
|
||||||
|
import ujson
|
||||||
|
import orjson
|
||||||
|
import requests
|
||||||
|
|
||||||
PARALLEL_LIMIT = 300
|
PARALLEL_LIMIT = 300
|
||||||
|
|
||||||
|
|
@ -25,7 +37,6 @@ async def run_each(name: str) -> None:
|
||||||
sem: Semaphore = Semaphore(PARALLEL_LIMIT)
|
sem: Semaphore = Semaphore(PARALLEL_LIMIT)
|
||||||
session: ClientSession = ClientSession(trust_env=True, headers=settings.request_header,
|
session: ClientSession = ClientSession(trust_env=True, headers=settings.request_header,
|
||||||
timeout=ClientTimeout(total=10 * 60))
|
timeout=ClientTimeout(total=10 * 60))
|
||||||
|
|
||||||
list_pages_count = await parse_list_pages_count(name)
|
list_pages_count = await parse_list_pages_count(name)
|
||||||
|
|
||||||
print(name, list_pages_count)
|
print(name, list_pages_count)
|
||||||
|
|
@ -38,14 +49,13 @@ async def run_each(name: str) -> None:
|
||||||
for url in url_list:
|
for url in url_list:
|
||||||
if 'html' not in url:
|
if 'html' not in url:
|
||||||
print(url)
|
print(url)
|
||||||
|
|
||||||
executor = ProcessPoolExecutor(max_workers=cpu_count())
|
executor = ProcessPoolExecutor(max_workers=cpu_count())
|
||||||
futures = await tqdm.gather(*[parse_blog_post(url, sem, session, executor) for url in url_list], desc='scan blog')
|
lock = Lock()
|
||||||
images_list = list()
|
futures = await tqdm.gather(
|
||||||
for future in tqdm(as_completed(futures), desc='waiting processing ' + name, total=len(futures)):
|
*[parse_blog_post(url, sem, session, executor, lock) for url in url_list],
|
||||||
images_list.append(future.result())
|
desc='scan blog')
|
||||||
executor.shutdown()
|
executor.shutdown()
|
||||||
image_link_package = list(chain.from_iterable(images_list))
|
image_link_package = list(chain.from_iterable(futures))
|
||||||
|
|
||||||
await tqdm.gather(
|
await tqdm.gather(
|
||||||
*[download_image(filename, url, date, sem, session) for filename, url, date in image_link_package],
|
*[download_image(filename, url, date, sem, session) for filename, url, date in image_link_package],
|
||||||
|
|
@ -58,7 +68,7 @@ async def parse_list_pages_count(blog_name: str) -> int:
|
||||||
async with ClientSession(trust_env=True, headers=settings.request_header) as session:
|
async with ClientSession(trust_env=True, headers=settings.request_header) as session:
|
||||||
async with session.get(f'https://ameblo.jp/{blog_name}/entrylist.html') as resp:
|
async with session.get(f'https://ameblo.jp/{blog_name}/entrylist.html') as resp:
|
||||||
resp_html = await resp.text()
|
resp_html = await resp.text()
|
||||||
json_obj = loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', resp_html)[0] + '}')
|
json_obj = ujson.loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', resp_html)[0] + '}')
|
||||||
return list(json_obj['entryState']['blogPageMap'].values())[0]['paging']['max_page']
|
return list(json_obj['entryState']['blogPageMap'].values())[0]['paging']['max_page']
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -67,11 +77,16 @@ async def parse_list_page(blog_name: str, order: int, sem: Semaphore, session: C
|
||||||
async with session.get(f'https://ameblo.jp/{blog_name}/entrylist-{order}.html') as resp:
|
async with session.get(f'https://ameblo.jp/{blog_name}/entrylist-{order}.html') as resp:
|
||||||
resp_html = await resp.text()
|
resp_html = await resp.text()
|
||||||
try:
|
try:
|
||||||
json_obj = loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', resp_html)[0] + '}')
|
json_obj = ujson.loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', resp_html)[0] + '}')
|
||||||
page_url_list: list[str] = list()
|
page_url_list: list[str] = list()
|
||||||
for blog_post_desc in list(json_obj['entryState']['entryMap'].values()):
|
for blog_post_desc in list(json_obj['entryState']['entryMap'].values()):
|
||||||
if blog_post_desc['publish_flg'] == 'open':
|
if blog_post_desc['publish_flg'] == 'open':
|
||||||
page_url_list.append(f"https://ameblo.jp/{blog_name}/entry-{blog_post_desc['entry_id']}.html")
|
page_url_list.append(f"https://ameblo.jp/{blog_name}/entry-{blog_post_desc['entry_id']}.html" +
|
||||||
|
"," +
|
||||||
|
";".join(["https://ameblo.jp/_api/blogComments", f"amebaId={blog_name}",
|
||||||
|
f"blogId={blog_post_desc['blog_id']}",
|
||||||
|
f"entryId={blog_post_desc['entry_id']}",
|
||||||
|
"excludeReplies=false", "limit=1", "offset=0"]))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
print(f'https://ameblo.jp/{blog_name}/entrylist-{order}.html')
|
print(f'https://ameblo.jp/{blog_name}/entrylist-{order}.html')
|
||||||
|
|
@ -79,10 +94,10 @@ async def parse_list_page(blog_name: str, order: int, sem: Semaphore, session: C
|
||||||
return page_url_list
|
return page_url_list
|
||||||
|
|
||||||
|
|
||||||
def parse_image(html: str, url: str) -> list:
|
def parse_image(html: str, url: str) -> list[tuple[str, str, datetime]]:
|
||||||
blog_account = url.split('/')[-2]
|
blog_account = url.split('/')[-2]
|
||||||
try:
|
try:
|
||||||
json_obj = list(loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', html)[0] + '}')['entryState'][
|
json_obj = list(ujson.loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', html)[0] + '}')['entryState'][
|
||||||
'entryMap'].values())[0]
|
'entryMap'].values())[0]
|
||||||
except IndexError as e:
|
except IndexError as e:
|
||||||
print(e, url)
|
print(e, url)
|
||||||
|
|
@ -106,29 +121,43 @@ def parse_image(html: str, url: str) -> list:
|
||||||
))
|
))
|
||||||
entry_body.find('img', class_='PhotoSwipeImage').replaceWith(
|
entry_body.find('img', class_='PhotoSwipeImage').replaceWith(
|
||||||
'--blog-image-' + str(div["data-image-order"]) + '--\n')
|
'--blog-image-' + str(div["data-image-order"]) + '--\n')
|
||||||
if not path.isdir(path.join(settings.datadir(), 'blog_text', theme)):
|
|
||||||
makedirs(path.join(settings.datadir(), 'blog_text', theme), exist_ok=True)
|
|
||||||
for i in entry_body.find_all('br'):
|
|
||||||
i.replaceWith('\n')
|
|
||||||
|
|
||||||
async def save_text(save_path: str, content: str, last_modified_time: datetime):
|
|
||||||
async with open(save_path, mode='w') as f:
|
|
||||||
await f.write(content)
|
|
||||||
utime(path=save_path, times=(stat(path=save_path).st_atime, last_modified_time.timestamp()))
|
|
||||||
|
|
||||||
run(save_text(path.join(settings.datadir(), 'blog_text', theme, blog_account + '=' + str(blog_entry) + '.txt'),
|
|
||||||
entry_body.text, date))
|
|
||||||
# print(return_list)
|
|
||||||
return return_list
|
return return_list
|
||||||
|
|
||||||
|
|
||||||
async def parse_blog_post(url: str, sem: Semaphore, session: ClientSession, executor: ProcessPoolExecutor) -> Future:
|
def get_api_json(api_url: str) -> list:
|
||||||
# -> list[tuple[str, str, datetime]]:
|
|
||||||
# print(url)
|
|
||||||
while True:
|
while True:
|
||||||
async with sem:
|
try:
|
||||||
|
with requests.get(api_url) as resp:
|
||||||
|
resp_json = ujson.loads(resp.text)
|
||||||
|
comments_count = resp_json['paging']['total_count']
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
time.sleep(5.0)
|
||||||
|
print(api_url)
|
||||||
|
print(e, resp.text, resp.status_code, file=sys.stderr)
|
||||||
|
while True:
|
||||||
|
if comments_count == 0:
|
||||||
|
comments = []
|
||||||
|
break
|
||||||
|
else:
|
||||||
try:
|
try:
|
||||||
async with session.get(url) as resp:
|
with requests.get(api_url.replace('limit=1', f'limit={comments_count}')) as resp:
|
||||||
|
comments = list(ujson.loads(resp.text)['commentMap'].values())
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
time.sleep(5.0)
|
||||||
|
print(e, file=sys.stderr)
|
||||||
|
# print(comments.__len__())
|
||||||
|
return comments
|
||||||
|
|
||||||
|
|
||||||
|
async def parse_blog_post(urls: str, sem: Semaphore, session: ClientSession, executor: ProcessPoolExecutor,
|
||||||
|
lock: Lock) -> Future:
|
||||||
|
page_url, comment_api_url = urls.split(',')
|
||||||
|
async with sem:
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
async with session.get(page_url) as resp:
|
||||||
resp_html = await resp.text()
|
resp_html = await resp.text()
|
||||||
# await sleep(1.0)
|
# await sleep(1.0)
|
||||||
break
|
break
|
||||||
|
|
@ -136,7 +165,13 @@ async def parse_blog_post(url: str, sem: Semaphore, session: ClientSession, exec
|
||||||
await sleep(5.0)
|
await sleep(5.0)
|
||||||
print(e, file=sys.stderr)
|
print(e, file=sys.stderr)
|
||||||
|
|
||||||
return executor.submit(parse_image, resp_html, url)
|
o = executor.submit(parse_image, resp_html, page_url)
|
||||||
|
async with lock:
|
||||||
|
async with a_open(file=path.join(settings.datadir(), 'api_urls.txt'), mode='a') as f:
|
||||||
|
await f.write(urls + '\n')
|
||||||
|
|
||||||
|
image_list = o.result()
|
||||||
|
return image_list
|
||||||
|
|
||||||
|
|
||||||
async def download_image(filename: str, url: str, date: datetime, sem: Semaphore, session: ClientSession) -> None:
|
async def download_image(filename: str, url: str, date: datetime, sem: Semaphore, session: ClientSession) -> None:
|
||||||
|
|
@ -152,7 +187,7 @@ async def download_image(filename: str, url: str, date: datetime, sem: Semaphore
|
||||||
async with session.get(url) as resp:
|
async with session.get(url) as resp:
|
||||||
if resp.content_type != "image/jpeg":
|
if resp.content_type != "image/jpeg":
|
||||||
return
|
return
|
||||||
async with open(file=filepath, mode="wb") as f:
|
async with a_open(file=filepath, mode="wb") as f:
|
||||||
await f.write(await resp.read())
|
await f.write(await resp.read())
|
||||||
utime(path=filepath, times=(stat(path=filepath).st_atime, date.timestamp()))
|
utime(path=filepath, times=(stat(path=filepath).st_atime, date.timestamp()))
|
||||||
|
|
||||||
|
|
@ -170,5 +205,7 @@ def grep_modified_time(html: str) -> str:
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
with open(file=path.join(settings.datadir(),'api_urls.txt'),mode='w') as f:
|
||||||
|
f.write("")
|
||||||
for blog in settings.blog_list:
|
for blog in settings.blog_list:
|
||||||
run(run_each(blog))
|
run(run_each(blog))
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,148 @@
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from io import BytesIO
|
||||||
|
from h5py import File, string_dtype
|
||||||
|
import requests
|
||||||
|
from numpy import array, ceil
|
||||||
|
from tqdm import tqdm
|
||||||
|
from settings import datadir, theme_curator
|
||||||
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
|
from os import cpu_count
|
||||||
|
from os.path import join
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import ujson
|
||||||
|
from more_itertools import chunked
|
||||||
|
from datetime import datetime, timezone, timedelta
|
||||||
|
|
||||||
|
JST = timezone(timedelta(hours=9), "JST")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_article(url: str) -> tuple[str, str, str, str, str]:
    """Download one blog entry page and extract its text and metadata.

    The page embeds its state as ``window.INIT_DATA``; the first value of
    ``entryState.entryMap`` in that JSON is used as the entry description.
    The page is re-requested until that JSON can be located.

    Args:
        url: Entry page URL. The second-to-last ``/``-separated component is
            used as the blog account name.

    Returns:
        A tuple ``(text, title, theme, date, data_path)`` where ``text`` is
        the entry body rendered to plain text, ``title`` is ``entry_title``
        (``''`` when the key is absent), ``theme`` is the result of
        ``theme_curator``, ``date`` is ``last_edit_datetime`` and
        ``data_path`` is ``"<blog_account>/<entry_id>"``.
    """
    while True:
        with requests.get(url) as resp:
            html = resp.text
        try:
            init_data = ujson.loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', html)[0] + '}')
            json_obj = list(init_data['entryState']['entryMap'].values())[0]
            break
        except IndexError as e:
            print(e, url)
            # Back off before retrying instead of re-requesting in a tight loop.
            time.sleep(5.0)

    blog_account = url.split('/')[-2]
    theme = theme_curator(json_obj['theme_name'], blog_account)
    date = json_obj['last_edit_datetime']
    blog_entry = json_obj['entry_id']
    # Only a missing key should fall back to an empty title.
    entry_title = json_obj.get('entry_title', '')

    entry_body = BeautifulSoup(json_obj['entry_text'].replace('<br>', '\n'), 'lxml')

    # Emoji images carry no text content; drop them entirely.
    for emoji in entry_body.find_all('img', class_='emoji'):
        emoji.decompose()

    for div in entry_body.find_all('img', class_='PhotoSwipeImage'):
        if not div.has_attr('data-src'):
            # NOTE(review): this replaces the first remaining PhotoSwipeImage
            # in the document, which is not necessarily `div` when earlier
            # images were left in place - confirm whether `div` was intended.
            entry_body.find('img', class_='PhotoSwipeImage').replace_with(
                '--blog-image-' + str(div["data-image-order"]) + '--\n')

    for line_break in entry_body.find_all('br'):
        line_break.replace_with('\n')

    data_path = '/'.join([blog_account, str(blog_entry)])
    return entry_body.text, entry_title, theme, date, data_path
|
||||||
|
|
||||||
|
|
||||||
|
def get_api_json(api_url: str) -> list:
    """Fetch all comments of one blog entry from the comment API.

    ``api_url`` is requested once as given to read ``paging.total_count``.
    If the count is non-zero, the URL is requested again with ``limit=1``
    replaced by ``limit=<total_count>``. Each request is retried
    indefinitely, sleeping 5 seconds after every failure.

    Args:
        api_url: Comment API URL containing the literal text ``limit=1``.

    Returns:
        The values of ``commentMap`` from the second response, or an empty
        list when the reported total count is 0.
    """
    while True:
        # Reset per attempt so the handler never reads an unbound or stale
        # response when requests.get() itself fails.
        resp = None
        try:
            with requests.get(api_url) as resp:
                comments_count = ujson.loads(resp.text)['paging']['total_count']
            break
        except Exception as e:
            time.sleep(5.0)
            print(api_url)
            if resp is None:
                print(e, file=sys.stderr)
            else:
                print(e, resp.text, resp.status_code, file=sys.stderr)

    if comments_count == 0:
        return []

    full_url = api_url.replace('limit=1', f'limit={comments_count}')
    while True:
        try:
            with requests.get(full_url) as resp:
                return list(ujson.loads(resp.text)['commentMap'].values())
        except Exception as e:
            time.sleep(5.0)
            print(e, file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Number of api_urls.txt rows that are fetched concurrently per batch.
    chunk_size = 10
    hdf5_path = join(datadir(), 'blog_text.hdf5')
    api_urls_path = join(datadir(), 'api_urls.txt')

    article_executor = ProcessPoolExecutor(max_workers=cpu_count() * 2)
    api_executor = ProcessPoolExecutor(max_workers=chunk_size)
    try:
        # The archive is edited in memory and written back to disk as a whole.
        hdf5_bio = BytesIO()
        with open(file=hdf5_path, mode='rb') as hdf5_file:
            hdf5_bio.write(hdf5_file.read())

        # Counts entries fetched since the last write-back to disk.
        save_cycle = 0
        with open(file=api_urls_path, mode='r') as f:
            num_lines = sum(1 for _ in f)

        with File(name=hdf5_bio, mode='a') as hdf5:
            with open(file=api_urls_path, mode='r') as f:
                for rows in tqdm(chunked(f, n=chunk_size), total=int(ceil(num_lines / chunk_size))):
                    article_output = []
                    api_output = []
                    for row in rows:
                        # Drop the line terminator so it does not end up in the API URL.
                        row = row.strip()
                        if not row:
                            continue
                        article_url, comment_api_url = row.split(',')
                        api_fields = comment_api_url.split(';')
                        blog_key = api_fields[1].split('=')[1]
                        article_key = api_fields[3].split('=')[1]
                        if f"/{blog_key}/{article_key}" in hdf5:
                            upd_time = datetime.fromisoformat(
                                hdf5[blog_key][article_key]['article'].attrs['update_time'])
                            # Entries last edited more than 4 days ago are kept as stored.
                            if (datetime.now(tz=JST) - upd_time).days > 4:
                                continue
                            # Recently edited entries are removed and fetched again.
                            del hdf5[blog_key][article_key]
                        save_cycle += 1
                        article_output.append(article_executor.submit(parse_article, article_url))
                        api_output.append(api_executor.submit(get_api_json, comment_api_url))

                    for article_res, api_res in zip(article_output, api_output):
                        entry_text, entry_title, theme, date, data_path = article_res.result()
                        comments = api_res.result()

                        post = hdf5.create_group(name=data_path)
                        article = post.create_dataset('article', dtype=string_dtype(encoding='utf-8'),
                                                      data=array(entry_text.encode('utf-8')))
                        article.attrs['theme'] = theme
                        article.attrs['title'] = entry_title
                        article.attrs['update_time'] = date

                        comments_dataset = post.create_group(name='comments_dataset')
                        for text in comments:
                            comment = comments_dataset.create_dataset(
                                name=str(text['comment_id']),
                                dtype=string_dtype(encoding='utf-8'),
                                data=array(text['comment_text'].replace('<br />', '\n').encode('utf-8')))
                            if 'comment_author' in text:
                                author = text['comment_author']
                                comment.attrs['author_id'] = author['ameba_id']
                                comment.attrs['author_blog_id'] = author['blog_id']
                                comment.attrs['author_nickname'] = author['nickname']
                            else:
                                # No author object: only the free-form name is available.
                                comment.attrs['author_id'] = ''
                                comment.attrs['author_blog_id'] = -1
                                comment.attrs['author_nickname'] = text['comment_name']
                            comment.attrs['comment_title'] = text['comment_title']
                            comment.attrs['comment_update_time'] = text['upd_datetime']

                    hdf5.flush()
                    # Periodically persist the in-memory archive.
                    if save_cycle > 1_000:
                        with open(file=hdf5_path, mode='wb') as hdf5_file:
                            hdf5_file.write(hdf5_bio.getvalue())
                        save_cycle = 0

        # Final write-back after the HDF5 file object has been closed.
        with open(file=hdf5_path, mode='wb') as hdf5_file:
            hdf5_file.write(hdf5_bio.getvalue())
    finally:
        # Release the worker processes even when a batch fails.
        article_executor.shutdown()
        api_executor.shutdown()
|
||||||
|
|
@ -0,0 +1,51 @@
|
||||||
|
import sys
|
||||||
|
from os.path import join, basename, dirname
|
||||||
|
from pprint import pprint
|
||||||
|
from io import BytesIO
|
||||||
|
from h5py import File
|
||||||
|
from datetime import datetime, timezone, timedelta
|
||||||
|
from gzip import compress, decompress
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
JST = timezone(timedelta(hours=9), "JST")
COMPRESS_METHOD = 'gzip'
COMPRESS_OPT = 9


def _clone_fixed_width(src, dest_group, dataset_name):
    """Copy a scalar byte-string dataset into ``dest_group`` as a 1-element
    fixed-width ``S<len + 1>`` dataset, together with all of its attributes."""
    payload = src[()]
    clone = dest_group.create_dataset(name=dataset_name, dtype=f'S{len(payload) + 1}', shape=(1,))
    clone[0] = payload
    for attr_key, attr_value in src.attrs.items():
        clone.attrs[attr_key] = attr_value
    return clone


# The input archive (path given as the first CLI argument) is read fully into
# memory; the re-packed archive is likewise built in memory.
source_path = sys.argv[1]
hdf5_bio = BytesIO()
hdf5_bio_compressed = BytesIO()
with open(file=join(source_path), mode='rb') as hdf5_file:
    hdf5_bio.write(hdf5_file.read())

with File(name=hdf5_bio, mode='r') as hdf5, File(name=hdf5_bio_compressed, mode='w') as hdf5_compressed:
    for group in hdf5.keys():
        print(group)
        source_group = hdf5[group]
        hdf5_group = hdf5_compressed.create_group(name=group)
        for article_id in tqdm(source_group.keys()):
            source_article = source_group[article_id]
            article = hdf5_group.create_group(name=article_id)
            _clone_fixed_width(source_article['article'], article, 'article')

            comments = article.create_group(name='comments_dataset')
            source_comments = source_article['comments_dataset']
            for comment_key in source_comments:
                _clone_fixed_width(source_comments[comment_key], comments, comment_key)

# Write the result next to the input as "<name>_compressed.<ext>".
name, ext = basename(source_path).rsplit('.', maxsplit=1)
with open(file=join(dirname(source_path), name + '_compressed' + '.' + ext), mode='wb') as f:
    f.write(hdf5_bio_compressed.getvalue())
|
||||||
|
|
||||||
|
# None (bytes) 12.4 MiB (12,966,690)
|
||||||
|
# only article compressed 12.4 MiB (12,973,914)
|
||||||
|
# all gzipped 33.2 MiB (34,768,669)
|
||||||
|
# all chunked 33.2 MiB (34,768,669)
|
||||||
|
|
@ -130,9 +130,9 @@ optimizer = Adam(params=[
|
||||||
# {'params': model_gpu.maxpool.parameters(), 'lr': 1e-8},
|
# {'params': model_gpu.maxpool.parameters(), 'lr': 1e-8},
|
||||||
{'params': model_gpu.layer1.parameters(), 'lr': 1e-8},
|
{'params': model_gpu.layer1.parameters(), 'lr': 1e-8},
|
||||||
{'params': model_gpu.layer2.parameters(), 'lr': 1e-8},
|
{'params': model_gpu.layer2.parameters(), 'lr': 1e-8},
|
||||||
{'params': model_gpu.layer3.parameters(), 'lr': 1e-5},
|
{'params': model_gpu.layer3.parameters(), 'lr': 1e-6},
|
||||||
{'params': model_gpu.layer4.parameters(), 'lr': 1e-4},
|
{'params': model_gpu.layer4.parameters(), 'lr': 1e-5},
|
||||||
{'params': model_gpu.fc.parameters(), 'lr': 1e-4},
|
{'params': model_gpu.fc.parameters(), 'lr': 1e-5},
|
||||||
|
|
||||||
])
|
])
|
||||||
scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.5)
|
scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.5)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
from super_gradients.training.models import get
|
||||||
|
|
||||||
|
# Build the 'yolo_nas_l' model with the 'coco' pretrained weights and move it
# to the GPU via .cuda().
yolo_nas = get(model_name='yolo_nas_l', pretrained_weights='coco').cuda()

# Run prediction on a single sample image with conf=0.8 and display the result.
prediction = yolo_nas.predict('橋迫鈴=angerme-new=12687767841-1.jpg', conf=0.8)
prediction.show()
|
||||||
Loading…
Reference in New Issue