update
continuous-integration/drone/push Build encountered an error Details

This commit is contained in:
yayoimizuha 2023-05-17 00:40:46 +09:00
parent 730e910a36
commit 237557e9c7
5 changed files with 279 additions and 38 deletions

View File

@ -1,17 +1,29 @@
from pprint import pprint
from typing import List, Tuple
import h5py
import settings import settings
import re import re
import sys import sys
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from aiohttp import ClientSession, ClientConnectorError, ClientTimeout from aiohttp import ClientSession, ClientConnectorError, ClientTimeout
from itertools import chain from itertools import chain
from asyncio import run, Semaphore, sleep from asyncio import run, Semaphore, sleep, Lock
from datetime import datetime from datetime import datetime
from aiofiles import open from aiofiles import open as a_open
import time
from os import path, utime, stat, cpu_count, makedirs from os import path, utime, stat, cpu_count, makedirs
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
from concurrent.futures import as_completed, ProcessPoolExecutor, Future from concurrent.futures import as_completed, ProcessPoolExecutor, Future, ThreadPoolExecutor
from ujson import loads from ujson import loads
from warnings import filterwarnings from warnings import filterwarnings
from h5py import File, special_dtype, string_dtype
from io import BytesIO
from numpy import void, array
import ujson
import orjson
import requests
PARALLEL_LIMIT = 300 PARALLEL_LIMIT = 300
@ -25,7 +37,6 @@ async def run_each(name: str) -> None:
sem: Semaphore = Semaphore(PARALLEL_LIMIT) sem: Semaphore = Semaphore(PARALLEL_LIMIT)
session: ClientSession = ClientSession(trust_env=True, headers=settings.request_header, session: ClientSession = ClientSession(trust_env=True, headers=settings.request_header,
timeout=ClientTimeout(total=10 * 60)) timeout=ClientTimeout(total=10 * 60))
list_pages_count = await parse_list_pages_count(name) list_pages_count = await parse_list_pages_count(name)
print(name, list_pages_count) print(name, list_pages_count)
@ -38,14 +49,13 @@ async def run_each(name: str) -> None:
for url in url_list: for url in url_list:
if 'html' not in url: if 'html' not in url:
print(url) print(url)
executor = ProcessPoolExecutor(max_workers=cpu_count()) executor = ProcessPoolExecutor(max_workers=cpu_count())
futures = await tqdm.gather(*[parse_blog_post(url, sem, session, executor) for url in url_list], desc='scan blog') lock = Lock()
images_list = list() futures = await tqdm.gather(
for future in tqdm(as_completed(futures), desc='waiting processing ' + name, total=len(futures)): *[parse_blog_post(url, sem, session, executor, lock) for url in url_list],
images_list.append(future.result()) desc='scan blog')
executor.shutdown() executor.shutdown()
image_link_package = list(chain.from_iterable(images_list)) image_link_package = list(chain.from_iterable(futures))
await tqdm.gather( await tqdm.gather(
*[download_image(filename, url, date, sem, session) for filename, url, date in image_link_package], *[download_image(filename, url, date, sem, session) for filename, url, date in image_link_package],
@ -58,7 +68,7 @@ async def parse_list_pages_count(blog_name: str) -> int:
async with ClientSession(trust_env=True, headers=settings.request_header) as session: async with ClientSession(trust_env=True, headers=settings.request_header) as session:
async with session.get(f'https://ameblo.jp/{blog_name}/entrylist.html') as resp: async with session.get(f'https://ameblo.jp/{blog_name}/entrylist.html') as resp:
resp_html = await resp.text() resp_html = await resp.text()
json_obj = loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', resp_html)[0] + '}') json_obj = ujson.loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', resp_html)[0] + '}')
return list(json_obj['entryState']['blogPageMap'].values())[0]['paging']['max_page'] return list(json_obj['entryState']['blogPageMap'].values())[0]['paging']['max_page']
@ -67,11 +77,16 @@ async def parse_list_page(blog_name: str, order: int, sem: Semaphore, session: C
async with session.get(f'https://ameblo.jp/{blog_name}/entrylist-{order}.html') as resp: async with session.get(f'https://ameblo.jp/{blog_name}/entrylist-{order}.html') as resp:
resp_html = await resp.text() resp_html = await resp.text()
try: try:
json_obj = loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', resp_html)[0] + '}') json_obj = ujson.loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', resp_html)[0] + '}')
page_url_list: list[str] = list() page_url_list: list[str] = list()
for blog_post_desc in list(json_obj['entryState']['entryMap'].values()): for blog_post_desc in list(json_obj['entryState']['entryMap'].values()):
if blog_post_desc['publish_flg'] == 'open': if blog_post_desc['publish_flg'] == 'open':
page_url_list.append(f"https://ameblo.jp/{blog_name}/entry-{blog_post_desc['entry_id']}.html") page_url_list.append(f"https://ameblo.jp/{blog_name}/entry-{blog_post_desc['entry_id']}.html" +
"," +
";".join(["https://ameblo.jp/_api/blogComments", f"amebaId={blog_name}",
f"blogId={blog_post_desc['blog_id']}",
f"entryId={blog_post_desc['entry_id']}",
"excludeReplies=false", "limit=1", "offset=0"]))
except Exception as e: except Exception as e:
print(e) print(e)
print(f'https://ameblo.jp/{blog_name}/entrylist-{order}.html') print(f'https://ameblo.jp/{blog_name}/entrylist-{order}.html')
@ -79,10 +94,10 @@ async def parse_list_page(blog_name: str, order: int, sem: Semaphore, session: C
return page_url_list return page_url_list
def parse_image(html: str, url: str) -> list: def parse_image(html: str, url: str) -> list[tuple[str, str, datetime]]:
blog_account = url.split('/')[-2] blog_account = url.split('/')[-2]
try: try:
json_obj = list(loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', html)[0] + '}')['entryState'][ json_obj = list(ujson.loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', html)[0] + '}')['entryState'][
'entryMap'].values())[0] 'entryMap'].values())[0]
except IndexError as e: except IndexError as e:
print(e, url) print(e, url)
@ -106,29 +121,43 @@ def parse_image(html: str, url: str) -> list:
)) ))
entry_body.find('img', class_='PhotoSwipeImage').replaceWith( entry_body.find('img', class_='PhotoSwipeImage').replaceWith(
'--blog-image-' + str(div["data-image-order"]) + '--\n') '--blog-image-' + str(div["data-image-order"]) + '--\n')
if not path.isdir(path.join(settings.datadir(), 'blog_text', theme)):
makedirs(path.join(settings.datadir(), 'blog_text', theme), exist_ok=True)
for i in entry_body.find_all('br'):
i.replaceWith('\n')
async def save_text(save_path: str, content: str, last_modified_time: datetime):
async with open(save_path, mode='w') as f:
await f.write(content)
utime(path=save_path, times=(stat(path=save_path).st_atime, last_modified_time.timestamp()))
run(save_text(path.join(settings.datadir(), 'blog_text', theme, blog_account + '=' + str(blog_entry) + '.txt'),
entry_body.text, date))
# print(return_list)
return return_list return return_list
async def parse_blog_post(url: str, sem: Semaphore, session: ClientSession, executor: ProcessPoolExecutor) -> Future: def get_api_json(api_url: str) -> list:
# -> list[tuple[str, str, datetime]]:
# print(url)
while True: while True:
async with sem: try:
with requests.get(api_url) as resp:
resp_json = ujson.loads(resp.text)
comments_count = resp_json['paging']['total_count']
break
except Exception as e:
time.sleep(5.0)
print(api_url)
print(e, resp.text, resp.status_code, file=sys.stderr)
while True:
if comments_count == 0:
comments = []
break
else:
try: try:
async with session.get(url) as resp: with requests.get(api_url.replace('limit=1', f'limit={comments_count}')) as resp:
comments = list(ujson.loads(resp.text)['commentMap'].values())
break
except Exception as e:
time.sleep(5.0)
print(e, file=sys.stderr)
# print(comments.__len__())
return comments
async def parse_blog_post(urls: str, sem: Semaphore, session: ClientSession, executor: ProcessPoolExecutor,
lock: Lock) -> Future:
page_url, comment_api_url = urls.split(',')
async with sem:
while True:
try:
async with session.get(page_url) as resp:
resp_html = await resp.text() resp_html = await resp.text()
# await sleep(1.0) # await sleep(1.0)
break break
@ -136,7 +165,13 @@ async def parse_blog_post(url: str, sem: Semaphore, session: ClientSession, exec
await sleep(5.0) await sleep(5.0)
print(e, file=sys.stderr) print(e, file=sys.stderr)
return executor.submit(parse_image, resp_html, url) o = executor.submit(parse_image, resp_html, page_url)
async with lock:
async with a_open(file=path.join(settings.datadir(), 'api_urls.txt'), mode='a') as f:
await f.write(urls + '\n')
image_list = o.result()
return image_list
async def download_image(filename: str, url: str, date: datetime, sem: Semaphore, session: ClientSession) -> None: async def download_image(filename: str, url: str, date: datetime, sem: Semaphore, session: ClientSession) -> None:
@ -152,7 +187,7 @@ async def download_image(filename: str, url: str, date: datetime, sem: Semaphore
async with session.get(url) as resp: async with session.get(url) as resp:
if resp.content_type != "image/jpeg": if resp.content_type != "image/jpeg":
return return
async with open(file=filepath, mode="wb") as f: async with a_open(file=filepath, mode="wb") as f:
await f.write(await resp.read()) await f.write(await resp.read())
utime(path=filepath, times=(stat(path=filepath).st_atime, date.timestamp())) utime(path=filepath, times=(stat(path=filepath).st_atime, date.timestamp()))
@ -170,5 +205,7 @@ def grep_modified_time(html: str) -> str:
if __name__ == '__main__': if __name__ == '__main__':
with open(file=path.join(settings.datadir(),'api_urls.txt'),mode='w') as f:
f.write("")
for blog in settings.blog_list: for blog in settings.blog_list:
run(run_each(blog)) run(run_each(blog))

148
get_article_and_comments.py Normal file
View File

@ -0,0 +1,148 @@
import sys
import re
import time
from io import BytesIO
from h5py import File, string_dtype
import requests
from numpy import array, ceil
from tqdm import tqdm
from settings import datadir, theme_curator
from concurrent.futures import ProcessPoolExecutor
from os import cpu_count
from os.path import join
from bs4 import BeautifulSoup
import ujson
from more_itertools import chunked
from datetime import datetime, timezone, timedelta
JST = timezone(timedelta(hours=9), "JST")
def parse_article(url: str) -> tuple[str, str, str, str, str]:
    """Download one blog entry and extract its plain text and metadata.

    The entry is read from the ``window.INIT_DATA`` JSON object embedded in
    the page. When that object (or its first entry) cannot be found, the page
    is requested again after a short pause, without an upper bound on the
    number of attempts.

    Args:
        url: Entry page URL. Its second-to-last ``/``-separated segment is
            used as the blog account name.

    Returns:
        ``(body_text, title, theme, last_edit_datetime, data_path)`` where
        ``data_path`` is ``"<blog_account>/<entry_id>"``.
    """
    while True:
        with requests.get(url) as resp:
            html = resp.text
        try:
            init_data = ujson.loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', html)[0] + '}')
            json_obj = list(init_data['entryState']['entryMap'].values())[0]
            break
        except IndexError as e:
            print(e, url)
            # Back off before retrying so a page without INIT_DATA is not
            # re-requested in a tight loop.
            time.sleep(5.0)

    blog_account = url.split('/')[-2]
    theme = theme_curator(json_obj['theme_name'], blog_account)
    date = json_obj['last_edit_datetime']
    blog_entry = json_obj['entry_id']
    # An entry without a title is stored with an empty one.
    entry_title = json_obj.get('entry_title', '')

    entry_body = BeautifulSoup(json_obj['entry_text'].replace('<br>', '\n'), 'lxml')
    for emoji in entry_body.find_all('img', class_='emoji'):
        emoji.decompose()

    for div in entry_body.find_all('img', class_='PhotoSwipeImage'):
        if not div.has_attr('data-src'):
            # Replace the image being inspected, not whichever
            # PhotoSwipeImage happens to come first in the document.
            div.replace_with('--blog-image-' + str(div["data-image-order"]) + '--\n')

    for line_break in entry_body.find_all('br'):
        line_break.replace_with('\n')

    data_path = '/'.join([blog_account, str(blog_entry)])
    return entry_body.text, entry_title, theme, date, data_path
def get_api_json(api_url: str) -> list:
    """Fetch all comments of one entry from the comment API.

    A first request to ``api_url`` reads ``paging.total_count``. When that
    count is non-zero, a second request with ``limit=1`` replaced by
    ``limit=<total_count>`` fetches every comment at once. Each request is
    retried every 5 seconds until it succeeds.

    Args:
        api_url: Comment API URL containing ``limit=1``.

    Returns:
        The values of the response's ``commentMap``, or an empty list when
        the entry has no comments.
    """
    while True:
        # Reset per attempt: when requests.get() itself raises there is no
        # response to report, and touching an unbound ``resp`` in the handler
        # would abort the retry loop.
        resp = None
        try:
            with requests.get(api_url) as resp:
                comments_count = ujson.loads(resp.text)['paging']['total_count']
            break
        except Exception as e:
            print(api_url)
            if resp is None:
                print(e, file=sys.stderr)
            else:
                print(e, resp.text, resp.status_code, file=sys.stderr)
            time.sleep(5.0)

    if comments_count == 0:
        return []

    full_url = api_url.replace('limit=1', f'limit={comments_count}')
    while True:
        try:
            with requests.get(full_url) as resp:
                return list(ujson.loads(resp.text)['commentMap'].values())
        except Exception as e:
            print(e, file=sys.stderr)
            time.sleep(5.0)
def _store_post(hdf5: File, article_result: tuple[str, str, str, str, str], comments: list) -> None:
    """Write one article and its comments into ``hdf5``.

    The article is stored as dataset ``<data_path>/article`` with ``theme``,
    ``title`` and ``update_time`` attributes; every comment becomes a dataset
    named after its ``comment_id`` under ``<data_path>/comments_dataset``.
    """
    entry_text, entry_title, theme, date, data_path = article_result
    text_dtype = string_dtype(encoding='utf-8')

    post = hdf5.create_group(name=data_path)
    article = post.create_dataset('article', dtype=text_dtype, data=array(entry_text.encode('utf-8')))
    article.attrs['theme'] = theme
    article.attrs['title'] = entry_title
    article.attrs['update_time'] = date

    comments_dataset = post.create_group(name='comments_dataset')
    for text in comments:
        comment = comments_dataset.create_dataset(
            name=str(text['comment_id']),
            dtype=text_dtype,
            data=array(text['comment_text'].replace('<br />', '\n').encode('utf-8')))
        if 'comment_author' in text:
            comment.attrs['author_id'] = text['comment_author']['ameba_id']
            comment.attrs['author_blog_id'] = text['comment_author']['blog_id']
            comment.attrs['author_nickname'] = text['comment_author']['nickname']
        else:
            # No author record: keep only the name given with the comment.
            comment.attrs['author_id'] = ''
            comment.attrs['author_blog_id'] = -1
            comment.attrs['author_nickname'] = text['comment_name']
        comment.attrs['comment_title'] = text['comment_title']
        comment.attrs['comment_update_time'] = text['upd_datetime']


if __name__ == '__main__':
    chunk_size = 10
    hdf5_path = join(datadir(), 'blog_text.hdf5')
    url_list_path = join(datadir(), 'api_urls.txt')

    # h5py operates on an in-memory copy of the archive; the file on disk is
    # only ever overwritten with a complete snapshot of that buffer.
    hdf5_bio = BytesIO()
    with open(file=hdf5_path, mode='rb') as hdf5_file:
        hdf5_bio.write(hdf5_file.read())

    # Count the rows up front (closing the handle afterwards) so the progress
    # bar knows how many chunks to expect.
    with open(file=url_list_path, mode='r') as f:
        num_lines = sum(1 for _ in f)

    save_cycle = 0  # articles fetched since the last snapshot was written
    with ProcessPoolExecutor(max_workers=(cpu_count() or 1) * 2) as article_executor, \
            ProcessPoolExecutor(max_workers=chunk_size) as api_executor:
        with File(name=hdf5_bio, mode='a') as hdf5, open(file=url_list_path, mode='r') as f:
            for rows in tqdm(chunked(f, n=chunk_size), total=ceil(num_lines / chunk_size)):
                article_output = []
                api_output = []
                for row in rows:
                    # Drop the line terminator so it does not end up inside
                    # the comment API URL, and tolerate blank lines.
                    row = row.strip()
                    if not row:
                        continue
                    article_url, comment_api_url = row.split(',')
                    api_params = comment_api_url.split(';')
                    blog_key = api_params[1].split('=')[1]
                    article_key = api_params[3].split('=')[1]
                    if f"/{blog_key}/{article_key}" in hdf5:
                        upd_time = datetime.fromisoformat(
                            hdf5[blog_key][article_key]['article'].attrs['update_time'])
                        # Stored entries last edited more than 4 days ago are
                        # kept as they are; more recent ones are dropped and
                        # fetched again.
                        if (datetime.now(tz=JST) - upd_time).days > 4:
                            continue
                        del hdf5[blog_key][article_key]
                    save_cycle += 1
                    article_output.append(article_executor.submit(parse_article, article_url))
                    api_output.append(api_executor.submit(get_api_json, comment_api_url))

                for article_res, api_res in zip(article_output, api_output):
                    _store_post(hdf5, article_res.result(), api_res.result())
                hdf5.flush()

                # Persist a snapshot roughly every 1000 fetched articles.
                if save_cycle > 1_000:
                    with open(file=hdf5_path, mode='wb') as hdf5_file:
                        hdf5_file.write(hdf5_bio.getvalue())
                    save_cycle = 0

    # The HDF5 file is closed here, so the buffer holds the final archive.
    with open(file=hdf5_path, mode='wb') as hdf5_file:
        hdf5_file.write(hdf5_bio.getvalue())

51
hdf5_compresser.py Normal file
View File

@ -0,0 +1,51 @@
import sys
from os.path import join, basename, dirname
from pprint import pprint
from io import BytesIO
from h5py import File
from datetime import datetime, timezone, timedelta
from gzip import compress, decompress
from tqdm import tqdm
JST = timezone(timedelta(hours=9), "JST")
COMPRESS_METHOD = 'gzip'
COMPRESS_OPT = 9
# print(sys.argv)
def _copy_as_fixed_width(source, target_group, name):
    """Recreate dataset ``source`` in ``target_group`` as a fixed-width byte string.

    The copy is a one-element dataset whose ``S`` dtype is one byte wider than
    the stored value; all attributes of ``source`` are copied onto it.
    """
    raw = source[()]
    clone = target_group.create_dataset(name=name, dtype=f'S{len(raw) + 1}', shape=(1,))
    clone[0] = raw
    for attr_name, attr_value in source.attrs.items():
        clone.attrs[attr_name] = attr_value
    return clone


source_path = sys.argv[1]

# Both the input archive and the rewritten one live entirely in memory.
hdf5_bio = BytesIO()
hdf5_bio_compressed = BytesIO()
with open(file=source_path, mode='rb') as hdf5_file:
    hdf5_bio.write(hdf5_file.read())

with File(name=hdf5_bio, mode='r') as hdf5, File(name=hdf5_bio_compressed, mode='w') as hdf5_compressed:
    for group in hdf5.keys():
        print(group)
        source_blog = hdf5[group]
        hdf5_group = hdf5_compressed.create_group(name=group)
        for article_id in tqdm(source_blog.keys()):
            source_post = source_blog[article_id]
            article = hdf5_group.create_group(name=article_id)
            _copy_as_fixed_width(source_post['article'], article, 'article')

            comments = article.create_group(name='comments_dataset')
            source_comments = source_post['comments_dataset']
            for comment_key in source_comments:
                _copy_as_fixed_width(source_comments[comment_key], comments, comment_key)

# Write the result next to the input as "<name>_compressed.<ext>".
name, ext = basename(source_path).rsplit('.', maxsplit=1)
target_path = join(dirname(source_path), name + '_compressed' + '.' + ext)
with open(file=target_path, mode='wb') as f:
    f.write(hdf5_bio_compressed.getvalue())
# None (bytes) 12.4 MiB (12,966,690)
# only article compressed 12.4 MiB (12,973,914)
# all gzipped 33.2 MiB (34,768,669)
# all chunked 33.2 MiB (34,768,669)

View File

@ -130,9 +130,9 @@ optimizer = Adam(params=[
# {'params': model_gpu.maxpool.parameters(), 'lr': 1e-8}, # {'params': model_gpu.maxpool.parameters(), 'lr': 1e-8},
{'params': model_gpu.layer1.parameters(), 'lr': 1e-8}, {'params': model_gpu.layer1.parameters(), 'lr': 1e-8},
{'params': model_gpu.layer2.parameters(), 'lr': 1e-8}, {'params': model_gpu.layer2.parameters(), 'lr': 1e-8},
{'params': model_gpu.layer3.parameters(), 'lr': 1e-5}, {'params': model_gpu.layer3.parameters(), 'lr': 1e-6},
{'params': model_gpu.layer4.parameters(), 'lr': 1e-4}, {'params': model_gpu.layer4.parameters(), 'lr': 1e-5},
{'params': model_gpu.fc.parameters(), 'lr': 1e-4}, {'params': model_gpu.fc.parameters(), 'lr': 1e-5},
]) ])
scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.5) scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.5)

5
yolo_test.py Normal file
View File

@ -0,0 +1,5 @@
from super_gradients.training.models import get
# Smoke test: load the 'yolo_nas_l' model with the 'coco' pretrained weights,
# move it to CUDA, run it on one sample image and display the result.
SAMPLE_IMAGE = '橋迫鈴=angerme-new=12687767841-1.jpg'

yolo_nas = get(model_name='yolo_nas_l', pretrained_weights='coco')
yolo_nas = yolo_nas.cuda()

prediction = yolo_nas.predict(SAMPLE_IMAGE, conf=0.8)
prediction.show()