update

2023-05-17 00:40:46 +09:00 · 2023-05-17 00:40:46 +09:00 · 237557e9c7
parent 730e910a36
commit 237557e9c7
5 changed files with 279 additions and 38 deletions
--- a/ameblo_download.py
+++ b/ameblo_download.py
@ -1,17 +1,29 @@
 from pprint import pprint
 from typing import List, Tuple
 import h5py
 import settings
 import re
 import sys
 from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
 from aiohttp import ClientSession, ClientConnectorError, ClientTimeout
 from itertools import chain
-from asyncio import run, Semaphore, sleep
+from asyncio import run, Semaphore, sleep, Lock
 from datetime import datetime
-from aiofiles import open
+from aiofiles import open as a_open
 import time
 from os import path, utime, stat, cpu_count, makedirs
 from tqdm.asyncio import tqdm
-from concurrent.futures import as_completed, ProcessPoolExecutor, Future
+from concurrent.futures import as_completed, ProcessPoolExecutor, Future, ThreadPoolExecutor
 from ujson import loads
 from warnings import filterwarnings
 from h5py import File, special_dtype, string_dtype
 from io import BytesIO
 from numpy import void, array
 import ujson
 import orjson
 import requests
 PARALLEL_LIMIT = 300
@ -25,7 +37,6 @@ async def run_each(name: str) -> None:
    sem: Semaphore = Semaphore(PARALLEL_LIMIT)
    session: ClientSession = ClientSession(trust_env=True, headers=settings.request_header,
                                           timeout=ClientTimeout(total=10 * 60))
    list_pages_count = await parse_list_pages_count(name)
    print(name, list_pages_count)
@ -38,14 +49,13 @@ async def run_each(name: str) -> None:
    for url in url_list:
        if 'html' not in url:
            print(url)
    executor = ProcessPoolExecutor(max_workers=cpu_count())
-    futures = await tqdm.gather(*[parse_blog_post(url, sem, session, executor) for url in url_list], desc='scan blog')
+    lock = Lock()
-    images_list = list()
+    futures = await tqdm.gather(
-    for future in tqdm(as_completed(futures), desc='waiting processing ' + name, total=len(futures)):
+            *[parse_blog_post(url, sem, session, executor, lock) for url in url_list],
-        images_list.append(future.result())
+            desc='scan blog')
    executor.shutdown()
-    image_link_package = list(chain.from_iterable(images_list))
+    image_link_package = list(chain.from_iterable(futures))
    await tqdm.gather(
        *[download_image(filename, url, date, sem, session) for filename, url, date in image_link_package],
@ -58,7 +68,7 @@ async def parse_list_pages_count(blog_name: str) -> int:
    async with ClientSession(trust_env=True, headers=settings.request_header) as session:
        async with session.get(f'https://ameblo.jp/{blog_name}/entrylist.html') as resp:
            resp_html = await resp.text()
-            json_obj = loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', resp_html)[0] + '}')
+            json_obj = ujson.loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', resp_html)[0] + '}')
            return list(json_obj['entryState']['blogPageMap'].values())[0]['paging']['max_page']
@ -67,11 +77,16 @@ async def parse_list_page(blog_name: str, order: int, sem: Semaphore, session: C
        async with session.get(f'https://ameblo.jp/{blog_name}/entrylist-{order}.html') as resp:
            resp_html = await resp.text()
    try:
-        json_obj = loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', resp_html)[0] + '}')
+        json_obj = ujson.loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', resp_html)[0] + '}')
        page_url_list: list[str] = list()
        for blog_post_desc in list(json_obj['entryState']['entryMap'].values()):
            if blog_post_desc['publish_flg'] == 'open':
-                page_url_list.append(f"https://ameblo.jp/{blog_name}/entry-{blog_post_desc['entry_id']}.html")
+                page_url_list.append(f"https://ameblo.jp/{blog_name}/entry-{blog_post_desc['entry_id']}.html" +
                                     "," +
                                     ";".join(["https://ameblo.jp/_api/blogComments", f"amebaId={blog_name}",
                                               f"blogId={blog_post_desc['blog_id']}",
                                               f"entryId={blog_post_desc['entry_id']}",
                                               "excludeReplies=false", "limit=1", "offset=0"]))
    except Exception as e:
        print(e)
        print(f'https://ameblo.jp/{blog_name}/entrylist-{order}.html')
@ -79,10 +94,10 @@ async def parse_list_page(blog_name: str, order: int, sem: Semaphore, session: C
    return page_url_list
-def parse_image(html: str, url: str) -> list:
+def parse_image(html: str, url: str) -> list[tuple[str, str, datetime]]:
    blog_account = url.split('/')[-2]
    try:
-        json_obj = list(loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', html)[0] + '}')['entryState'][
+        json_obj = list(ujson.loads(re.findall(r'<script>window.INIT_DATA=(.*?)};', html)[0] + '}')['entryState'][
                            'entryMap'].values())[0]
    except IndexError as e:
        print(e, url)
@ -106,29 +121,43 @@ def parse_image(html: str, url: str) -> list:
            ))
            entry_body.find('img', class_='PhotoSwipeImage').replaceWith(
                '--blog-image-' + str(div["data-image-order"]) + '--\n')
    if not path.isdir(path.join(settings.datadir(), 'blog_text', theme)):
        makedirs(path.join(settings.datadir(), 'blog_text', theme), exist_ok=True)
    for i in entry_body.find_all('br'):
        i.replaceWith('\n')
    async def save_text(save_path: str, content: str, last_modified_time: datetime):
        async with open(save_path, mode='w') as f:
            await f.write(content)
        utime(path=save_path, times=(stat(path=save_path).st_atime, last_modified_time.timestamp()))
    run(save_text(path.join(settings.datadir(), 'blog_text', theme, blog_account + '=' + str(blog_entry) + '.txt'),
                  entry_body.text, date))
    # print(return_list)
    return return_list
-async def parse_blog_post(url: str, sem: Semaphore, session: ClientSession, executor: ProcessPoolExecutor) -> Future:
+def get_api_json(api_url: str) -> list:
    # -> list[tuple[str, str, datetime]]:
    # print(url)
    while True:
-        async with sem:
+        try:
            with requests.get(api_url) as resp:
                resp_json = ujson.loads(resp.text)
                comments_count = resp_json['paging']['total_count']
                break
        except Exception as e:
            time.sleep(5.0)
            print(api_url)
            print(e, resp.text, resp.status_code, file=sys.stderr)
    while True:
        if comments_count == 0:
            comments = []
            break
        else:
            try:
-                async with session.get(url) as resp:
+                with requests.get(api_url.replace('limit=1', f'limit={comments_count}')) as resp:
                    comments = list(ujson.loads(resp.text)['commentMap'].values())
                    break
            except Exception as e:
                time.sleep(5.0)
                print(e, file=sys.stderr)
    # print(comments.__len__())
    return comments
 async def parse_blog_post(urls: str, sem: Semaphore, session: ClientSession, executor: ProcessPoolExecutor,
                          lock: Lock) -> Future:
    page_url, comment_api_url = urls.split(',')
    async with sem:
        while True:
            try:
                async with session.get(page_url) as resp:
                    resp_html = await resp.text()
                    # await sleep(1.0)
                    break
@ -136,7 +165,13 @@ async def parse_blog_post(url: str, sem: Semaphore, session: ClientSession, exec
                await sleep(5.0)
                print(e, file=sys.stderr)
-    return executor.submit(parse_image, resp_html, url)
+    o = executor.submit(parse_image, resp_html, page_url)
    async with lock:
        async with a_open(file=path.join(settings.datadir(), 'api_urls.txt'), mode='a') as f:
            await f.write(urls + '\n')
    image_list = o.result()
    return image_list
 async def download_image(filename: str, url: str, date: datetime, sem: Semaphore, session: ClientSession) -> None:
@ -152,7 +187,7 @@ async def download_image(filename: str, url: str, date: datetime, sem: Semaphore
        async with session.get(url) as resp:
            if resp.content_type != "image/jpeg":
                return
-            async with open(file=filepath, mode="wb") as f:
+            async with a_open(file=filepath, mode="wb") as f:
                await f.write(await resp.read())
    utime(path=filepath, times=(stat(path=filepath).st_atime, date.timestamp()))
@ -170,5 +205,7 @@ def grep_modified_time(html: str) -> str:
 if __name__ == '__main__':
    # Empty api_urls.txt before starting; parse_blog_post appends one line per entry to it.
    api_urls_file = path.join(settings.datadir(), 'api_urls.txt')
    with open(file=api_urls_file, mode='w') as truncated:
        truncated.write("")
    # Blogs are processed one after another, each with its own run() call.
    for blog_name in settings.blog_list:
        run(run_each(blog_name))
--- a/get_article_and_comments.py
+++ b/get_article_and_comments.py
@ -0,0 +1,148 @@
 import sys
 import re
 import time
 from io import BytesIO
 from h5py import File, string_dtype
 import requests
 from numpy import array, ceil
 from tqdm import tqdm
 from settings import datadir, theme_curator
 from concurrent.futures import ProcessPoolExecutor
 from os import cpu_count
 from os.path import join
 from bs4 import BeautifulSoup
 import ujson
 from more_itertools import chunked
 from datetime import datetime, timezone, timedelta
 JST = timezone(timedelta(hours=9), "JST")
 def parse_article(url: str) -> tuple[str, str, str, str, str]:
    """Download one blog entry page and extract its plain text and metadata.

    The entry is read from the ``window.INIT_DATA`` JSON embedded in the page.
    Emoji images are removed, ``PhotoSwipeImage`` tags without a ``data-src``
    attribute become ``--blog-image-<order>--`` placeholder lines, and ``<br>``
    tags become newlines.

    The download is repeated (with a 5 second pause) for as long as the
    embedded JSON cannot be located; any other error, such as a network
    failure or malformed JSON, propagates to the caller.

    :param url: entry URL; its second-to-last ``/``-separated segment is used
        as the blog account name.
    :return: ``(body_text, title, theme, last_edit_datetime, data_path)`` where
        ``data_path`` is ``"<blog_account>/<entry_id>"`` and ``title`` is ``''``
        when the entry has no ``entry_title`` key.
    """
    init_data_pattern = r'<script>window.INIT_DATA=(.*?)};'
    while True:
        with requests.get(url) as resp:
            html = resp.text
        try:
            json_obj = list(ujson.loads(re.findall(init_data_pattern, html)[0] + '}')['entryState'][
                                'entryMap'].values())[0]
            break
        except IndexError as e:
            print(e, url)
            # Pause before asking again instead of re-requesting in a tight loop.
            time.sleep(5.0)
    blog_account = url.split('/')[-2]
    theme = theme_curator(json_obj['theme_name'], blog_account)
    date = json_obj['last_edit_datetime']
    blog_entry = json_obj['entry_id']
    entry_title = json_obj.get('entry_title', '')
    entry_body = BeautifulSoup(json_obj['entry_text'].replace('<br>', '\n'), 'lxml')
    for emoji in entry_body.find_all('img', class_='emoji'):
        emoji.decompose()
    for image in entry_body.find_all('img', class_='PhotoSwipeImage'):
        if not image.has_attr('data-src'):
            # Replace this very tag. Looking the tag up again with find() would hit the
            # first PhotoSwipeImage still in the tree, which may be a different image.
            image.replace_with('--blog-image-' + str(image["data-image-order"]) + '--\n')
    for line_break in entry_body.find_all('br'):
        line_break.replace_with('\n')
    data_path = '/'.join([blog_account, str(blog_entry)])
    return entry_body.text, entry_title, theme, date, data_path
 def get_api_json(api_url: str) -> list:
    """Fetch all comments of one blog entry from the comment API.

    The API is queried twice: first with ``api_url`` as given to read
    ``paging.total_count``, then with the literal ``limit=1`` in the URL
    replaced by ``limit=<total_count>`` so that every comment is returned in
    one response. Each request is retried every 5 seconds, without an upper
    bound, until it succeeds.

    :param api_url: comment API URL containing the literal ``limit=1``.
    :return: the values of the response's ``commentMap`` as a list, or an empty
        list when ``total_count`` is 0.
    """
    while True:
        # Reset per attempt so the handler can tell "no response at all" from "bad response".
        resp = None
        try:
            with requests.get(api_url) as resp:
                comments_count = ujson.loads(resp.text)['paging']['total_count']
            break
        except Exception as e:
            print(api_url)
            if resp is None:
                # requests.get itself failed, so there is no body or status code to report.
                print(e, file=sys.stderr)
            else:
                print(e, resp.text, resp.status_code, file=sys.stderr)
            time.sleep(5.0)
    if comments_count == 0:
        return []
    full_url = api_url.replace('limit=1', f'limit={comments_count}')
    while True:
        try:
            with requests.get(full_url) as resp:
                return list(ujson.loads(resp.text)['commentMap'].values())
        except Exception as e:
            time.sleep(5.0)
            print(e, file=sys.stderr)
 if __name__ == '__main__':
    # Rows of api_urls.txt handled per batch; also the size of the comment-API worker pool.
    chunk_size = 10
    hdf5_path = join(datadir(), 'blog_text.hdf5')
    api_urls_path = join(datadir(), 'api_urls.txt')
    # Edit an in-memory copy of the HDF5 file; it is written back to disk periodically and at the end.
    hdf5_bio = BytesIO()
    with open(file=hdf5_path, mode='rb') as hdf5_file:
        hdf5_bio.write(hdf5_file.read())
    save_cycle = 0
    # Count the rows for the progress bar and close the handle afterwards.
    with open(file=api_urls_path, mode='r') as f:
        num_lines = sum(1 for _ in f)
    # The context managers shut both pools down even when a batch raises.
    with ProcessPoolExecutor(max_workers=cpu_count() * 2) as article_executor, \
            ProcessPoolExecutor(max_workers=chunk_size) as api_executor:
        with File(name=hdf5_bio, mode='a') as hdf5, open(file=api_urls_path, mode='r') as f:
            for rows in tqdm(chunked(f, n=chunk_size), total=int(ceil(num_lines / chunk_size))):
                article_output = []
                api_output = []
                for row in rows:
                    # Drop the trailing newline so it does not end up inside the comment API URL.
                    row = row.strip()
                    if not row:
                        continue
                    article_url, comment_api_url = row.split(',')
                    api_fields = comment_api_url.split(';')
                    blog_key = api_fields[1].split('=')[1]
                    article_key = api_fields[3].split('=')[1]
                    if f"/{blog_key}/{article_key}" in hdf5:
                        upd_time = datetime.fromisoformat(hdf5[blog_key][article_key]['article'].attrs['update_time'])
                        # Stored entries whose update_time is more than 4 days old are kept as they are;
                        # newer ones are deleted here and fetched again below.
                        if (datetime.now(tz=JST) - upd_time).days > 4:
                            continue
                        del hdf5[blog_key][article_key]
                    save_cycle += 1
                    article_output.append(article_executor.submit(parse_article, article_url))
                    api_output.append(api_executor.submit(get_api_json, comment_api_url))
                for article_res, api_res in zip(article_output, api_output):
                    entry_text, entry_title, theme, date, data_path = article_res.result()
                    comments = api_res.result()
                    post = hdf5.create_group(name=data_path)
                    article = post.create_dataset('article', dtype=string_dtype(encoding='utf-8'),
                                                  data=array(entry_text.encode('utf-8')))
                    article.attrs['theme'] = theme
                    article.attrs['title'] = entry_title
                    article.attrs['update_time'] = date
                    comments_dataset = post.create_group(name='comments_dataset')
                    # One dataset per comment, named after its comment_id.
                    for text in comments:
                        comment = comments_dataset.create_dataset(
                            name=str(text['comment_id']),
                            dtype=string_dtype(encoding='utf-8'),
                            data=array(text['comment_text'].replace('<br />', '\n').encode('utf-8')))
                        if 'comment_author' in text:
                            author = text['comment_author']
                            comment.attrs['author_id'] = author['ameba_id']
                            comment.attrs['author_blog_id'] = author['blog_id']
                            comment.attrs['author_nickname'] = author['nickname']
                        else:
                            # No author record: store placeholder ids and the name given with the comment.
                            comment.attrs['author_id'] = ''
                            comment.attrs['author_blog_id'] = -1
                            comment.attrs['author_nickname'] = text['comment_name']
                        comment.attrs['comment_title'] = text['comment_title']
                        comment.attrs['comment_update_time'] = text['upd_datetime']
                hdf5.flush()
                # Checkpoint to disk once more than 1000 entries have been queued since the last save.
                if save_cycle > 1_000:
                    with open(file=hdf5_path, mode='wb') as hdf5_file:
                        hdf5_file.write(hdf5_bio.getvalue())
                    save_cycle = 0
        # Final save, after the HDF5 file object has been closed.
        with open(file=hdf5_path, mode='wb') as hdf5_file:
            hdf5_file.write(hdf5_bio.getvalue())
--- a/hdf5_compresser.py
+++ b/hdf5_compresser.py
@ -0,0 +1,51 @@
 import sys
 from os.path import join, basename, dirname
 from pprint import pprint
 from io import BytesIO
 from h5py import File
 from datetime import datetime, timezone, timedelta
 from gzip import compress, decompress
 from tqdm import tqdm
 JST = timezone(timedelta(hours=9), "JST")
 # NOTE(review): COMPRESS_METHOD and COMPRESS_OPT are defined but not referenced in the script below.
 COMPRESS_METHOD = 'gzip'
 COMPRESS_OPT = 9


 def _copy_as_fixed_bytes(source, target_group, key):
    """Recreate dataset *source* inside *target_group* as a one-element ``S<len + 1>`` dataset, attributes included."""
    payload = source[()]
    clone = target_group.create_dataset(name=key, dtype=f'S{len(payload) + 1}', shape=(1,))
    clone[0] = payload
    for attr_name, attr_value in source.attrs.items():
        clone.attrs[attr_name] = attr_value
    return clone


 # Load the input file (first command-line argument) fully into memory.
 hdf5_bio = BytesIO()
 hdf5_bio_compressed = BytesIO()
 with open(file=join(sys.argv[1]), mode='rb') as hdf5_file:
    hdf5_bio.write(hdf5_file.read())
 # Rebuild the <group>/<article_id>/{article, comments_dataset/<comment>} tree in a second in-memory file.
 with File(name=hdf5_bio, mode='r') as hdf5, File(name=hdf5_bio_compressed, mode='w') as hdf5_compressed:
    for group in hdf5.keys():
        print(group)
        source_group = hdf5[group]
        target_group = hdf5_compressed.create_group(name=group)
        for article_id in tqdm(source_group.keys()):
            source_post = source_group[article_id]
            target_post = target_group.create_group(name=article_id)
            _copy_as_fixed_bytes(source_post['article'], target_post, 'article')
            target_comments = target_post.create_group(name='comments_dataset')
            source_comments = source_post['comments_dataset']
            for comment_key in source_comments:
                _copy_as_fixed_bytes(source_comments[comment_key], target_comments, comment_key)
 # Write the result next to the input as "<name>_compressed.<ext>".
 name, ext = basename(sys.argv[1]).rsplit('.', maxsplit=1)
 output_path = join(dirname(sys.argv[1]), f'{name}_compressed.{ext}')
 with open(file=output_path, mode='wb') as f:
    f.write(hdf5_bio_compressed.getvalue())
 # None (bytes) 12.4 MiB (12,966,690)
 # only article compressed 12.4 MiB (12,973,914)
 # all gzipped 33.2 MiB (34,768,669)
 # all chunked 33.2 MiB (34,768,669)
--- a/resnet_finetune_vggface.py
+++ b/resnet_finetune_vggface.py
@ -130,9 +130,9 @@ optimizer = Adam(params=[
    # {'params': model_gpu.maxpool.parameters(), 'lr': 1e-8},
    {'params': model_gpu.layer1.parameters(), 'lr': 1e-8},
    {'params': model_gpu.layer2.parameters(), 'lr': 1e-8},
-    {'params': model_gpu.layer3.parameters(), 'lr': 1e-5},
+    {'params': model_gpu.layer3.parameters(), 'lr': 1e-6},
-    {'params': model_gpu.layer4.parameters(), 'lr': 1e-4},
+    {'params': model_gpu.layer4.parameters(), 'lr': 1e-5},
-    {'params': model_gpu.fc.parameters(), 'lr': 1e-4},
+    {'params': model_gpu.fc.parameters(), 'lr': 1e-5},
 ])
 scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.5)
--- a/yolo_test.py
+++ b/yolo_test.py
@ -0,0 +1,5 @@
 from super_gradients.training.models import get
 # Build the 'yolo_nas_l' model with 'coco' pretrained weights, then move it to the GPU.
 yolo_nas = get(model_name='yolo_nas_l', pretrained_weights='coco')
 yolo_nas = yolo_nas.cuda()
 # Run prediction on one sample image (conf=0.8, presumably the confidence threshold) and display it.
 prediction = yolo_nas.predict('橋迫鈴=angerme-new=12687767841-1.jpg', conf=0.8)
 prediction.show()