diff --git a/ameblo_download.py b/ameblo_download.py index 93dbfb4..6f8a9f7 100755 --- a/ameblo_download.py +++ b/ameblo_download.py @@ -1,17 +1,29 @@ +from pprint import pprint +from typing import List, Tuple + +import h5py + import settings import re import sys from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning from aiohttp import ClientSession, ClientConnectorError, ClientTimeout from itertools import chain -from asyncio import run, Semaphore, sleep +from asyncio import run, Semaphore, sleep, Lock from datetime import datetime -from aiofiles import open +from aiofiles import open as a_open +import time from os import path, utime, stat, cpu_count, makedirs from tqdm.asyncio import tqdm -from concurrent.futures import as_completed, ProcessPoolExecutor, Future +from concurrent.futures import as_completed, ProcessPoolExecutor, Future, ThreadPoolExecutor from ujson import loads from warnings import filterwarnings +from h5py import File, special_dtype, string_dtype +from io import BytesIO +from numpy import void, array +import ujson +import orjson +import requests PARALLEL_LIMIT = 300 @@ -25,7 +37,6 @@ async def run_each(name: str) -> None: sem: Semaphore = Semaphore(PARALLEL_LIMIT) session: ClientSession = ClientSession(trust_env=True, headers=settings.request_header, timeout=ClientTimeout(total=10 * 60)) - list_pages_count = await parse_list_pages_count(name) print(name, list_pages_count) @@ -38,14 +49,13 @@ async def run_each(name: str) -> None: for url in url_list: if 'html' not in url: print(url) - executor = ProcessPoolExecutor(max_workers=cpu_count()) - futures = await tqdm.gather(*[parse_blog_post(url, sem, session, executor) for url in url_list], desc='scan blog') - images_list = list() - for future in tqdm(as_completed(futures), desc='waiting processing ' + name, total=len(futures)): - images_list.append(future.result()) + lock = Lock() + futures = await tqdm.gather( + *[parse_blog_post(url, sem, session, executor, lock) for url in url_list], + desc='scan blog') executor.shutdown() - image_link_package = list(chain.from_iterable(images_list)) + image_link_package = list(chain.from_iterable(futures)) await tqdm.gather( *[download_image(filename, url, date, sem, session) for filename, url, date in image_link_package], @@ -58,7 +68,7 @@ async def parse_list_pages_count(blog_name: str) -> int: async with ClientSession(trust_env=True, headers=settings.request_header) as session: async with session.get(f'https://ameblo.jp/{blog_name}/entrylist.html') as resp: resp_html = await resp.text() - json_obj = loads(re.findall(r'