import settings
import re
import sys
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from aiohttp import ClientSession, ClientConnectorError, ClientTimeout
from itertools import chain
from asyncio import run, Semaphore, sleep
from datetime import datetime
from aiofiles import open
from os import path, utime, stat, cpu_count, makedirs
from tqdm.asyncio import tqdm
from concurrent.futures import as_completed, ProcessPoolExecutor, Future
from ujson import loads
from warnings import filterwarnings

# Size of the Semaphore that run_each creates and hands to every helper
# coroutine it starts.
PARALLEL_LIMIT = 300

# Make sure the output directories exist before anything is written to them.
makedirs(path.join(settings.datadir(), 'blog_images'), exist_ok=True)
makedirs(path.join(settings.datadir(), 'blog_text'), exist_ok=True)

# Silence bs4's MarkupResemblesLocatorWarning.
filterwarnings('ignore', category=MarkupResemblesLocatorWarning, module='bs4')


async def run_each(name: str) -> None:
    """Scan one blog end to end and download the images found in its posts.

    Steps, in order:

    1. Ask ``parse_list_pages_count`` how many entry-list pages the blog has.
    2. Run ``parse_list_page`` for every list page and flatten the returned
       URL lists into a single list of post URLs.
    3. Run ``parse_blog_post`` for every post URL, passing a process pool.
       Each call is expected to return a ``concurrent.futures.Future``
       (NOTE(review): inferred from the ``as_completed``/``result()`` usage
       below - confirm against ``parse_blog_post``).
    4. Collect the futures' results, flatten them into
       ``(filename, url, date)`` tuples and run ``download_image`` for each.

    The HTTP session and the process pool are both managed by context
    managers so they are released even if one of the steps raises.

    Args:
        name: Blog name, forwarded to ``parse_list_pages_count`` and
            ``parse_list_page`` and used as the progress-bar label.
    """
    sem: Semaphore = Semaphore(PARALLEL_LIMIT)
    async with ClientSession(
        trust_env=True,
        headers=settings.request_header,
        timeout=ClientTimeout(total=10 * 60),
    ) as session:
        list_pages_count = await parse_list_pages_count(name)
        print(name, list_pages_count)

        url_lists = await tqdm.gather(
            *[
                parse_list_page(name, i, sem, session)
                for i in range(1, list_pages_count + 1)
            ],
            desc=name,
        )
        url_list = list(chain.from_iterable(url_lists))

        # Diagnostic output: surface any collected URL that lacks 'html'.
        for url in url_list:
            if 'html' not in url:
                print(url)

        # Leaving the ``with`` block shuts the pool down (waiting for its
        # workers), which happens before the image downloads start - the same
        # point at which the pool was shut down explicitly before.
        with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
            futures = await tqdm.gather(
                *[
                    parse_blog_post(url, sem, session, executor)
                    for url in url_list
                ],
                desc='scan blog',
            )
            # as_completed() here is the blocking concurrent.futures variant;
            # every coroutine above has already been awaited at this point.
            images_list = [
                future.result()
                for future in tqdm(
                    as_completed(futures),
                    desc='waiting processing ' + name,
                    total=len(futures),
                )
            ]

        image_link_package = list(chain.from_iterable(images_list))
        await tqdm.gather(
            *[
                download_image(filename, url, date, sem, session)
                for filename, url, date in image_link_package
            ],
            desc='downloading images',
        )
session.get(f'https://ameblo.jp/{blog_name}/entrylist.html') as resp: resp_html = await resp.text() json_obj = loads(re.findall(r'