from pprint import pprint from typing import List, Tuple import h5py import settings import re import sys from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning from aiohttp import ClientSession, ClientConnectorError, ClientTimeout from itertools import chain from asyncio import run, Semaphore, sleep, Lock from datetime import datetime from aiofiles import open as a_open import time from os import path, utime, stat, cpu_count, makedirs from tqdm.asyncio import tqdm from concurrent.futures import as_completed, ProcessPoolExecutor, Future, ThreadPoolExecutor from ujson import loads from warnings import filterwarnings from h5py import File, special_dtype, string_dtype from io import BytesIO from numpy import void, array import ujson import orjson import requests PARALLEL_LIMIT = 300 makedirs(path.join(settings.datadir(), 'blog_images'), exist_ok=True) makedirs(path.join(settings.datadir(), 'blog_text'), exist_ok=True) filterwarnings('ignore', category=MarkupResemblesLocatorWarning, module='bs4') async def run_each(name: str) -> None: sem: Semaphore = Semaphore(PARALLEL_LIMIT) session: ClientSession = ClientSession(trust_env=True, headers=settings.request_header, timeout=ClientTimeout(total=10 * 60)) list_pages_count = await parse_list_pages_count(name) print(name, list_pages_count) url_lists = await tqdm.gather(*[parse_list_page(name, i, sem, session) for i in range(1, list_pages_count + 1)], desc=name) url_list = list(chain.from_iterable(url_lists)) for url in url_list: if 'html' not in url: print(url) executor = ProcessPoolExecutor(max_workers=cpu_count()) lock = Lock() futures = await tqdm.gather( *[parse_blog_post(url, sem, session, executor, lock) for url in url_list], desc='scan blog') executor.shutdown() image_link_package = list(chain.from_iterable(futures)) await tqdm.gather( *[download_image(filename, url, date, sem, session) for filename, url, date in image_link_package], desc='downloading images') await session.close() async def parse_list_pages_count(blog_name: str) -> int: async with ClientSession(trust_env=True, headers=settings.request_header) as session: async with session.get(f'https://ameblo.jp/{blog_name}/entrylist.html') as resp: resp_html = await resp.text() json_obj = ujson.loads(re.findall(r'