import settings
import re
import sys
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from aiohttp import ClientSession, ClientTimeout
from itertools import chain
from asyncio import run, Semaphore, sleep, Lock
from datetime import datetime
from aiofiles import open as a_open
import time
from os import path, utime, stat, cpu_count, makedirs
from tqdm.asyncio import tqdm
from concurrent.futures import ProcessPoolExecutor, Future
from warnings import filterwarnings
import ujson
import requests

# Size of the semaphore that run_each() hands to every page / post / image task.
PARALLEL_LIMIT = 300

# Make sure the output directories exist under the configured data directory.
makedirs(path.join(settings.datadir(), 'blog_images'), exist_ok=True)
makedirs(path.join(settings.datadir(), 'blog_text'), exist_ok=True)

# Silence bs4's MarkupResemblesLocatorWarning.
filterwarnings('ignore', category=MarkupResemblesLocatorWarning, module='bs4')


async def run_each(name: str) -> None:
    """Scan the blog ``name`` and download the images found in its posts.

    The work happens in three phases, each driven through ``tqdm.gather`` so a
    progress bar is shown:

    1. every entry-list page (1 .. page count) is parsed into post URLs;
    2. every post URL is handed to ``parse_blog_post``;
    3. every ``(filename, url, date)`` triple produced by phase 2 is passed to
       ``download_image``.

    All tasks share one semaphore of size ``PARALLEL_LIMIT`` and one HTTP
    session with a 10-minute total timeout.

    Args:
        name: Blog identifier forwarded to ``parse_list_pages_count`` and
            ``parse_list_page``; also used as the first progress-bar label.

    Raises:
        Whatever the parsing / download helpers raise. The HTTP session and
        the process pool are released in that case as well.
    """
    sem: Semaphore = Semaphore(PARALLEL_LIMIT)
    timeout = ClientTimeout(total=10 * 60)

    # `async with` guarantees the session is closed even when a phase fails;
    # previously it was closed only after all three phases succeeded.
    async with ClientSession(trust_env=True,
                             headers=settings.request_header,
                             timeout=timeout) as session:
        list_pages_count = await parse_list_pages_count(name)
        print(name, list_pages_count)

        # Phase 1: collect post URLs from every list page.
        url_lists = await tqdm.gather(
            *[parse_list_page(name, i, sem, session)
              for i in range(1, list_pages_count + 1)],
            desc=name)
        url_list = list(chain.from_iterable(url_lists))

        # Diagnostic output: show every collected URL that lacks 'html'.
        for url in url_list:
            if 'html' not in url:
                print(url)

        # Phase 2: scan the posts. The pool and the lock are passed through to
        # parse_blog_post (presumably for CPU-bound parsing and serialized
        # writes -- confirm against that function). The `with` block shuts the
        # pool down on both success and failure.
        lock = Lock()
        with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
            image_links_per_post = await tqdm.gather(
                *[parse_blog_post(url, sem, session, executor, lock)
                  for url in url_list],
                desc='scan blog')

        # Phase 3: download every image referenced by the scanned posts.
        image_link_package = list(chain.from_iterable(image_links_per_post))
        await tqdm.gather(
            *[download_image(filename, url, date, sem, session)
              for filename, url, date in image_link_package],
            desc='downloading images')