import sys import re import time from io import BytesIO from h5py import File, string_dtype import requests from numpy import array, ceil from tqdm import tqdm from settings import datadir, theme_curator from concurrent.futures import ProcessPoolExecutor from os import cpu_count from os.path import join from bs4 import BeautifulSoup import ujson from more_itertools import chunked from datetime import datetime, timezone, timedelta JST = timezone(timedelta(hours=9), "JST") def parse_article(url: str) -> tuple[str, str, str, str, str]: while True: with requests.get(url) as resp: html = resp.text try: json_obj = list(ujson.loads(re.findall(r'