helloproject-ai/text_processing/hdf2sql.py

44 lines
2.0 KiB
Python

from h5py import File
from io import BytesIO
from os import getcwd
from os.path import join
from pandas import DataFrame, to_datetime, concat
from sqlite3 import connect
from tqdm import tqdm
# Path of the HDF5 file that holds the blog texts to convert.
filename = '/mnt/shm/blog_text.hdf5'

# Read the complete file once and wrap the bytes in a seekable buffer, so the
# HDF5 reader below works on an in-memory copy instead of the file on disk.
with open(filename, 'rb') as source_file:
    file_bio = BytesIO(source_file.read())

# Accumulators filled while walking the HDF5 tree: one DataFrame per blog
# group (articles) and one DataFrame per blog entry (its comments).
article_tables, comment_tables = [], []
# Column layouts of the two tables that are exported later on.
ARTICLE_COLUMNS = ['group', 'theme', 'title', 'date', 'article']
COMMENT_COLUMNS = ['blog_id', 'user_id', 'nickname', 'title', 'date', 'article']

# Walk the in-memory HDF5 file. Each top-level group is treated as one blog
# group and each of its children as one blog entry. An entry provides an
# 'article' dataset (UTF-8 encoded bytes plus three attributes) and a
# 'comments_dataset' group with one dataset per comment (UTF-8 encoded bytes
# plus five attributes).
# The file is only read, so it is opened read-only.
with File(name=file_bio, mode='r') as hdf5:
    for blog_group, group_entries in hdf5.items():
        print(blog_group)

        # Rows are collected in plain lists and turned into a DataFrame once
        # per group; growing a DataFrame row by row via `.loc` reallocates
        # the whole frame on every insert.
        article_index = []
        article_rows = []
        for blog_entry, entry in tqdm(group_entries.items()):
            article = entry['article']
            blog_article = article[()].decode('utf-8')
            # NOTE(review): the attributes are unpacked purely by position;
            # this assumes h5py yields them as theme, title, date -- confirm
            # against the code that wrote the file.
            (_, entry_theme), (_, entry_title), (_, entry_date) = article.attrs.items()
            article_index.append(blog_entry)
            article_rows.append(
                [blog_group, entry_theme, entry_title, entry_date, blog_article]
            )

            comment_index = []
            comment_rows = []
            for comment_entry, comment_text in entry['comments_dataset'].items():
                comment_article = comment_text[()].decode('utf-8')
                # NOTE(review): same positional assumption as for the article
                # attributes (blog_id, user_id, nickname, title, date).
                (comment_blog_id, comment_user_id, comment_nickname,
                 comment_title, comment_date) = comment_text.attrs.values()
                comment_index.append(comment_entry)
                comment_rows.append(
                    [comment_blog_id, comment_user_id, comment_nickname,
                     comment_title, comment_date, comment_article]
                )
            # One comment table per blog entry, indexed by the comment key.
            # An entry without comments yields an empty table with the same
            # columns.
            comment_tables.append(
                DataFrame(comment_rows, index=comment_index, columns=COMMENT_COLUMNS)
            )

        # One article table per blog group, indexed by the blog entry key.
        article_tables.append(
            DataFrame(article_rows, index=article_index, columns=ARTICLE_COLUMNS)
        )
# Write the collected tables to the SQLite database 'blog_post.sqlite':
# all article tables go into table 'blog', all comment tables into table
# 'comments'. `to_sql` is called with its defaults, so the DataFrame index is
# stored as a column as well and an already existing table makes the call
# raise instead of being overwritten.
connector = connect('blog_post.sqlite')
try:
    # Used as a context manager, the connection only commits on success (or
    # rolls back on an exception); it does not close itself, hence the
    # explicit close() below.
    with connector:
        concat(objs=article_tables).to_sql(name='blog', con=connector)
        concat(comment_tables).to_sql(name='comments', con=connector)
finally:
    connector.close()