44 lines
2.0 KiB
Python
44 lines
2.0 KiB
Python
from h5py import File
|
|
from io import BytesIO
|
|
from os import getcwd
|
|
from os.path import join
|
|
from pandas import DataFrame, to_datetime, concat
|
|
from sqlite3 import connect
|
|
from tqdm import tqdm
|
|
|
|
# Convert an HDF5 dump of blog articles and their comments into two SQLite
# tables ('blog' and 'comments') stored in 'blog_post.sqlite'.
#
# Layout read from the dump, as accessed below:
#   <group>/<entry>/article                  -> dataset holding UTF-8 bytes,
#                                               with exactly three attributes
#   <group>/<entry>/comments_dataset/<name>  -> dataset holding UTF-8 bytes,
#                                               with exactly five attributes
filename = '/mnt/shm/blog_text.hdf5'

# Read the whole dump into memory once; h5py then reads from this buffer.
with open(file=filename, mode='rb') as f:
    file_bio = BytesIO(initial_bytes=f.read())

article_columns = ['group', 'theme', 'title', 'date', 'article']
comment_columns = ['blog_id', 'user_id', 'nickname', 'title', 'date', 'article']

# Rows are gathered in plain lists and turned into a DataFrame once per table.
# Growing a DataFrame one `.loc[...]` row at a time copies the data on every
# insert, which is quadratic in the number of rows.
article_keys, article_rows = [], []
comment_keys, comment_rows = [], []

# Nothing is written back to the dump, so read-only mode is sufficient.
with File(name=file_bio, mode='r') as hdf5:
    for blog_group in hdf5.keys():
        print(blog_group)
        group_node = hdf5[blog_group]

        for blog_entry in tqdm(group_node.keys()):
            entry_node = group_node[blog_entry]
            article_node = entry_node['article']
            blog_article = article_node[()].decode('utf-8')

            # NOTE(review): the attributes are unpacked by position, so this
            # relies on h5py yielding them in (theme, title, date) order --
            # confirm against whatever wrote the dump. A different attribute
            # count raises ValueError here.
            entry_theme, entry_title, entry_date = article_node.attrs.values()

            article_keys.append(blog_entry)
            article_rows.append(
                [blog_group, entry_theme, entry_title, entry_date, blog_article]
            )

            for comment_entry, comment_node in entry_node['comments_dataset'].items():
                comment_article = comment_node[()].decode('utf-8')

                # NOTE(review): same positional assumption as above, expected
                # order (blog_id, user_id, nickname, title, date) -- confirm.
                (comment_blog_id, comment_user_id, comment_nickname,
                 comment_title, comment_date) = comment_node.attrs.values()

                comment_keys.append(comment_entry)
                comment_rows.append(
                    [comment_blog_id, comment_user_id, comment_nickname,
                     comment_title, comment_date, comment_article]
                )

# dtype=object stores the values exactly as read, without type inference.
# The entry / comment names become the index, which `to_sql` writes out as an
# extra column. An empty dump yields empty frames (and therefore empty tables)
# instead of failing on a concatenation of nothing.
article_table = DataFrame(
    data=article_rows, index=article_keys, columns=article_columns, dtype=object
)
comment_table = DataFrame(
    data=comment_rows, index=comment_keys, columns=comment_columns, dtype=object
)

# sqlite3's connection context manager only commits (or rolls back) the
# transaction; it does not close the connection, so close it explicitly.
# `to_sql` is left on its default `if_exists` policy, so a rerun against a
# database that already has these tables raises.
connector = connect('blog_post.sqlite')
try:
    with connector:
        article_table.to_sql(name='blog', con=connector)
        comment_table.to_sql(name='comments', con=connector)
finally:
    connector.close()
|