# helloproject-ai/text_processing/sqlite_process.py
#
# 53 lines
# 2.5 KiB
# Python

from sqlite3 import connect
from shutil import copyfile
from settings import datadir
from os.path import join
from os import getcwd
from numpy import array, tile, int_, rot90, where, ndarray, vectorize, str_
# copyfile(src=join(datadir(), 'blog_post.sqlite'), dst=join('/mnt/shm/blog_post.sqlite'))
# Location of the working copy of the blog database that this script updates.
DATABASE_PATH = '/mnt/shm/blog_post.sqlite'
# Returns the CREATE statement stored in sqlite_master for the entry named 'blog'.
BLOG_SCHEMA_QUERY = "SELECT t.sql FROM sqlite_master t WHERE name = 'blog'"


def _drop_lines_shared_with_next(article, next_article):
    """Return ``article`` without the lines that also occur in ``next_article``.

    Both texts are split on newlines and compared line by line as exact
    strings. Surviving lines keep their original order and are re-joined with
    newlines.

    Args:
        article: Text of the post being cleaned, or ``None`` for a NULL cell.
        next_article: Text of the post that follows it, or ``None``.

    Returns:
        The filtered text, or ``None`` when ``article`` is ``None``.
    """
    if article is None:
        return None
    # A NULL successor shares no lines with anything.
    shared = set(next_article.split('\n')) if next_article is not None else set()
    return '\n'.join(line for line in article.split('\n') if line not in shared)


def _ensure_article_cleaned_column(cursor):
    """Add the ``article_cleaned`` TEXT column to ``blog`` if it is missing.

    The schema row is printed before and after the check, so the log shows
    whether the column was added.

    Args:
        cursor: An open sqlite3 cursor on the blog database.

    Raises:
        RuntimeError: If sqlite_master has no entry named ``blog``.
    """
    schema_row = cursor.execute(BLOG_SCHEMA_QUERY).fetchone()
    print(schema_row)
    if schema_row is None:
        raise RuntimeError("table 'blog' not found in " + DATABASE_PATH)
    # The column is detected by looking for its name in the CREATE statement.
    if 'article_cleaned' not in schema_row[0]:
        cursor.execute("ALTER TABLE blog add article_cleaned TEXT")
    print(cursor.execute(BLOG_SCHEMA_QUERY).fetchone())


def _clean_theme(cursor, theme):
    """Fill ``article_cleaned`` for the posts of one theme.

    Posts are read in ``date`` order and each one is compared with the post
    that follows it. Lines present in both are removed from the earlier post
    and the result is stored in its ``article_cleaned`` column. The title of
    every updated post is printed, indented by a tab.

    Args:
        cursor: An open sqlite3 cursor on the blog database.
        theme: Value of the ``theme`` column selecting the posts to process.
    """
    # Values are bound with placeholders, so quotes in a theme or an article
    # cannot break or alter the statements.
    posts = cursor.execute(
        'SELECT title,date,article,theme,"index" FROM blog WHERE theme = ? ORDER BY date',
        (theme,)).fetchall()
    # NOTE: the last post of a theme has no successor to compare against, so
    # it is never written here and its article_cleaned stays as it was.
    for current_post, next_post in zip(posts, posts[1:]):
        title, _date, article, _theme, row_index = current_post
        cleaned_text = _drop_lines_shared_with_next(article, next_post[2])
        cursor.execute(
            'UPDATE blog SET article_cleaned = ? WHERE "index" = ?',
            (cleaned_text, row_index))
        print('\t' + title)


connector = connect(database=DATABASE_PATH)
try:
    # The connection's context manager commits on success and rolls back on
    # an exception, but it does not close the connection; the outer
    # try/finally does that.
    with connector:
        cursor = connector.cursor()
        try:
            _ensure_article_cleaned_column(cursor)
            themes = cursor.execute('SELECT DISTINCT theme FROM blog').fetchall()
            for (theme,) in themes:
                print(theme)
                _clean_theme(cursor, theme)
        finally:
            cursor.close()
finally:
    connector.close()