53 lines
2.5 KiB
Python
53 lines
2.5 KiB
Python
from sqlite3 import connect
|
|
from shutil import copyfile
|
|
from settings import datadir
|
|
from os.path import join
|
|
from os import getcwd
|
|
from numpy import array, tile, int_, rot90, where, ndarray, vectorize, str_
|
|
|
|
# copyfile(src=join(datadir(), 'blog_post.sqlite'), dst=join('/mnt/shm/blog_post.sqlite'))
|
|
hash_func = vectorize(hash, otypes=[str])
|
|
with connect(database='/mnt/shm/blog_post.sqlite') as connector:
|
|
cursor = connector.cursor()
|
|
print(cursor.execute("SELECT t.sql FROM sqlite_master t WHERE name = 'blog'").fetchone())
|
|
if 'article_cleaned' not in cursor.execute("SELECT t.sql FROM sqlite_master t WHERE name = 'blog'").fetchone()[0]:
|
|
cursor.execute("ALTER TABLE blog add article_cleaned TEXT")
|
|
print(cursor.execute("SELECT t.sql FROM sqlite_master t WHERE name = 'blog'").fetchone())
|
|
|
|
for (theme,) in cursor.execute(f'SELECT DISTINCT theme FROM blog').fetchall():
|
|
print(theme)
|
|
blog_contents = cursor.execute(
|
|
f'SELECT title,date,article,theme,"index" FROM blog WHERE theme = \'{theme}\' ORDER BY date').fetchall()
|
|
# cleaned_list = [None] * (blog_contents.__len__() - 1)
|
|
# cleaned_list = []
|
|
for i in range(blog_contents.__len__() - 1):
|
|
if blog_contents[i][3] != '八木栞':
|
|
pass
|
|
a = blog_contents[i]
|
|
b = blog_contents[i + 1]
|
|
# print(a)
|
|
# print(b)
|
|
a_list = array(a[2].split('\n'), dtype=object)
|
|
b_list = array(b[2].split('\n'), dtype=object)
|
|
a_hash_list = hash_func(a_list)
|
|
b_hash_list = hash_func(b_list)
|
|
# print(a_hash_list.shape)
|
|
# print(b_hash_list.shape)
|
|
a_ndarray: ndarray = tile(array(object=a_hash_list), reps=(*b_hash_list.shape, 1))
|
|
b_ndarray: ndarray = rot90(tile(array(object=b_hash_list), reps=(*a_hash_list.shape, 1)))
|
|
# print(a_ndarray.shape)
|
|
# print(b_ndarray.shape)
|
|
dup = a_ndarray == b_ndarray
|
|
# print(dup)
|
|
# print(list(zip(*where(dup))))
|
|
cleaned_text = '\n'.join(a_list[(~dup.any(axis=0))]).replace('\'', '\'\'')
|
|
cursor.execute(
|
|
f"UPDATE blog SET article_cleaned = \'{cleaned_text}\' WHERE \"index\" = {blog_contents[i][4]}")
|
|
# cleaned_list[i] = cleaned_text
|
|
# cleaned_list.append(cleaned_text)
|
|
print('\t' + blog_contents[i][0])
|
|
# input()
|
|
|
|
connector.cursor().close()
|
|
connector.commit()
|