Help us understand the problem. What is going on with this article?

地域再生計画をテキストマイニングして地方創生分析ーベクトル化&コレスポンデンス分析編

More than 1 year has passed since last update.

地域再生計画をテキストマイニングして地方創生分析ー単語カウント&ワードクラウド編の続きです。

認定年毎の傾向分析

ここからは、認定年毎に地域再生計画概要のテキストを分解して、年毎にどのような特徴があるのか分析していきたいと思います。改めて、認定年毎の地域再生計画数の確認。

# Number of certified regional revitalization plans per certification year.
data.groupby('year')[['plan_name']].agg(['count'])

# Split the plan abstracts by certification year into one <year>.txt each,
# then tokenize all of them with the project helper.
label = list(data['year'].unique())
os.chdir('/mnt/c/ubuntu_home/text_mining/regional_plan/year')
for year in label:
    # One file per year; each line is one plan abstract.
    # NOTE(review): `line_terminator` was renamed `lineterminator` in pandas 1.5
    # and removed in 2.0 — update the keyword when upgrading pandas.
    data[data['year'] == year]['plan_abs'].to_csv(
        "{}.txt".format(year), index=False, line_terminator='\n')
files = glob.glob('*.txt')  # e.g. ['2015.txt', ...]; reused later as column labels
# Project helper — presumably tokenizes every *.txt into a (file, term, POS) table;
# confirm against its definition.
subdata_year = word_list_with_fileID('txt')
os.chdir('/mnt/c/ubuntu_home/text_mining/regional_plan/')

regional_plan/yearというフォルダに、年毎のテキストデータを保存していきます。filesというのは後にカラム名で使うためです。実際の単語の分かち書きはword_list_with_fileIDで行っています。コーディングスキルがないので、時間かかります…。

まずはカラム名から.txtを削除。

# Strip the ".txt" suffix so the file names can serve as year column labels.
# The slice assignment rewrites the list in place, so `col_names` stays aliased
# to `files` — exactly as the original element-by-element loop did.
col_names = files
col_names[:] = [name.replace(".txt", "") for name in col_names]

次に必要なデータのみ抽出、subdata_year1の名前で格納。

# Generic/boilerplate terms excluded from the per-year analysis: common verbs,
# administrative stock phrases, and MeCab's '*' placeholder.
# The original list contained 'いる' twice; the duplicate is removed —
# membership filtering via isin() is unaffected.
stop_term = ['ある', 'いう', 'いる', 'する', 'できる', 'なる', '思う', 'れる', '図る', '行う', '*',
             '等', '地域', '事業', '活用', '整備', '目指す', '促進', '推進', '進める', '取り組む', '活かす', '実施',
             '本市', '本町', '本村', '平成', '実践', '向上', '分野']

# Keep only content words: nouns / adjectives / verbs whose sub-POS is general,
# proper noun, independent, or suru-verb stem; then drop the stop-word list.
subdata_year1 = subdata_year[subdata_year['pos1'].isin(['名詞', '形容詞', '動詞'])]
subdata_year1 = subdata_year1[subdata_year1['pos2'].isin(['一般', '固有名詞', '自立', 'サ変接続'])]
subdata_year1 = subdata_year1[~subdata_year1['term'].isin(stop_term)]

上位10単語はこちら

# Top 10 most frequent terms after filtering. `word_freq` is a project helper
# — presumably returns a term-frequency table sorted by count; confirm its definition.
word_freq(subdata_year1['term']).head(10)

コレスポンデンス分析(頻度)

# --- Term-by-year matrix of raw counts, input for correspondence analysis ---
# Project helper — presumably joins each year's tokens into one document string;
# confirm against its definition.
vec_data = preVectorizer(subdata_year1)

from sklearn.feature_extraction.text import CountVectorizer
# token_pattern keeps single-character tokens (the default drops them),
# which matters for Japanese words.
vec = CountVectorizer(token_pattern='(?u)\\b\\w+\\b')
vec.fit(vec_data)
X = vec.transform(vec_data)

# Rows = terms, columns = years.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
# use get_feature_names_out() on newer versions.
vec_mat = pd.DataFrame(X.toarray().transpose(), index=vec.get_feature_names())
vec_mat.columns = col_names

# BUG FIX: the original summed iloc[:, 0:-1], but the 'sum' column does not
# exist yet at this point, so the LAST year column was silently excluded from
# the totals. Sum over every year column instead.
vec_mat['sum'] = vec_mat.sum(axis=1)
vec_mat.sort_values('sum', ascending=False)  # notebook display only; result not assigned

12.png

こんな表が出来ます。sumが多いもののみを抽出。

# Restrict the correspondence analysis to frequent terms (total count > 200),
# then drop the helper 'sum' column (it is the last column).
vec_mat_ca = vec_mat.loc[vec_mat['sum'] > 200].drop(columns='sum')

print(vec_mat_ca.shape[0])

197単語。

# --- Correspondence analysis on raw counts, coloured by k-means clusters ---
data_ca = vec_mat_ca
ncol = data_ca.shape[1]
import mca
# Keep the module name intact: the original rebound `mca` to the fitted model,
# shadowing the module (it only kept working because the TF-IDF section
# re-imports it). Bind the fitted model to a distinct name instead.
ca_model = mca.MCA(data_ca, ncols=ncol, benzecri=False)

# First two factor scores for columns (years) and rows (terms).
result_col = pd.DataFrame(ca_model.fs_c(N=2))
result_col.index = list(data_ca.columns)
result_row = pd.DataFrame(ca_model.fs_r(N=2))
result_row.index = list(data_ca.index)

from sklearn.cluster import KMeans
# NOTE(review): no random_state is set, so cluster labels (and hence colours)
# can differ between runs.
n_cluster_col = 2
x_kmeans_col = KMeans(n_cluster_col).fit(result_col)

n_cluster_row = 2
x_kmeans_row = KMeans(n_cluster_row).fit(result_row)

plt.figure(figsize=(30, 20))
plt.rcParams["font.size"] = 10

# Years: 'x' markers with boxed labels, coloured by column cluster.
col_list = list(result_col.index)
result_col = result_col.join(pd.DataFrame(x_kmeans_col.labels_,
                                          index=result_col.index, columns=['group']))
myPalette_col = sns.color_palette("dark", n_cluster_col)
for i in range(result_col.shape[0]):
    # Loops instead of throwaway list comprehensions: scatter is a side effect.
    plt.scatter(result_col.iloc[i, 0], result_col.iloc[i, 1],
                color=myPalette_col[int(result_col.iloc[i, 2])], marker="x")
texts_col = [plt.text(result_col.iloc[i, 0], result_col.iloc[i, 1], col_list[i],
                      bbox=dict(facecolor='none', edgecolor='black'),
                      fontproperties=font_prop,
                      color=myPalette_col[int(result_col.iloc[i, 2])])
             for i in range(result_col.shape[0])]

# Terms: dot markers with plain labels; adjust_text untangles overlaps.
row_list = list(result_row.index)
result_row = result_row.join(pd.DataFrame(x_kmeans_row.labels_,
                                          index=result_row.index, columns=['group']))
myPalette_row = sns.color_palette("muted", n_cluster_row)
for i in range(result_row.shape[0]):
    plt.scatter(result_row.iloc[i, 0], result_row.iloc[i, 1],
                color=myPalette_row[int(result_row.iloc[i, 2])], marker=".", alpha=0.7)
texts_row = [plt.text(result_row.iloc[i, 0], result_row.iloc[i, 1], row_list[i],
                      fontproperties=font_prop,
                      color=myPalette_row[int(result_row.iloc[i, 2])])
             for i in range(len(row_list))]
adjust_text(texts_row, arrowprops=dict(arrowstyle='-', color='black'))

13.png

コレスポンデンス分析(TF-IDF)

# --- Term-by-year matrix of TF-IDF weights, input for correspondence analysis ---
vec_data = preVectorizer(subdata_year1)

from sklearn.feature_extraction.text import TfidfVectorizer
# token_pattern keeps single-character tokens (the default drops them).
vec = TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b')
vec.fit(vec_data)
X = vec.transform(vec_data)

# Rows = terms, columns = years.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
# use get_feature_names_out() on newer versions.
vec_mat = pd.DataFrame(X.toarray().transpose(), index=vec.get_feature_names())
# `files` shares its list with `col_names`, whose ".txt" suffixes were stripped
# in place earlier — so these labels are bare years, same as the count matrix.
vec_mat.columns = files

# BUG FIX: the original summed iloc[:, 0:-1] before the 'sum' column existed,
# silently excluding the LAST year column. Sum over every year column.
vec_mat['sum'] = vec_mat.sum(axis=1)
vec_mat.sort_values('sum', ascending=False)  # notebook display only; result not assigned

14.png

# Restrict the correspondence analysis to high-weight terms (TF-IDF total > 0.25),
# then drop the helper 'sum' column (it is the last column).
vec_mat_ca = vec_mat.loc[vec_mat['sum'] > 0.25].drop(columns='sum')

print(vec_mat_ca.shape[0])

192単語。

# --- Correspondence analysis on TF-IDF weights, coloured by k-means clusters ---
data_ca = vec_mat_ca
ncol = data_ca.shape[1]
import mca
# Keep the module name intact: the original rebound `mca` to the fitted model,
# shadowing the module. Bind the fitted model to a distinct name instead.
ca_model = mca.MCA(data_ca, ncols=ncol, benzecri=False)

# First two factor scores for columns (years) and rows (terms).
result_col = pd.DataFrame(ca_model.fs_c(N=2))
result_col.index = list(data_ca.columns)
result_row = pd.DataFrame(ca_model.fs_r(N=2))
result_row.index = list(data_ca.index)

from sklearn.cluster import KMeans
# NOTE(review): no random_state is set, so cluster labels (and hence colours)
# can differ between runs.
n_cluster_col = 2
x_kmeans_col = KMeans(n_cluster_col).fit(result_col)

n_cluster_row = 2
x_kmeans_row = KMeans(n_cluster_row).fit(result_row)

plt.figure(figsize=(30, 20))
plt.rcParams["font.size"] = 10

# Years: 'x' markers with boxed labels, coloured by column cluster.
col_list = list(result_col.index)
result_col = result_col.join(pd.DataFrame(x_kmeans_col.labels_,
                                          index=result_col.index, columns=['group']))
myPalette_col = sns.color_palette("dark", n_cluster_col)
for i in range(result_col.shape[0]):
    # Loops instead of throwaway list comprehensions: scatter is a side effect.
    plt.scatter(result_col.iloc[i, 0], result_col.iloc[i, 1],
                color=myPalette_col[int(result_col.iloc[i, 2])], marker="x")
texts_col = [plt.text(result_col.iloc[i, 0], result_col.iloc[i, 1], col_list[i],
                      bbox=dict(facecolor='none', edgecolor='black'),
                      fontproperties=font_prop,
                      color=myPalette_col[int(result_col.iloc[i, 2])])
             for i in range(result_col.shape[0])]

# Terms: dot markers with plain labels; adjust_text untangles overlaps.
row_list = list(result_row.index)
result_row = result_row.join(pd.DataFrame(x_kmeans_row.labels_,
                                          index=result_row.index, columns=['group']))
myPalette_row = sns.color_palette("muted", n_cluster_row)
for i in range(result_row.shape[0]):
    plt.scatter(result_row.iloc[i, 0], result_row.iloc[i, 1],
                color=myPalette_row[int(result_row.iloc[i, 2])], marker=".", alpha=0.7)
texts_row = [plt.text(result_row.iloc[i, 0], result_row.iloc[i, 1], row_list[i],
                      fontproperties=font_prop,
                      color=myPalette_row[int(result_row.iloc[i, 2])])
             for i in range(len(row_list))]
adjust_text(texts_row, arrowprops=dict(arrowstyle='-', color='black'))

15.png

hrkz_szk
英国のとある大学の数学研究科修士課程を卒業。数学を使った仕事をしたいのに、現職はまったく関係ないところに配属されてしまった。専門は統計学と最適化数学、数理ファイナンスです。
Why not register and get more from Qiita?
  1. We will deliver articles that match you
    By following users and tags, you can catch up information on technical fields that you are interested in as a whole
  2. you can read useful information later efficiently
    By "stocking" the articles you like, you can search right away