LoginSignup
4
7

More than 3 years have passed since last update.

地域再生計画をテキストマイニングして地方創生分析ーベクトル化&コレスポンデンス分析編

Posted at

地域再生計画をテキストマイニングして地方創生分析ー単語カウント&ワードクラウド編の続きです。

認定年毎の傾向分析

ここからは、認定年毎に地域再生計画概要のテキストを分解して、年毎にどのような特徴があるのか分析していきたいと思います。改めて、認定年毎の地域再生計画数の確認。

# Number of certified regional revitalisation plans per certification year.
data.loc[:, ['year', 'plan_name']].groupby(['year']).agg(['count'])

# Write one text file per certification year: each file holds that year's
# plan abstracts, saved under regional_plan/year/.
label = list(data['year'].unique())
os.chdir('/mnt/c/ubuntu_home/text_mining/regional_plan/year')
# Iterate the labels directly instead of indexing via range(len(...)).
for year in label:
    data[data['year'] == year]['plan_abs'].to_csv(
        "{}.txt".format(year), index=False, line_terminator='\n')
# NOTE(review): 'line_terminator' was renamed 'lineterminator' in pandas 1.5+.
files = glob.glob('*.txt')  # file names are reused later as column labels
# Tokenise every *.txt in the current directory (project helper; slow).
subdata_year = word_list_with_fileID('txt')
os.chdir('/mnt/c/ubuntu_home/text_mining/regional_plan/')

regional_plan/yearというフォルダに、年毎のテキストデータを保存していきます。filesというのは後にカラム名で使うためです。実際の単語の分かち書きはword_list_with_fileIDで行っています。コーディングスキルがないので、時間かかります…。

まずはカラム名から.txtを削除。

# Strip the ".txt" extension to get clean column labels.
# NOTE: col_names is an alias of files (no copy is made), so 'files' is
# updated in place as well -- later code reads the stripped names through
# 'files' too, so keep the in-place mutation.
col_names = files
for i, name in enumerate(col_names):
    # FIX: remove only a trailing ".txt"; str.replace(".txt", "") would also
    # delete the substring if it ever appeared in the middle of a name.
    col_names[i] = name[:-len(".txt")] if name.endswith(".txt") else name

次に必要なデータのみ抽出、subdata_year1の名前で格納。

# Stop words: generic verbs/fillers plus domain boilerplate ("地域", "事業",
# "推進", ...) that appears in nearly every plan and carries no signal.
# FIX: the original list contained 'いる' twice; the duplicate is removed
# (membership filtering via isin is unaffected).
stop_term = ['ある','いう','いる','する','できる','なる','思う','れる','図る','行う','*',
             '等','地域','事業','活用','整備','目指す','促進','推進','進める','取り組む','活かす','実施',
             '本市','本町','本村','平成','実践','向上','分野']

# Keep only content words: nouns, adjectives and verbs (pos1), ...
# Use isin() instead of chaining == comparisons with '|'.
subdata_year1 = subdata_year[subdata_year['pos1'].isin(['名詞', '形容詞', '動詞'])]

# ...restricted to the general / proper-noun / independent / verbal-noun
# sub-categories (pos2), which drops particles, auxiliaries, suffixes etc.
subdata_year1 = subdata_year1[subdata_year1['pos2'].isin(['一般', '固有名詞', '自立', 'サ変接続'])]

# Finally remove the stop words defined above.
subdata_year1 = subdata_year1[~subdata_year1['term'].isin(stop_term)]

上位10単語はこちら

# Show the ten most frequent terms after filtering (word_freq is a
# project helper defined earlier in the article).
word_freq(subdata_year1.loc[:, 'term']).head(10)

コレスポンデンス分析(頻度)

# Build a term-by-year raw-frequency matrix from the tokenised text.
vec_data = preVectorizer(subdata_year1)

from sklearn.feature_extraction.text import CountVectorizer
# The explicit token_pattern keeps single-character tokens (the default
# pattern drops 1-char words, which would lose many Japanese terms).
vec = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
X = vec.fit_transform(vec_data)  # fit + transform in one call

# Rows = terms, columns = one document per certification year.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
# use get_feature_names_out() on newer versions.
vec_mat = pd.DataFrame(X.toarray().transpose(), index=vec.get_feature_names())
vec_mat.columns = col_names

# BUGFIX: the original computed iloc[:, 0:-1].sum(axis=1), but the 'sum'
# column does not exist yet when the slice is taken, so the LAST year's
# column was silently excluded from the total.  Sum over every column.
vec_mat['sum'] = vec_mat.sum(axis=1)
vec_mat.sort_values('sum', ascending=False)  # display only; result not stored

12.png

こんな表が出来ます。sumが多いもののみを抽出。

# Keep only terms whose total frequency across all years exceeds 200,
# then drop the helper 'sum' column (it is the last column).
vec_mat_ca = vec_mat.loc[vec_mat['sum'] > 200].drop(columns='sum')

print(vec_mat_ca.shape[0])

197単語。

# Correspondence analysis on the term-by-year frequency table.
data_ca = vec_mat_ca
ncol = data_ca.shape[1]
import mca
# FIX: bind the fitted model to its own name instead of shadowing the
# 'mca' module (the original rebound the module name to the instance).
ca_model = mca.MCA(data_ca, ncols=ncol, benzecri=False)

# Factor scores for columns (years) and rows (terms), first two dimensions.
result_col = pd.DataFrame(ca_model.fs_c(N=2))
result_col.index = list(data_ca.columns)
result_row = pd.DataFrame(ca_model.fs_r(N=2))
result_row.index = list(data_ca.index)

from sklearn.cluster import KMeans

# Cluster the 2-D factor scores to colour the plot.
# FIX: pin random_state so cluster assignments (and hence colours)
# are reproducible between runs.
n_cluster_col = 2
x_kmeans_col = KMeans(n_cluster_col, random_state=0).fit(result_col)

n_cluster_row = 2
x_kmeans_row = KMeans(n_cluster_row, random_state=0).fit(result_row)

# Large canvas: roughly 200 labelled points will be drawn.
plt.figure(figsize=(30, 20))
plt.rcParams["font.size"] = 10

# Plot years (columns) and terms (rows) on the same biplot, coloured by
# their KMeans cluster.  FIX: the original used list comprehensions purely
# for their side effects; plain for-loops are the idiomatic form.
col_list = list(result_col.index)
result_col = result_col.join(pd.DataFrame(x_kmeans_col.labels_, index=result_col.index, columns=['group']))

myPalette_col = sns.color_palette("dark", n_cluster_col)
for i in range(result_col.shape[0]):
    plt.scatter(result_col.iloc[i, 0], result_col.iloc[i, 1],
                color=myPalette_col[int(result_col.iloc[i, 2])], marker="x")
# NOTE(review): texts_col is built but never passed to adjust_text below --
# only the row labels are de-overlapped; confirm this is intentional.
texts_col = []
for i in range(result_col.shape[0]):
    texts_col.append(plt.text(result_col.iloc[i, 0], result_col.iloc[i, 1], col_list[i],
                              bbox=dict(facecolor='none', edgecolor='black'),
                              fontproperties=font_prop,
                              color=myPalette_col[int(result_col.iloc[i, 2])]))

row_list = list(result_row.index)
result_row = result_row.join(pd.DataFrame(x_kmeans_row.labels_, index=result_row.index, columns=['group']))
myPalette_row = sns.color_palette("muted", n_cluster_row)
for i in range(result_row.shape[0]):
    plt.scatter(result_row.iloc[i, 0], result_row.iloc[i, 1],
                color=myPalette_row[int(result_row.iloc[i, 2])], marker=".", alpha=0.7)
texts_row = []
for i in range(len(row_list)):
    texts_row.append(plt.text(result_row.iloc[i, 0], result_row.iloc[i, 1], row_list[i],
                              fontproperties=font_prop,
                              color=myPalette_row[int(result_row.iloc[i, 2])]))
# De-overlap the (many) term labels.
adjust_text(texts_row, arrowprops=dict(arrowstyle='-', color='black'))

13.png

コレスポンデンス分析(TF-IDF)

# Same matrix as above, but TF-IDF weighted instead of raw counts.
vec_data = preVectorizer(subdata_year1)

from sklearn.feature_extraction.text import TfidfVectorizer
# Same token_pattern as the count version: keep single-character tokens.
vec = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
X = vec.fit_transform(vec_data)  # fit + transform in one call

# NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
# use get_feature_names_out() on newer versions.
vec_mat = pd.DataFrame(X.toarray().transpose(), index=vec.get_feature_names())
# 'files' was stripped of ".txt" in place earlier (it aliases col_names),
# so these are the clean year labels.
vec_mat.columns = files

# BUGFIX: the original iloc[:, 0:-1].sum(axis=1) silently excluded the last
# year's column ('sum' does not exist yet when the slice is evaluated).
vec_mat['sum'] = vec_mat.sum(axis=1)
vec_mat.sort_values('sum', ascending=False)  # display only; result not stored

14.png

# Keep only terms whose total TF-IDF weight across years exceeds 0.25,
# then drop the helper 'sum' column (it is the last column).
vec_mat_ca = vec_mat.loc[vec_mat['sum'] > 0.25].drop(columns='sum')

print(vec_mat_ca.shape[0])

192単語。

# Correspondence analysis on the TF-IDF weighted term-by-year table.
data_ca = vec_mat_ca
ncol = data_ca.shape[1]
import mca
# FIX: bind the fitted model to its own name instead of shadowing the
# 'mca' module (the original rebound the module name to the instance).
ca_model = mca.MCA(data_ca, ncols=ncol, benzecri=False)

# Factor scores for columns (years) and rows (terms), first two dimensions.
result_col = pd.DataFrame(ca_model.fs_c(N=2))
result_col.index = list(data_ca.columns)
result_row = pd.DataFrame(ca_model.fs_r(N=2))
result_row.index = list(data_ca.index)

from sklearn.cluster import KMeans

# Cluster the 2-D factor scores to colour the plot.
# FIX: pin random_state so cluster assignments (and hence colours)
# are reproducible between runs.
n_cluster_col = 2
x_kmeans_col = KMeans(n_cluster_col, random_state=0).fit(result_col)

n_cluster_row = 2
x_kmeans_row = KMeans(n_cluster_row, random_state=0).fit(result_row)

# Large canvas: roughly 200 labelled points will be drawn.
plt.figure(figsize=(30, 20))
plt.rcParams["font.size"] = 10

# Biplot for the TF-IDF variant, coloured by KMeans cluster.
# FIX: the original used list comprehensions purely for their side effects;
# plain for-loops are the idiomatic form.
col_list = list(result_col.index)
result_col = result_col.join(pd.DataFrame(x_kmeans_col.labels_, index=result_col.index, columns=['group']))

myPalette_col = sns.color_palette("dark", n_cluster_col)
for i in range(result_col.shape[0]):
    plt.scatter(result_col.iloc[i, 0], result_col.iloc[i, 1],
                color=myPalette_col[int(result_col.iloc[i, 2])], marker="x")
# NOTE(review): texts_col is built but never passed to adjust_text below --
# only the row labels are de-overlapped; confirm this is intentional.
texts_col = []
for i in range(result_col.shape[0]):
    texts_col.append(plt.text(result_col.iloc[i, 0], result_col.iloc[i, 1], col_list[i],
                              bbox=dict(facecolor='none', edgecolor='black'),
                              fontproperties=font_prop,
                              color=myPalette_col[int(result_col.iloc[i, 2])]))

row_list = list(result_row.index)
result_row = result_row.join(pd.DataFrame(x_kmeans_row.labels_, index=result_row.index, columns=['group']))
myPalette_row = sns.color_palette("muted", n_cluster_row)
for i in range(result_row.shape[0]):
    plt.scatter(result_row.iloc[i, 0], result_row.iloc[i, 1],
                color=myPalette_row[int(result_row.iloc[i, 2])], marker=".", alpha=0.7)
texts_row = []
for i in range(len(row_list)):
    texts_row.append(plt.text(result_row.iloc[i, 0], result_row.iloc[i, 1], row_list[i],
                              fontproperties=font_prop,
                              color=myPalette_row[int(result_row.iloc[i, 2])]))
# De-overlap the (many) term labels.
adjust_text(texts_row, arrowprops=dict(arrowstyle='-', color='black'))

15.png

4
7
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
4
7