What I want to do
Build a web crawler while following videos #1, #2, and #3. The goal: go to a specified site and display the words used in each post-title link, ordered from most to least frequent.
Video #1
import requests
from bs4 import BeautifulSoup
import operator

def start(url):
    word_list = []
    source_code = requests.get(url).text  # connect to the link and pull the page down as plain text
    soup = BeautifulSoup(source_code, 'html.parser')
    for post_text in soup.findAll('a', {'class': 'title text-semibold'}):  # go through all the matching title links
        content = post_text.string  # .string = only the text that's inside the tag
        words = content.lower().split()
        for each_word in words:
            print(each_word)
            word_list.append(each_word)

start("https://www.thenewboston.com/forum/")
- Create the word_list list (all the split-up words get thrown in here later)
- Go to the site and save the HTML text in source_code
- Use BeautifulSoup to parse it into a workable form
- Narrow down to the needed part with a CSS selector and extract only the text inside, into content
- Lowercase everything in content, split it apart on spaces, and throw the pieces into words
- Loop over each word in words and throw it into word_list
That's roughly the flow.
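As a quick illustration of what the loop does to a single title (no network needed; the HTML snippet below is a made-up stand-in for one of the forum's title links):

from bs4 import BeautifulSoup

# Made-up stand-in for one <a class="title text-semibold"> tag on the forum page.
snippet = '<a class="title text-semibold">Can\'t install Intel HAXM</a>'
tag = BeautifulSoup(snippet, 'html.parser').a

content = tag.string             # only the text inside the tag
words = content.lower().split()  # lowercase, then split on whitespace
print(words)                     # ["can't", 'install', 'intel', 'haxm']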
Output
dictionary
print
order
permanent
display
of
content
rendering
problems
whenever
i
start
the
android
studio
two
beginner
python
courses?
vector
about
double
buffering
arduino
code
asterisk
before
a
pointer
can
you
provide
me
the
arduino
code
for
eye
blinking
sensor(ir
sensor)
for
accidental
prevention.
can't
import
images
in
android
studio
can't
install
intel
haxm
free
internet
javascript
interpreter
lambda
function
my
funny
litlte
program
navigation
drawer
activity
not
able
to
find
the
problem
need
help
org.apache.http.client.httpclient
deprecated
question
about
themes
someone
share
a
link
to
source
codes??
source
code
?
which
all
views
should
be
turned
on?
x86
emulation
error
error
when
trying
to
build
and
run.
computer
doesn't
support
virtualization.
web
development
using
html
java
game
about
getting
user
input
eclipse
doesn't
recognise
my
imports
other
ways
of
styling
Video #2
import requests
from bs4 import BeautifulSoup
import operator

def start(url):
    word_list = []
    source_code = requests.get(url).text  # connect to the link and pull the page down as plain text
    soup = BeautifulSoup(source_code, 'html.parser')
    for post_text in soup.findAll('a', {'class': 'title text-semibold'}):  # go through all the matching title links
        content = post_text.string  # .string = only the text that's inside the tag
        words = content.lower().split()
        for each_word in words:
            word_list.append(each_word)
    clean_up_list(word_list)

def clean_up_list(word_list):
    clean_word_list = []
    for word in word_list:
        symbols = "!@#$%^&*()_+{}:\"<>?,./;'[]-="
        for i in range(0, len(symbols)):
            word = word.replace(symbols[i], "")  # replace any symbol with nothing (= delete it)
        if len(word) > 0:  # keep only the actual cleaned-up words
            #print(word)
            clean_word_list.append(word)

start("https://www.thenewboston.com/forum/")
In Video #1 the start function only got as far as fetching the text and putting it into word_list; this time we write a function that filters those collected words, handling things like symbols that are not part of a word and strings that are nothing but blanks.
- Create clean_word_list up front
- A for loop runs over each word in word_list, the words collected by the start function
- Each word is checked against every symbol one by one; any symbol found is replaced with an empty string (i.e. deleted)
- If the length of word is greater than 0 (i.e. it is not just a blank), append it to clean_word_list
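To see what this clean-up does to individual words, here is the same replace loop pulled out as a small helper; the sample inputs come from the Video #1 output, and the str.translate line at the end is a more idiomatic alternative, not what the video uses:

symbols = "!@#$%^&*()_+{}:\"<>?,./;'[]-="

def clean(word):
    # Mirror the video's approach: strip each symbol character one by one.
    for symbol in symbols:
        word = word.replace(symbol, "")
    return word

print(clean("codes??"))    # codes
print(clean("sensor(ir"))  # sensorir
print(clean("can't"))      # cant
print(clean("?"))          # '' -> dropped later by the len(word) > 0 check

# A more idiomatic equivalent using str.translate:
print("codes??".translate(str.maketrans("", "", symbols)))  # codes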
Output
variables
in
enum
dictionary
print
order
permanent
display
of
content
rendering
problems
whenever
i
start
the
android
studio
two
beginner
python
courses
vector
about
double
buffering
arduino
code
asterisk
before
a
pointer
can
you
provide
me
the
arduino
code
for
eye
blinking
sensorir
sensor
for
accidental
prevention
cant
import
images
in
android
studio
cant
install
intel
haxm
free
internet
javascript
interpreter
lambda
function
my
funny
litlte
program
navigation
drawer
activity
not
able
to
find
the
problem
need
help
orgapachehttpclienthttpclient
deprecated
question
about
themes
someone
share
a
link
to
source
codes
source
code
which
all
views
should
be
turned
on
x86
emulation
error
error
when
trying
to
build
and
run
computer
doesnt
support
virtualization
web
development
using
html
java
game
about
getting
user
input
eclipse
doesnt
recognise
my
imports
Video #3
import requests
from bs4 import BeautifulSoup
import operator  # lets you work with Python's built-in data types (used for sorting below)

def start(url):
    word_list = []
    source_code = requests.get(url).text  # connect to the link and pull the page down as plain text
    soup = BeautifulSoup(source_code, 'html.parser')
    for post_text in soup.findAll('a', {'class': 'title text-semibold'}):  # go through all the matching title links
        content = post_text.string  # .string = only the text that's inside the tag
        words = content.lower().split()
        for each_word in words:
            word_list.append(each_word)
    clean_up_list(word_list)

def clean_up_list(word_list):
    clean_word_list = []
    for word in word_list:
        symbols = "!@#$%^&*()_+{}:\"<>?,./;'[]-="
        for i in range(0, len(symbols)):
            word = word.replace(symbols[i], "")  # replace any symbol with nothing (= delete it)
        if len(word) > 0:  # keep only the actual cleaned-up words
            print(word)
            clean_word_list.append(word)
    create_dictionary(clean_word_list)

def create_dictionary(clean_word_list):
    word_count = {}
    for word in clean_word_list:
        if word in word_count:
            word_count[word] += 1  # the count for this word goes up by one
        else:
            word_count[word] = 1
    for key, value in sorted(word_count.items(), key=operator.itemgetter(1)):
        # sorted() pulls the (key, value) pairs out of the dictionary;
        # key is index 0 and value is index 1, so operator.itemgetter(0) would sort alphabetically instead
        print(key, value)

start("https://www.thenewboston.com/forum/")
Write a create_dictionary function that stores each word as a key and its frequency of use as the value.
An if statement adds 1 to the count if the word is already in the dictionary, or creates a new entry if it is not. Then for key, value in sorted(word_count.items(), key=operator.itemgetter(1)) pulls the entries out of the dictionary and prints the words sorted by value (ascending, so the most frequent words end up at the bottom of the output).
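On a tiny made-up word_count the sort looks like this; itemgetter(1) alone sorts ascending, so add reverse=True if you want the most frequent word printed first:

import operator

word_count = {'the': 3, 'code': 2, 'python': 1}  # made-up counts for illustration

for key, value in sorted(word_count.items(), key=operator.itemgetter(1)):
    print(key, value)   # python 1 / code 2 / the 3

for key, value in sorted(word_count.items(), key=operator.itemgetter(1), reverse=True):
    print(key, value)   # the 3 / code 2 / python 1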
Output
variables
in
enum
dictionary
print
order
permanent
display
of
content
rendering
problems
whenever
i
start
the
android
studio
two
beginner
python
courses
vector
about
double
buffering
arduino
code
asterisk
before
a
pointer
can
you
provide
me
the
arduino
code
for
eye
blinking
sensorir
sensor
for
accidental
prevention
cant
import
images
in
android
studio
cant
install
intel
haxm
free
internet
javascript
interpreter
lambda
function
my
funny
litlte
program
navigation
drawer
activity
not
able
to
find
the
problem
need
help
orgapachehttpclienthttpclient
deprecated
question
about
themes
someone
share
a
link
to
source
codes
source
code
which
all
views
should
be
turned
on
x86
emulation
error
error
when
trying
to
build
and
run
computer
doesnt
support
virtualization
web
development
using
html
java
game
about
getting
user
input
eclipse
doesnt
recognise
my
imports
courses 1
images 1
order 1
litlte 1
i 1
link 1
variables 1
input 1
when 1
someone 1
pointer 1
vector 1
x86 1
buffering 1
on 1
of 1
blinking 1
recognise 1
beginner 1
enum 1
javascript 1
should 1
need 1
eclipse 1
computer 1
dictionary 1
virtualization 1
navigation 1
can 1
permanent 1
provide 1
prevention 1
print 1
function 1
game 1
internet 1
html 1
question 1
rendering 1
deprecated 1
you 1
turned 1
orgapachehttpclienthttpclient 1
find 1
haxm 1
activity 1
asterisk 1
using 1
which 1
intel 1
double 1
all 1
support 1
problem 1
two 1
funny 1
whenever 1
display 1
problems 1
sensor 1
accidental 1
java 1
interpreter 1
me 1
eye 1
help 1
before 1
imports 1
getting 1
development 1
trying 1
import 1
not 1
drawer 1
install 1
codes 1
views 1
be 1
user 1
share 1
themes 1
web 1
content 1
able 1
program 1
build 1
sensorir 1
python 1
emulation 1
and 1
start 1
run 1
lambda 1
free 1
in 2
for 2
android 2
arduino 2
cant 2
error 2
doesnt 2
studio 2
a 2
my 2
source 2
the 3
to 3
code 3
about 3
Update:
@lazykyama advised that "if you use Counter from the collections module, the function written in Video #3 becomes almost unnecessary", so I tried implementing it right away.
import requests
from bs4 import BeautifulSoup
import operator  # lets you work with Python's built-in data types
from collections import Counter

def start(url):
    word_list = []
    source_code = requests.get(url).text  # connect to the link and pull the page down as plain text
    soup = BeautifulSoup(source_code, 'html.parser')
    for post_text in soup.findAll('a', {'class': 'title text-semibold'}):  # go through all the matching title links
        content = post_text.string  # .string = only the text that's inside the tag
        words = content.lower().split()
        for each_word in words:
            word_list.append(each_word)
    clean_up_list(word_list)

def clean_up_list(word_list):
    clean_word_list = []
    for word in word_list:
        symbols = "!@#$%^&*()_+{}:\"<>?,./;'[]-="
        for i in range(0, len(symbols)):
            word = word.replace(symbols[i], "")  # replace any symbol with nothing (= delete it)
        if len(word) > 0:  # keep only the actual cleaned-up words
            #print(word)
            clean_word_list.append(word)
    counts = Counter(clean_word_list)
    print(counts)

start("https://www.thenewboston.com/forum/")
Here is the output:
Counter({'the': 9, 'to': 5, 'i': 5, 'with': 5, 'program': 3, 'image': 3, 'code': 3, 'web': 3, 'help': 3, 'simple': 3, 'source': 3, 'crawler': 3, 'a': 3, 'in': 3, 'am': 2, 'not': 2, 'error': 2, 'cant': 2, 'is': 2, 'my': 2, 'images': 2, 'when': 2, 'getting': 2, 'tutorial': 2, 'about': 2, 'for': 2, 'need': 2, 'app': 2, 'problem': 2, 'android': 2, 'find': 2, 'and': 2, 'studio': 1, 'running': 1, 'clock': 1, 'selenium': 1, 'codes': 1, 'mergesort': 1, 'it': 1, 'trouble': 1, 'someone': 1, 'please': 1, 'webpage': 1, 'method': 1, 'beginners': 1, 'camera': 1, 'lambda': 1, 'specified': 1, 'build': 1, 'buying': 1, 'development': 1, 'dosent': 1, 'run': 1, 'of': 1, 'anything': 1, 'mac': 1, 'reference': 1, 'mistake': 1, 'linked': 1, 'haxm': 1, 'list': 1, 'now': 1, 'trying': 1, 'on': 1, 'typecasting': 1, 'got': 1, 'current': 1, 'imagemap': 1, 'question': 1, 'undefined': 1, 'assignment': 1, 'population': 1, 'import': 1, 'able': 1, 'apple': 1, 'system': 1, 'needs': 1, 'show': 1, 'prepaid': 1, 'install': 1, 'how': 1, 'cannot': 1, 'hover': 1, 'add': 1, 'video': 1, '4': 1, 'default': 1, 'involving': 1, 'inserting': 1, 'you': 1, 'only': 1, 'function': 1, 'file': 1, 'themes': 1, 'this': 1, '28': 1, 'chooser': 1, 'refresh': 1, 'share': 1, 'link': 1, 'where': 1, 'tagif': 1, 'tip': 1, 'practice': 1, 'python': 1, 'get': 1, 'visa': 1, 'environment': 1, 'funny': 1, 'possible': 1, '42': 1, 'css': 1, 'step': 1, 'bitcoins': 1, 'time': 1, 'which': 1, 'variable': 1, 'date': 1, 'litlte': 1, 'as': 1, 'override': 1, 'capture': 1, 'effect': 1, 'intel': 1, 'can': 1, 'but': 1, 'at': 1, 'bug': 1, 'onattach': 1, 'loop': 1, 'what': 1})
The code gets slimmer and the result comes back already bundled into a dictionary, which is convenient.
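For reference, a Counter behaves like a dictionary of element counts; a minimal example on a made-up list:

from collections import Counter

counts = Counter(['the', 'code', 'the', 'python', 'the'])  # made-up word list
print(counts)             # Counter({'the': 3, 'code': 1, 'python': 1})
print(counts['the'])      # 3
print(counts['missing'])  # 0 -- missing keys return 0 instead of raising KeyError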
On top of that, you can also look up the frequency of one specific word. For example, to show only the count of the word the:
import requests
from bs4 import BeautifulSoup
import operator  # lets you work with Python's built-in data types
from collections import Counter

def start(url):
    word_list = []
    source_code = requests.get(url).text  # connect to the link and pull the page down as plain text
    soup = BeautifulSoup(source_code, 'html.parser')
    for post_text in soup.findAll('a', {'class': 'title text-semibold'}):  # go through all the matching title links
        content = post_text.string  # .string = only the text that's inside the tag
        words = content.lower().split()
        for each_word in words:
            word_list.append(each_word)
    clean_up_list(word_list)

def clean_up_list(word_list):
    clean_word_list = []
    for word in word_list:
        symbols = "!@#$%^&*()_+{}:\"<>?,./;'[]-="
        for i in range(0, len(symbols)):
            word = word.replace(symbols[i], "")  # replace any symbol with nothing (= delete it)
        if len(word) > 0:  # keep only the actual cleaned-up words
            #print(word)
            clean_word_list.append(word)
    counts = Counter(clean_word_list)
    specific = counts["the"]  # 9
    print(specific)

start("https://www.thenewboston.com/forum/")
You can also change a word's count directly: counts["the"] = 15 overwrites the frequency stored for that word. Setting counts["the"] = 0 pushes that word down to the very end of the ordering, and an entry can be removed altogether with del counts["the"].
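A quick sketch of those operations on a small made-up counter:

from collections import Counter

counts = Counter({'the': 9, 'to': 5, 'code': 3})  # made-up counts

counts['the'] = 15           # overwrite a count directly
counts['the'] = 0            # a zero count drops the word to the bottom of most_common()
print(counts.most_common())  # [('to', 5), ('code', 3), ('the', 0)]

del counts['the']            # remove the entry entirely
print(counts)                # Counter({'to': 5, 'code': 3})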
x = list(counts.elements()) can also build a flat list, repeating each word as many times as it was counted.
    # ... same as above up to this point (inside clean_up_list), omitted ...
    counts = Counter(clean_word_list)
    counts_list = list(counts.elements())
    print(counts_list)

start("https://www.thenewboston.com/forum/")
['please', 'problem', 'problem', 'add', 'crawler', 'crawler', 'crawler', 'running', 'specified', 'is', 'is', 'dosent', 'practice', 'intel', 'anything', 'show', 'mergesort', 'image', 'image', 'image', 'list', 'import', 'tip', 'loop', 'am', 'am', 'getting', 'getting', 'population', 'get', 'buying', 'for', 'for', 'about', 'about', 'which', '4', 'on', 'prepaid', 'mistake', 'override', 'got', 'function', 'share', 'as', 'clock', 'reference', 'cannot', 'bitcoins', 'effect', 'code', 'code', 'code', 'assignment', 'you', 'can', 'images', 'images', 'haxm', 'find', 'find', 'install', 'with', 'with', 'with', 'with', 'with', 'trying', 'file', 'and', 'and', 'what', 'android', 'android', 'typecasting', 'source', 'source', 'source', 'beginners', 'someone', 'possible', 'cant', 'cant', 'how', 'method', 'app', 'app', 'i', 'i', 'i', 'i', 'i', 'system', 'where', 'webpage', 'involving', 'funny', 'current', 'it', 'linked', 'in', 'in', 'in', 'variable', 'web', 'web', 'web', 'hover', 'litlte', 'question', 'tagif', 'time', 'inserting', 'trouble', 'program', 'program', 'program', 'bug', '42', 'tutorial', 'tutorial', 'need', 'need', 'video', 'lambda', 'date', 'chooser', 'run', 'error', 'error', 'default', 'to', 'to', 'to', 'to', 'to', 'of', 'apple', 'link', 'when', 'when', 'capture', 'mac', 'css', 'step', 'refresh', 'not', 'not', 'imagemap', 'development', 'camera', 'but', 'simple', 'simple', 'simple', 'needs', 'help', 'help', 'help', 'studio', 'a', 'a', 'a', '28', 'selenium', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'now', 'themes', 'environment', 'python', 'visa', 'only', 'this', 'able', 'undefined', 'onattach', 'build', 'at', 'my', 'my', 'codes']
most_frequent = counts.most_common(2) shows the two most frequently used words.
most_frequent = counts.most_common(2) followed by print(most_frequent[1]) shows the word in second place.
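A short sketch of most_common() on a made-up counter (the real counts from the crawl will of course differ):

from collections import Counter

counts = Counter({'the': 9, 'to': 5, 'code': 3})  # made-up counts

most_frequent = counts.most_common(2)
print(most_frequent)     # [('the', 9), ('to', 5)]
print(most_frequent[1])  # ('to', 5) -- the second most frequent word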