More than 5 years have passed since last update.

Pythonで前処理。ニコニコ動画のタグ検索結果をCSV形式に変換する

Last updated at 2014-05-03 / Posted at 2014-05-01

概要

ニコニコ動画のタグ検索結果(XML)をCSV形式へ変換する。
加えて、作品別のタグ情報を1タグ1カラムにして、各行へ追記する。
具体的には、下記のような形式にする。

video_id,user_id,...,タグ1,タグ2,...
sm00000001,111111111,...,1,1,...
sm00000002,222222222,...,0,0,...
sm00000003,333333333,...,0,1,...

データ

-> ニコニコ動画のタグ検索結果をXML形式で取得する。(ログイン不要)

コード

ncxml2csv.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# ncxml2csv.py
# Copyright (c) 2014 nezuq
# This software is released under the MIT License.
# http://opensource.org/licenses/mit-license.php

from __future__ import unicode_literals
import sys
import codecs
from lxml import etree
import copy

argvs = sys.argv
argc = len(argvs)

# Input file: first command-line argument, defaulting to INPUT.xml.
FILE_INPUT = 'INPUT.xml'
if 1 < argc:
    FILE_INPUT = argvs[1]
    # Only byte strings need decoding; a text argument is used as given.
    if isinstance(FILE_INPUT, bytes):
        try:
            FILE_INPUT = FILE_INPUT.decode('UTF-8')
        except UnicodeDecodeError:
            # The argument was not passed as UTF-8 (e.g. a cp932 console);
            # fall back to the encoding the OS uses for file names.
            FILE_INPUT = FILE_INPUT.decode(
                sys.getfilesystemencoding() or 'UTF-8')

# Thresholds: a tag column is output only when the tag's occurrence count
# lies within [MIN_COUNT, MAX_COUNT] (second and third arguments).
MIN_COUNT = 3
if 2 < argc:
    MIN_COUNT = int(argvs[2])
MAX_COUNT = 9999
if 3 < argc:
    MAX_COUNT = int(argvs[3])

# Source-data output flag (fourth argument): 1 outputs the original
# columns in front of the tag columns, any other value outputs only the
# tag columns.
DISP_SRCCOL = 1
if 4 < argc:
    DISP_SRCCOL = int(argvs[4])

# Column names of the source-data part of the CSV, in output order.
COLUMNS_NAME = ['video_id','user_id','deleted','title','description','length_in_seconds','length','size_high','size_low',
                'movie_type','thumbnail_url','upload_time','first_retrieve','default_thread',
                'view_counter','comment_num','mylist_counter',
                'last_res_body','watch_url','thumb_type','embeddable','no_live_play',
                'option_flag_ichiba','option_flag_community','option_flag_domestic','option_flag_comment_type',
                'option_flag_adult','option_flag_mobile','option_flag_economy_mp4','option_flag_middle_video',
                'option_flag_mobile_ng_apple','main_category','main_category_key',
                'thread_id','thread_public','thread_num_res','thread_community_id','tags']

# Path (relative to a <video_info> element) of every source column, in the
# same order as COLUMNS_NAME.  None marks a column that is always written
# empty.  The trailing "tags" column is not listed here because it is built
# from the serialized <tags> element instead of from element text.
_SOURCE_PATHS = (
    'video/id',                           # video_id
    'video/user_id',                      # user_id
    'video/deleted',                      # deleted
    'video/title',                        # title
    'video/description',                  # description
    'video/length_in_seconds',            # length_in_seconds
    None,                                 # length
    None,                                 # size_high
    'video/size_low',                     # size_low
    'video/movie_type',                   # movie_type
    'video/thumbnail_url',                # thumbnail_url
    'video/upload_time',                  # upload_time
    'video/first_retrieve',               # first_retrieve
    'video/default_thread',               # default_thread
    'video/view_counter',                 # view_counter
    None,                                 # comment_num
    'video/mylist_counter',               # mylist_counter
    None,                                 # last_res_body
    None,                                 # watch_url
    None,                                 # thumb_type
    None,                                 # embeddable
    None,                                 # no_live_play
    'video/option_flag_ichiba',           # option_flag_ichiba
    'video/option_flag_community',        # option_flag_community
    'video/option_flag_domestic',         # option_flag_domestic
    'video/option_flag_comment_type',     # option_flag_comment_type
    'video/option_flag_adult',            # option_flag_adult
    'video/option_flag_mobile',           # option_flag_mobile
    'video/option_flag_economy_mp4',      # option_flag_economy_mp4
    'video/option_flag_middle_video',     # option_flag_middle_video
    'video/option_flag_mobile_ng_apple',  # option_flag_mobile_ng_apple
    'video/main_category',                # main_category
    'video/main_category_key',            # main_category_key
    'thread/id',                          # thread_id
    'thread/public',                      # thread_public
    'thread/num_res',                     # thread_num_res
    'thread/community_id',                # thread_community_id
)


def _clean_cell(value):
    """Return *value* made safe for an unquoted, comma-separated cell.

    None and empty values become ''.  ASCII commas are replaced with a
    full-width comma, and line breaks with a single space, because either
    one would otherwise split the record when the file is read back.
    """
    if not value:
        return ''
    value = value.replace(',', '，')
    for line_break in ('\r\n', '\n', '\r'):
        value = value.replace(line_break, ' ')
    return value


def _video_row(vi):
    """Build the list of source-column cells for one <video_info> element."""
    # findtext() yields None for a missing element instead of raising, so an
    # absent field simply becomes an empty cell.
    cells = [vi.findtext(path) if path else '' for path in _SOURCE_PATHS]
    tags_elem = vi.find('tags')
    if tags_elem is None:
        cells.append('')
    else:
        # tostring() returns ASCII bytes by default (non-ASCII characters as
        # character references); decode so every cell is text.
        cells.append(etree.tostring(tags_elem).decode('ascii'))
    return [_clean_cell(cell) for cell in cells]


def _write_line(cells):
    """Write one comma-joined record followed by a newline to sys.stdout."""
    sys.stdout.write(','.join(cells) + '\n')


def main():
    """Convert the tag-search XML in FILE_INPUT to CSV on standard output.

    One record is written per <video_info> element.  When DISP_SRCCOL is 1
    each record starts with the COLUMNS_NAME source columns.  These are
    followed by one 0/1 column per tag whose number of videos lies within
    [MIN_COUNT, MAX_COUNT].  Tag columns are ordered by descending count;
    tags with equal counts keep the order in which they first appear in
    the input.
    """
    rows = []
    tag_rows = {}    # tag name -> set of indices of the rows carrying it
    tag_order = []   # tag names in first-seen order, for a stable output
    tree = etree.parse(FILE_INPUT)
    for index, vi in enumerate(tree.findall('./video_info')):
        rows.append(_video_row(vi))
        for tag in vi.findall('tags/tag_info/tag'):
            name = tag.text
            if name not in tag_rows:
                tag_rows[name] = set()
                tag_order.append(name)
            # A set makes a tag repeated within one video count only once.
            tag_rows[name].add(index)

    matched = [name for name in tag_order
               if MIN_COUNT <= len(tag_rows[name]) <= MAX_COUNT]
    # The sort is stable, so equal counts stay in first-seen order.
    matched.sort(key=lambda name: len(tag_rows[name]), reverse=True)

    with_source = DISP_SRCCOL == 1
    header = list(COLUMNS_NAME) if with_source else []
    # Tag names are cleaned like data cells so a comma in a name cannot
    # shift the header relative to the records.
    header.extend(_clean_cell('%s' % name) for name in matched)
    _write_line(header)
    for index, row in enumerate(rows):
        flags = ['1' if index in tag_rows[name] else '0' for name in matched]
        _write_line((row if with_source else []) + flags)

if __name__ == '__main__':
    # Always emit UTF-8, whatever the console or pipe encoding is.  The
    # codec writer produces bytes, so when stdout is a text stream exposing
    # a binary ``buffer`` that buffer is wrapped; a plain file object
    # without ``buffer`` is wrapped directly.
    sys.stdout = codecs.getwriter('utf_8')(
        getattr(sys.stdout, 'buffer', sys.stdout))
    main()

ncxml2csv.pyを実行する

python ncxml2csv.py INPUT.xml 3 9999 1 > OUTPUT.csv

※最後の1を0にすると、作品別のタグ情報のみ表示する。

参考ページ

-> i.nicovideo.jp APIとgetthumbinfo APIの比較

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up