LoginSignup
8
9

More than 5 years have passed since last update.

Pythonで前処理。ニコニコ動画のタグ検索結果をCSV形式に変換する

Last updated at Posted at 2014-05-01

概要

ニコニコ動画のタグ検索結果(XML)をCSV形式へ変換する。
加えて、作品別のタグ情報を1タグ1カラムにして、各行へ追記する。
具体的には、下記のような形式にする。

video_id,user_id,...,タグ1,タグ2,...
sm00000001,111111111,...,1,1,...
sm00000002,222222222,...,0,0,...
sm00000003,333333333,...,0,1,...

データ

-> ニコニコ動画のタグ検索結果をXML形式で取得する。(ログイン不要)

コード

ncxml2csv.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# ncxml2csv.py
# Copyright (c) 2014 nezuq
# This software is released under the MIT License.
# http://opensource.org/licenses/mit-license.php

from __future__ import unicode_literals
import sys
import codecs
from lxml import etree
import copy

argvs = sys.argv
argc = len(argvs)

#入力ファイル
FILE_INPUT = 'INPUT.xml'
if 1 < argc:
    FILE_INPUT = argvs[1].decode('UTF-8')

#しきい値(出現回数が指定範囲外のタグのカラムは出力しない)
MIN_COUNT = 3
if 2 < argc:
    MIN_COUNT = int(argvs[2])
MAX_COUNT = 9999
if 3 < argc:
    MAX_COUNT = int(argvs[3])

#元データ出力フラグ
DISP_SRCCOL = 1
if 4 < argc:
    DISP_SRCCOL = int(argvs[4])

#列名
COLUMNS_NAME = ['video_id','user_id','deleted','title','description','length_in_seconds','length','size_high','size_low',
                'movie_type','thumbnail_url','upload_time','first_retrieve','default_thread',
                'view_counter','comment_num','mylist_counter',
                'last_res_body','watch_url','thumb_type','embeddable','no_live_play',
                'option_flag_ichiba','option_flag_community','option_flag_domestic','option_flag_comment_type',
                'option_flag_adult','option_flag_mobile','option_flag_economy_mp4','option_flag_middle_video',
                'option_flag_mobile_ng_apple','main_category','main_category_key',
                'thread_id','thread_public','thread_num_res','thread_community_id','tags']

# Element path (relative to a <video_info> element) feeding each source
# column, in COLUMNS_NAME order. None marks a column that is always left
# empty. The trailing 'tags' column is serialized separately in main().
_FIELD_PATHS = (
    'video/id',
    'video/user_id',
    'video/deleted',
    'video/title',
    'video/description',
    'video/length_in_seconds',
    None,  # length
    None,  # size_high
    'video/size_low',
    'video/movie_type',
    'video/thumbnail_url',
    'video/upload_time',
    'video/first_retrieve',
    'video/default_thread',
    'video/view_counter',
    None,  # comment_num
    'video/mylist_counter',
    None,  # last_res_body
    None,  # watch_url
    None,  # thumb_type
    None,  # embeddable
    None,  # no_live_play
    'video/option_flag_ichiba',
    'video/option_flag_community',
    'video/option_flag_domestic',
    'video/option_flag_comment_type',
    'video/option_flag_adult',
    'video/option_flag_mobile',
    'video/option_flag_economy_mp4',
    'video/option_flag_middle_video',
    'video/option_flag_mobile_ng_apple',
    'video/main_category',
    'video/main_category_key',
    'thread/id',
    'thread/public',
    'thread/num_res',
    'thread/community_id',
)


def _text_of(parent, path):
    """Return the text of the first element matching *path*, or '' if absent or empty."""
    node = parent.find(path)
    if node is None or not node.text:
        return ''
    return node.text


def _serialize(node):
    """Return *node* serialized as XML text, or '' when the element is absent."""
    if node is None:
        return ''
    # etree.tostring() produces ASCII-encoded bytes by default, so decoding
    # as ASCII yields text on both Python 2 and Python 3.
    return etree.tostring(node).decode('ascii')


def _csv_field(value):
    """Quote *value* per RFC 4180 if it contains a comma, quote or line break."""
    if any(ch in value for ch in ',"\r\n'):
        return '"' + value.replace('"', '""') + '"'
    return value


def main():
    """Convert the tag-search XML in FILE_INPUT to CSV on standard output.

    One CSV row is written per <video_info> element. When DISP_SRCCOL is 1
    each row starts with the source columns named in COLUMNS_NAME; it is then
    followed by one 0/1 column per tag, set to 1 when the video carries that
    tag. Only tags carried by between MIN_COUNT and MAX_COUNT videos
    (inclusive) get a column. Tag columns are ordered by descending count,
    ties broken by tag name so the output is deterministic.

    Child elements missing from a <video_info> produce empty cells, and
    <tag> elements without text are ignored. Cells containing a comma,
    double quote or line break are quoted per RFC 4180.

    Errors raised by etree.parse (unreadable or malformed input) propagate
    to the caller.
    """
    show_source = DISP_SRCCOL == 1
    rows = []
    # tag name -> set of indices (into rows) of the videos carrying the tag
    tag_rows = {}
    tree = etree.parse(FILE_INPUT)
    for index, vi in enumerate(tree.findall('./video_info')):
        row = [_text_of(vi, path) if path else '' for path in _FIELD_PATHS]
        row.append(_serialize(vi.find('tags')))  # tags
        rows.append(row)
        for tag in vi.findall('tags/tag_info/tag'):
            if tag.text:
                tag_rows.setdefault(tag.text, set()).add(index)

    selected = [(name, members) for name, members in tag_rows.items()
                if MIN_COUNT <= len(members) <= MAX_COUNT]
    selected.sort(key=lambda item: (-len(item[1]), item[0]))

    header = (list(COLUMNS_NAME) if show_source else []) + [name for name, _ in selected]
    # print(<single expression>) behaves the same as a statement (Python 2)
    # and as a function call (Python 3).
    print(','.join(_csv_field(cell) for cell in header))
    for index, row in enumerate(rows):
        flags = ['1' if index in members else '0' for _, members in selected]
        cells = (row if show_source else []) + flags
        print(','.join(_csv_field(cell) for cell in cells))

if __name__ == '__main__':
    # Always emit UTF-8, whatever the console/locale encoding is.
    # On Python 3 the codec writer must wrap the underlying binary stream
    # (sys.stdout.buffer); wrapping the text stream would make every write
    # fail. Python 2's sys.stdout has no .buffer attribute and is wrapped
    # directly, exactly as before.
    sys.stdout = codecs.getwriter('utf_8')(getattr(sys.stdout, 'buffer', sys.stdout))
    main()
ncxml2csv.pyを実行する
python ncxml2csv.py INPUT.xml 3 9999 1 > OUTPUT.csv

※最後の1を0にすると、作品別のタグ情報のみ表示する。

参考ページ

-> i.nicovideo.jp APIとgetthumbinfo APIの比較

8
9
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
8
9