I tried out MediaPipe's Text, Audio, and Vision tasks, and this post is a rough summary of my notes.
1. Environment setup
$ sw_vers
ProductName: macOS
ProductVersion: 13.2.1
$ python -V
Python 3.9.9
$ pip install mediapipe==0.10.14
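Every script below assumes the downloaded .tflite files live under ./model/ (and that audio and image files live under ./audio/ and ./image/). As a minimal sketch of how a model can be fetched ahead of time, the snippet below downloads a file with urllib; MODEL_URL is a hypothetical placeholder for the link on whichever model card you use.

import os
import urllib.request

# Hypothetical placeholder: replace with the download URL on the model card.
MODEL_URL = "https://example.com/classifier.tflite"
MODEL_PATH = "./model/classifier.tflite"

def download_model():
    os.makedirs("./model", exist_ok=True)
    if not os.path.isfile(MODEL_PATH):
        urllib.request.urlretrieve(MODEL_URL, MODEL_PATH)

if __name__ == "__main__":
    download_model()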
2. Text classification
The model can be downloaded here (BERT classifier).
TextClassification.py
from mediapipe.tasks import python
from mediapipe.tasks.python import text
MODEL_PATH = "./model/classifier.tflite"
TEXT_LIST = [
"I'm going to work today.", #今日は仕事に行きます。
"you're no good", #君はダメだ。
"It is sunny today", #今日は晴れです。
"It's cloudy today" #今日は曇りです。
]
def main():
base_options = python.BaseOptions(model_asset_path=MODEL_PATH)
options = text.TextClassifierOptions(base_options=base_options)
classifier = text.TextClassifier.create_from_options(options)
for input_text in TEXT_LIST:
classification_result = classifier.classify(input_text)
top_category = classification_result.classifications[0].categories[0]
print("word:[", input_text, "]")
print(top_category)
if __name__ == "__main__":
main()
2-1. Running it
$ python TextClassification.py
word:[ I'm going to work today. ]
Category(index=0, score=0.6713167428970337, display_name='', category_name='negative')
word:[ you're no good ]
Category(index=0, score=0.9954270124435425, display_name='', category_name='negative')
word:[ It is sunny today ]
Category(index=1, score=0.996543824672699, display_name='', category_name='positive')
word:[ It's cloudy today ]
Category(index=0, score=0.8927320241928101, display_name='', category_name='negative')
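The script only prints the top category, but the result also carries a score for every class (here, positive and negative). A minimal sketch that prints all of them for one sentence, reusing the same BERT classifier model:

from mediapipe.tasks import python
from mediapipe.tasks.python import text

MODEL_PATH = "./model/classifier.tflite"

def main():
    base_options = python.BaseOptions(model_asset_path=MODEL_PATH)
    options = text.TextClassifierOptions(base_options=base_options)
    with text.TextClassifier.create_from_options(options) as classifier:
        classification_result = classifier.classify("It is sunny today")
        # Every category comes back with its own score, not just the winner.
        for category in classification_result.classifications[0].categories:
            print(category.category_name, round(category.score, 4))

if __name__ == "__main__":
    main()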
3. Text embedding
The model can be downloaded here (Universal Sentence Encoder).
TextEmbedded.py
import mediapipe as mp
MODEL_PATH = './model/universal_sentence_encoder.tflite'
INPUT_TEXT = "The input text to be embedded."
def main():
BaseOptions = mp.tasks.BaseOptions
TextEmbedder = mp.tasks.text.TextEmbedder
TextEmbedderOptions = mp.tasks.text.TextEmbedderOptions
options = TextEmbedderOptions(
base_options=BaseOptions(model_asset_path=MODEL_PATH),
quantize=True)
text_embedder = TextEmbedder.create_from_options(options)
embedding_result = text_embedder.embed(INPUT_TEXT)
print("word:[", INPUT_TEXT, "]:")
print(embedding_result.embeddings)
if __name__ == "__main__":
main()
3-1. Running it
$ python TextEmbedded.py
word:[ The input text to be embedded. ]:
[Embedding(embedding=array([127, 233, 159, 127, 46, 32, 128, 128, 127, 204, 128, 162, 127,
128, 33, 78, 220, 196, 173, 128, 142, 127, 123, 210, 249, 127,
184, 128, 77, 127, 7, 181, 180, 86, 16, 19, 202, 181, 127,
92, 162, 127, 96, 127, 134, 127, 128, 113, 127, 1, 128, 127,
178, 85, 127, 125, 30, 53, 17, 127, 128, 128, 127, 211, 32,
127, 91, 248, 128, 127, 183, 187, 204, 237, 227, 128, 127, 40,
211, 206, 127, 91, 127, 245, 1, 24, 127, 172, 128, 80, 128,
243, 128, 128, 128, 223, 127, 115, 45, 147], dtype=uint8), head_index=1, head_name='response_encoding')]
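Raw embeddings are mostly useful for comparing texts with each other. A minimal sketch, assuming the same model file, that embeds two sentences with quantize=False (float vectors instead of uint8) and compares them with a plain numpy cosine similarity; MediaPipe also documents a TextEmbedder.cosine_similarity helper that should give the same number.

import numpy as np
import mediapipe as mp

MODEL_PATH = './model/universal_sentence_encoder.tflite'

def main():
    BaseOptions = mp.tasks.BaseOptions
    TextEmbedder = mp.tasks.text.TextEmbedder
    TextEmbedderOptions = mp.tasks.text.TextEmbedderOptions
    # quantize=False keeps the embeddings as float vectors.
    options = TextEmbedderOptions(
        base_options=BaseOptions(model_asset_path=MODEL_PATH),
        quantize=False)
    with TextEmbedder.create_from_options(options) as embedder:
        first = embedder.embed("It is sunny today").embeddings[0].embedding
        second = embedder.embed("It's cloudy today").embeddings[0].embedding
        # Cosine similarity = dot product of the L2-normalized vectors.
        similarity = np.dot(first, second) / (np.linalg.norm(first) * np.linalg.norm(second))
        print("cosine similarity:", similarity)

if __name__ == "__main__":
    main()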
4. Language detection
The model can be downloaded here (language detector).
LanguageDetection.py
from mediapipe.tasks import python
from mediapipe.tasks.python import text
model_path = './model/language_detector.tflite'
language_list = [
    "ハロー世界",  # Japanese
    "hello world",  # English
    "Bonjour le monde",  # French
    "안녕 세계",  # Korean
    "Привет, мир",  # Russian
    "你好世界"  # Chinese
]
def main():
    base_options = python.BaseOptions(model_asset_path=model_path)
    options = text.LanguageDetectorOptions(base_options=base_options)
    # Create the detector once and reuse it for every input string.
    with text.LanguageDetector.create_from_options(options) as detector:
        for input_text in language_list:
            detection_result = detector.detect(input_text)
            print("[", input_text, "]:")
            print(detection_result.detections[0])
if __name__ == "__main__":
main()
4-1. Running it
$ python LanguageDetection.py
[ ハロー世界 ]:
LanguageDetectorResult.Detection(language_code='ja', probability=0.9841582775115967)
[ hello world ]:
LanguageDetectorResult.Detection(language_code='en', probability=0.8353540897369385)
[ Bonjour le monde ]:
LanguageDetectorResult.Detection(language_code='fr', probability=0.997443437576294)
[ 안녕 세계 ]:
LanguageDetectorResult.Detection(language_code='ko', probability=0.9999277591705322)
[ Привет, мир ]:
LanguageDetectorResult.Detection(language_code='ru', probability=0.7444538474082947)
[ 你好世界 ]:
LanguageDetectorResult.Detection(language_code='zh', probability=0.9943158030509949)
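detections[0] is only the most probable language; the detector returns a list of candidates, which matters for mixed-language input. A minimal sketch that prints every candidate above a small probability, using the same model file:

from mediapipe.tasks import python
from mediapipe.tasks.python import text

model_path = './model/language_detector.tflite'

def main():
    base_options = python.BaseOptions(model_asset_path=model_path)
    options = text.LanguageDetectorOptions(base_options=base_options)
    with text.LanguageDetector.create_from_options(options) as detector:
        result = detector.detect("Bonjour le monde, hello world")
        # A mixed-language sentence can yield more than one detection.
        for detection in result.detections:
            if detection.probability >= 0.1:
                print(detection.language_code, round(detection.probability, 3))

if __name__ == "__main__":
    main()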
5. Audio classification (speech audio)
The model can be downloaded here (YAMNet).
The speech audio file used is available here.
AudioClassification_speech.py
import urllib.request
import os
import numpy as np
from mediapipe.tasks import python
from mediapipe.tasks.python.components import containers
from mediapipe.tasks.python import audio
from scipy.io import wavfile
AUDIO_PATH = './audio/speech_16000_hz_mono.wav'
MODEL_PATH = './model/lite-model_yamnet_classification_tflite_1.tflite'
def main():
if not os.path.isfile(AUDIO_PATH):
url = 'https://storage.googleapis.com/mediapipe-assets/speech_16000_hz_mono.wav'
urllib.request.urlretrieve(url, AUDIO_PATH)
base_options = python.BaseOptions(model_asset_path=MODEL_PATH)
options = audio.AudioClassifierOptions(base_options=base_options, max_results=4)
with audio.AudioClassifier.create_from_options(options) as classifier:
sample_rate, wav_data = wavfile.read(AUDIO_PATH)
audio_clip = containers.AudioData.create_from_array(
wav_data.astype(float) / np.iinfo(np.int16).max, sample_rate)
classification_result_list = classifier.classify(audio_clip)
print("[", AUDIO_PATH, "]:")
        # The classifier returns one result per ~975 ms window; look at the first four.
        for idx in range(4):
classification_result = classification_result_list[idx]
top_category = classification_result.classifications[0].categories[0]
print(top_category)
if __name__ == "__main__":
main()
5-1. Running it
$ python AudioClassification_speech.py
[ ./audio/speech_16000_hz_mono.wav ]:
Category(index=0, score=0.91796875, display_name='', category_name='Speech')
Category(index=0, score=0.98828125, display_name='', category_name='Speech')
Category(index=0, score=0.984375, display_name='', category_name='Speech')
Category(index=0, score=0.99609375, display_name='', category_name='Speech')
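The four indices used above correspond to the first four analysis windows YAMNet produces for the clip. A minimal sketch that walks every returned window instead of just the first four; the ~975 ms step printed here is an assumption based on YAMNet's window size, not something the result itself reports.

import numpy as np
from mediapipe.tasks import python
from mediapipe.tasks.python.components import containers
from mediapipe.tasks.python import audio
from scipy.io import wavfile

AUDIO_PATH = './audio/speech_16000_hz_mono.wav'
MODEL_PATH = './model/lite-model_yamnet_classification_tflite_1.tflite'

def main():
    base_options = python.BaseOptions(model_asset_path=MODEL_PATH)
    options = audio.AudioClassifierOptions(base_options=base_options, max_results=1)
    with audio.AudioClassifier.create_from_options(options) as classifier:
        sample_rate, wav_data = wavfile.read(AUDIO_PATH)
        audio_clip = containers.AudioData.create_from_array(
            wav_data.astype(float) / np.iinfo(np.int16).max, sample_rate)
        # One result per ~975 ms window of the clip.
        for idx, result in enumerate(classifier.classify(audio_clip)):
            top_category = result.classifications[0].categories[0]
            print("~", idx * 975, "ms:", top_category.category_name, round(top_category.score, 3))

if __name__ == "__main__":
    main()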
6. Audio classification (music audio)
The music audio file used is available here.
AudioClassification_music.py
import urllib.request
import os
import numpy as np
from mediapipe.tasks import python
from mediapipe.tasks.python.components import containers
from mediapipe.tasks.python import audio
from scipy.io import wavfile
AUDIO_PATH = './audio/loop1.wav'
MODEL_PATH = './model/lite-model_yamnet_classification_tflite_1.tflite'
def main():
if not os.path.isfile(AUDIO_PATH):
url = 'https://www.ne.jp/asahi/music/myuu/wave/loop1.wav'
urllib.request.urlretrieve(url, AUDIO_PATH)
base_options = python.BaseOptions(model_asset_path=MODEL_PATH)
options = audio.AudioClassifierOptions(
base_options=base_options, max_results=4)
with audio.AudioClassifier.create_from_options(options) as classifier:
sample_rate, wav_data = wavfile.read(AUDIO_PATH)
audio_clip = containers.AudioData.create_from_array(
wav_data.astype(float) / np.iinfo(np.int16).max, sample_rate)
classification_result_list = classifier.classify(audio_clip)
print("[", AUDIO_PATH, "]:")
        # The classifier returns one result per ~975 ms window; look at the first four.
        for idx in range(4):
classification_result = classification_result_list[idx]
top_category = classification_result.classifications[0].categories[0]
print(top_category)
if __name__ == "__main__":
main()
6-1. Running it
$ python AudioClassification_music.py
[ ./audio/loop1.wav ]:
Category(index=132, score=0.984375, display_name='', category_name='Music')
Category(index=132, score=0.984375, display_name='', category_name='Music')
Category(index=132, score=0.98828125, display_name='', category_name='Music')
Category(index=132, score=0.96875, display_name='', category_name='Music')
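One thing to watch with other music files: wavfile.read returns a 2-D (samples, channels) array for a stereo wav, while the script above assumes a 1-D mono signal. A minimal sketch of one simple way to handle that, averaging the channels to mono before building the AudioData; whether AudioData accepts multi-channel arrays directly is not something I verified, so this simply sidesteps the question.

import numpy as np
from scipy.io import wavfile
from mediapipe.tasks.python.components import containers

AUDIO_PATH = './audio/loop1.wav'

def load_mono_clip(path):
    sample_rate, wav_data = wavfile.read(path)
    # A stereo file comes back as (num_samples, 2); average the channels to mono.
    if wav_data.ndim > 1:
        wav_data = wav_data.mean(axis=1)
    return containers.AudioData.create_from_array(
        wav_data.astype(float) / np.iinfo(np.int16).max, sample_rate)

if __name__ == "__main__":
    clip = load_mono_clip(AUDIO_PATH)
    print("AudioData created from", AUDIO_PATH)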
7. Object detection
The model can be downloaded here (EfficientDet-Lite0 (int8)).
Images used
ObjectDetection.py
import numpy as np
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
import cv2
# Place the images in the "image" folder.
IMAGE_LIST = [
'cat.jpeg',
'dog.jpg'
]
model_path = './model/efficientdet_lite0_int8.tflite'
MARGIN = 10 # pixels
ROW_SIZE = 10 # pixels
FONT_SIZE = 2
FONT_THICKNESS = 2
TEXT_COLOR = (0, 0, 255) # red
def visualize(image, detection_result) -> np.ndarray:
"""Draws bounding boxes on the input image and return it.
Args:
image: The input RGB image.
        detection_result: The list of all "Detection" entities to be visualized.
Returns:
Image with bounding boxes.
"""
for detection in detection_result.detections:
# Draw bounding_box
bbox = detection.bounding_box
start_point = bbox.origin_x, bbox.origin_y
end_point = bbox.origin_x + bbox.width, bbox.origin_y + bbox.height
cv2.rectangle(image, start_point, end_point, TEXT_COLOR, 3)
# Draw label and score
category = detection.categories[0]
category_name = category.category_name
        probability = round(category.score, 2)
result_text = category_name + ' (' + str(probability) + ')'
text_location = (MARGIN + bbox.origin_x, MARGIN + ROW_SIZE + bbox.origin_y)
cv2.putText(image, result_text, text_location, cv2.FONT_HERSHEY_PLAIN, FONT_SIZE, TEXT_COLOR, FONT_THICKNESS)
return image
def main():
base_options = python.BaseOptions(model_asset_path=model_path)
options = vision.ObjectDetectorOptions(base_options=base_options, score_threshold=0.5)
detector = vision.ObjectDetector.create_from_options(options)
for input_image in IMAGE_LIST:
save_image = f'./image/detection_result_{input_image}'
input_image = "./image/" + input_image
image = mp.Image.create_from_file(input_image)
detection_result = detector.detect(image)
image_copy = np.copy(image.numpy_view())
annotated_image = visualize(image_copy, detection_result)
        # mp.Image.numpy_view() returns RGB; convert to BGR so cv2.imwrite saves correct colors.
        bgr_annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
        cv2.imwrite(save_image, bgr_annotated_image)
print("image:[", input_image, "]")
print("result:[", detection_result.detections[0].categories[0], "]")
if __name__ == "__main__":
main()
7-1. Running it
$ python ObjectDetection.py
image:[ ./image/cat.jpeg ]
result:[ Category(index=None, score=0.80078125, display_name=None, category_name='cat') ]
image:[ ./image/dog.jpg ]
result:[ Category(index=None, score=0.88671875, display_name=None, category_name='dog') ]
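The script prints only the first detection's category, but each detection also carries the bounding box that visualize() draws. A minimal sketch that prints every detection above the threshold together with its box, using the same int8 model:

import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

MODEL_PATH = './model/efficientdet_lite0_int8.tflite'
IMAGE_PATH = './image/dog.jpg'

def main():
    base_options = python.BaseOptions(model_asset_path=MODEL_PATH)
    options = vision.ObjectDetectorOptions(base_options=base_options, score_threshold=0.3)
    with vision.ObjectDetector.create_from_options(options) as detector:
        image = mp.Image.create_from_file(IMAGE_PATH)
        # List every detection above the threshold, not just the first one.
        for detection in detector.detect(image).detections:
            bbox = detection.bounding_box
            category = detection.categories[0]
            print(category.category_name, round(category.score, 2),
                  "box:", bbox.origin_x, bbox.origin_y, bbox.width, bbox.height)

if __name__ == "__main__":
    main()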
Comparison with other models
ObjectDetection.py
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
# Place the images in the "image" folder.
INPUT_IMAGE = './image/cat.jpeg'
MODEL_LIST = [
    './model/efficientdet_lite0_int8.tflite',     # EfficientDet-Lite0 (int8)
    './model/efficientdet_lite0_float32.tflite',  # EfficientDet-Lite0 (float32)
    './model/efficientdet_lite2_int8.tflite',     # EfficientDet-Lite2 (int8)
    './model/efficientdet_lite2_float16.tflite',  # EfficientDet-Lite2 (float16)
    './model/efficientdet_lite2_float32.tflite',  # EfficientDet-Lite2 (float32)
    './model/ssd_mobilenet_v2_int8.tflite',       # SSDMobileNet-V2 (int8)
    './model/ssd_mobilenet_v2_float32.tflite'     # SSDMobileNet-V2 (float32)
]
def main():
    for model_path in MODEL_LIST:
base_options = python.BaseOptions(model_asset_path=model_path)
options = vision.ObjectDetectorOptions(base_options=base_options, score_threshold=0.5)
detector = vision.ObjectDetector.create_from_options(options)
image = mp.Image.create_from_file(INPUT_IMAGE)
detection_result = detector.detect(image)
print("model:[", model_path, "]")
print(detection_result.detections[0].categories)
if __name__ == "__main__":
main()
model:[ ./model/efficientdet_lite0_int8.tflite ]
[Category(index=None, score=0.80078125, display_name=None, category_name='cat')]
model:[ ./model/efficientdet_lite0_float32.tflite ]
[Category(index=None, score=0.7745001316070557, display_name=None, category_name='cat')]
model:[ ./model/efficientdet_lite2_int8.tflite ]
[Category(index=None, score=0.7109375, display_name=None, category_name='cat')]
model:[ ./model/efficientdet_lite2_float16.tflite ]
[Category(index=None, score=0.7353924512863159, display_name=None, category_name='cat')]
model:[ ./model/efficientdet_lite2_float32.tflite ]
[Category(index=None, score=0.7346012592315674, display_name=None, category_name='cat')]
model:[ ./model/ssd_mobilenet_v2_int8.tflite ]
[Category(index=None, score=0.6182375550270081, display_name=None, category_name='teddy bear')]
model:[ ./model/ssd_mobilenet_v2_float32.tflite ]
[Category(index=None, score=0.6182098984718323, display_name=None, category_name='teddy bear')]
8. Image classification
The model can be downloaded here (EfficientNet-Lite0 (float 32)).
ImageClassification.py
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python.components import processors
from mediapipe.tasks.python import vision
IMAGE_FILENAMES = [
'./image/cat.jpeg',
'./image/dog.jpg'
]
model_path = './model/efficientdet_lite0_float16.tflite'
def main():
base_options = python.BaseOptions(model_asset_path=model_path)
options = vision.ImageClassifierOptions(base_options=base_options, max_results=4)
classifier = vision.ImageClassifier.create_from_options(options)
for image_name in IMAGE_FILENAMES:
image = mp.Image.create_from_file(image_name)
classification_result = classifier.classify(image)
print("[", image_name, "]:")
for result in classification_result.classifications[0].categories:
print("result:", result)
if __name__ == "__main__":
main()
8-1. Running it
$ python ImageClassification.py
[ ./image/cat.jpeg ]:
result: Category(index=258, score=0.09676501899957657, display_name='', category_name='Samoyed')
result: Category(index=850, score=0.08314952254295349, display_name='', category_name='teddy')
result: Category(index=265, score=0.05580028146505356, display_name='', category_name='toy poodle')
result: Category(index=283, score=0.0362594872713089, display_name='', category_name='Persian cat')
[ ./image/dog.jpg ]:
result: Category(index=207, score=0.27572956681251526, display_name='', category_name='golden retriever')
result: Category(index=208, score=0.20771914720535278, display_name='', category_name='Labrador retriever')
result: Category(index=222, score=0.05555836856365204, display_name='', category_name='kuvasz')
result: Category(index=267, score=0.03965079411864281, display_name='', category_name='standard poodle')
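The scores for the cat image are all quite low, so every guess makes it into the output. As with the other classifier tasks, ImageClassifierOptions should also accept a score_threshold to drop low-confidence categories; treating that option name as an assumption, a minimal sketch:

import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

MODEL_PATH = './model/efficientdet_lite0_float16.tflite'
IMAGE_PATH = './image/dog.jpg'

def main():
    base_options = python.BaseOptions(model_asset_path=MODEL_PATH)
    # score_threshold is assumed to behave as in the object detector:
    # categories scoring below it are dropped from the result.
    options = vision.ImageClassifierOptions(
        base_options=base_options, max_results=4, score_threshold=0.1)
    with vision.ImageClassifier.create_from_options(options) as classifier:
        image = mp.Image.create_from_file(IMAGE_PATH)
        for category in classifier.classify(image).classifications[0].categories:
            print(category.category_name, round(category.score, 3))

if __name__ == "__main__":
    main()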