## Huggingface版CLIPのマニュアル
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
logits_per_image = outputs.logits_per_image # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
## 使ってみる
electron@diynoMacBook-Pro ~ % python3
Python 3.9.6 (default, Jun 29 2021, 06:20:32)
[Clang 12.0.0 (clang-1200.0.32.29)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> from PIL import Image
>>> import requests
>>> from transformers import CLIPProcessor, CLIPModel
>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 3.98k/3.98k [00:00<00:00, 931kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 605M/605M [00:10<00:00, 60.1MB/s]
>>>
>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 316/316 [00:00<00:00, 213kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 862k/862k [00:02<00:00, 412kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 525k/525k [00:00<00:00, 731kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 389/389 [00:00<00:00, 209kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 568/568 [00:00<00:00, 394kB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1.49M/1.49M [00:01<00:00, 1.22MB/s]
>>>
画像1つ目
- 指定キーワード:"a photo of a cat", "a photo of a dog"
- 該当確率: 0.9949, 0.0051
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>>
>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
>>> outputs = model(**inputs)
>>>
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
>>> print(logits_per_image)
tensor([[24.5701, 19.3049]], grad_fn=<PermuteBackward>)
>>>
>>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
>>> print(probs)
tensor([[0.9949, 0.0051]], grad_fn=<SoftmaxBackward>)
>>>
- 指定キーワード:"cute", "beautiful", "scaring", "awful", "miserable"
- 該当確率: 0.2234, 0.0775, 0.1882, 0.1565, 0.3543
>>> input2 = processor(text=["cute", "beautiful", "scaring", "awful", "miserable"], images=image, return_tensors="pt", padding=True)
>>>
>>> outputs2 = model(**input2)
>>>
>>> logits_per_image2 = outputs2.logits_per_image # this is the image-text similarity score
>>> probs2 = logits_per_image2.softmax(dim=1) # we can take the softmax to get the label probabilities
>>> print(probs2)
tensor([[0.2234, 0.0775, 0.1882, 0.1565, 0.3543]], grad_fn=<SoftmaxBackward>)
>>>
- 指定キーワード:"dog", "cat", "pig", "human", "desk", "chair"
- 該当確率: 0.0038, 0.5642, 0.0087, 0.1428, 0.0242, 0.2562
>>> input3 = processor(text=["dog", "cat", "pig", "human", "desk", "chair"], images=image, return_tensors="pt", padding=True)
>>> outputs3 = model(**input3)
>>>
>>> logits_per_image3 = outputs3.logits_per_image # this is the image-text similarity score
>>> probs3 = logits_per_image3.softmax(dim=1) # we can take the softmax to get the label probabilities
>>> print(probs3)
tensor([[0.0038, 0.5642, 0.0087, 0.1428, 0.0242, 0.2562]],
grad_fn=<SoftmaxBackward>)
>>>
画像2つ目
- 指定キーワード:"a gate", "a pyramid", "a building", "a house", "human", "zoo", "animal", "car"
- 該当確率: 0.0086, 0.0379, 0.0007, 0.0111, 0.4692, 0.1291, 0.3180, 0.0253
>>> url2 = "https://upload.wikimedia.org/wikipedia/commons/thumb/8/8d/Paris_July_2011-30.jpg/500px-Paris_July_2011-30.jpg"
>>> image2 = Image.open(requests.get(url2, stream=True).raw)
>>>
>>> inputs_new = processor(text=["a gate", "a pyramid", "a building", "a house", "human", "zoo", "animal", "car"], images=image2, return_tensors="pt", padding=True)
>>>
>>> outputs_new = model(**inputs_new)
>>> logits_per_image_new = outputs_new.logits_per_image
>>> print(logits_per_image_new)
tensor([[17.8990, 19.3880, 15.3294, 18.1629, 21.9030, 20.6123, 21.5141, 18.9841]],
grad_fn=<PermuteBackward>)
>>>
>>> probs_new = logits_per_image_new.softmax(dim=1)
>>> print(probs_new)
tensor([[0.0086, 0.0379, 0.0007, 0.0111, 0.4692, 0.1291, 0.3180, 0.0253]],
grad_fn=<SoftmaxBackward>)
>>>
画像3つ目
- 指定キーワード:"cat", "dog", "raccoon dog", "a house", "human", "zoo", "car"
- 該当確率: 0.4706, 0.0458, 0.0059, 0.0036, 0.1275, 0.0736, 0.2730
>>> url3 = "https://www.tv-asahi.co.jp/doraemon/cast/img/doraemon.jpg"
>>> image3 = Image.open(requests.get(url3, stream=True).raw)
>>>
>>> inputs_new2 = processor(text=["cat", "dog", "raccoon dog", "a house", "human", "zoo", "car"], images=image3, return_tensors="pt", padding=True)
>>> outputs_new2 = model(**inputs_new2)
>>> logits_per_image_new2 = outputs_new2.logits_per_image
>>> probs_new2 = logits_per_image_new2.softmax(dim=1)
>>> print(probs_new2)
tensor([[0.4706, 0.0458, 0.0059, 0.0036, 0.1275, 0.0736, 0.2730]],
grad_fn=<SoftmaxBackward>)
>>>