More than 5 years have passed since last update.

Instagramのアカウントからプロファイルを取得する

Posted at 2020-07-04

概要

Instagramの投稿に付帯しているハッシュタグから投稿者のIDを取得し、そこから投稿者のプロファイルを取得します。公式のInstagram APIを使わずに他人の投稿数、フォロー・フォロワー数、バイオグラフィーなどを取得することができます。今回はRでの方法を紹介します。ただしFacebook関連のAPIはたびたび制限が加えられており、この方法も使用できなくなっている可能性があるので、その点ご了承ください。

作業手順

１．投稿から投稿者のIDを取得する

今回はInstaCrawlRを使用しました。ここからダウンロードできます。
https://github.com/JonasSchroeder/InstaCrawlR

２．投稿者のIDから投稿者名を取得する

GraphQLでクエリを実行し、結果を取得します。
https://www.instagram.com/graphql/query/?query_hash=c9100bf9110dd6361671f113dd02e7d6&variables={"user_id":"ここにIDを入れる","include_chaining":false,"include_reel":true,"include_suggested_users":false,"include_logged_out_extras":false,"include_highlight_reels":false,"include_related_profiles":false}

３．投稿者名からプロファイルを取得する

投稿者名から、各プロファイルを取得します。
https://www.instagram.com/"ここに投稿者名を入れる"/?__a=1

導入例

順を追って実行していきます。まずInstaCrawlRを実行します。

jsonReader.R


# ------------------------------------------------------------------------
# Part of InstaCrawlR
# Git Hub: https://github.com/JonasSchroeder/InstaCrawlR
# Code by Jonas Schröder
# See ReadME for instructions and examples
# Version 3: Additional column for export (post_url based on shortlinks)
# Last Updated July 2019
# ------------------------------------------------------------------------

library(jsonlite)
library(stringr)
library("jpeg")
library(tidyr)
library(utf8)

# ---------------------------------------------------------
# Download JSON File from Instagram for a specific Hashtag
# ---------------------------------------------------------
# Hashtag to crawl -- change this value to search for a different tag.
hashtag <- "sponsored"

# First page of the hashtag feed, requested as JSON.
url_start <- str_glue("http://instagram.com/explore/tags/{hashtag}/?__a=1")
json <- fromJSON(url_start)

# Keep the media edge, its pagination cursor and the post nodes as globals.
edge_hashtag_to_media <- json[["graphql"]][["hashtag"]][["edge_hashtag_to_media"]]
end_cursor <- edge_hashtag_to_media[["page_info"]][["end_cursor"]]
posts <- edge_hashtag_to_media[["edges"]][["node"]]

# -----------------------------
# Extract Information per Post
# -----------------------------
# Position (1-based) at which the next extracted post will be stored.
index <- 1

# One empty list per extracted field; entry k of each list describes post k.
post_id <- post_url <- post_text <- post_time <- list()
post_likes <- post_owner <- post_img_url <- list()

#' Extract the fields of every post on the current page, then request more.
#'
#' Reads the global `posts` data frame and appends one entry per row to the
#' `post_*` result lists, starting at position `index`. The updated lists and
#' the advanced `index` are written to the global environment, after which
#' `getNewPosts()` is called to continue with the following page.
#'
#' Every row of the page is extracted, including the last one. If the page
#' has no rows the collected state is published and the function returns
#' without requesting anything further.
#'
#' @param index Position in the result lists where the next post is stored.
#' @return `NULL`, invisibly, when the current page is empty; otherwise the
#'   result of handing over to `getNewPosts()`.
extractInfo <- function(index){
    print("extractInfo function called")

    # nrow() is NULL when `posts` is not a data frame (e.g. an empty result).
    maxrows <- nrow(posts)
    if(is.null(maxrows)){
        maxrows <- 0
    }

    for(i in seq_len(maxrows)){
        caption <- posts$edge_media_to_caption$edges[[i]][["node"]][["text"]]
        if(length(caption)==0){
            post_text[index] <- "no-text"
            print("no text in post")
        } else {
            # Flatten line breaks so each caption stays on one line.
            post_text[index] <- gsub("\n", " ", caption)
        }

        # NOTE(review): columns are addressed by position (5, 7, 9, 11, 12);
        # presumably shortcode, timestamp, image URL, likes and owner --
        # confirm against the current response layout.
        post_id_temp <- posts[i,5]
        post_url[index] <-  str_glue("http://instagram.com/p/{post_id_temp}")
        post_id[index] <- post_id_temp
        post_time[index] <- toString(as.POSIXct(posts[i,7], origin="1970-01-01"))
        post_img_url[index] <- posts[i,9]
        post_likes[index] <- posts[i,11]
        post_owner[index] <- posts[i,12]

        #optional: download image
        # NOTE(review): this uses column 8 while post_img_url uses column 9.
        #img_dir <- str_glue("images/{index}_{hashtag}_post_img.jpg")
        #download.file(posts[i,8], img_dir, mode = 'wb')

        index <- index + 1
        # The crawl is designed to be stopped by hand; insert stop() here
        # if it should end automatically.
    }

    # Publish the local copies so progress survives an interrupted crawl.
    assign("index", index, envir = .GlobalEnv)
    assign("post_id", post_id, envir = .GlobalEnv)
    assign("post_text", post_text, envir = .GlobalEnv)
    assign("post_time", post_time, envir = .GlobalEnv)
    assign("post_img_url", post_img_url, envir = .GlobalEnv)
    assign("post_url", post_url, envir = .GlobalEnv)
    assign("post_likes", post_likes, envir = .GlobalEnv)
    assign("post_owner", post_owner, envir = .GlobalEnv)

    if(maxrows == 0){
        message("No posts on this page - crawl finished")
        return(invisible(NULL))
    }

    getNewPosts(index)
}

# ------------------------------
# Get New Posts from Instagram
# ------------------------------
#' Download the next page of posts and continue extracting.
#'
#' Builds the next-page URL from the global `url_start` and `end_cursor`,
#' downloads it, stores the new cursor and post nodes in the global
#' environment, waits one second and hands over to `extractInfo()`.
#'
#' If the current cursor is NULL, NA or an empty string there is nothing to
#' request, so the function reports that and returns instead of building an
#' unusable URL.
#'
#' @param index Position in the result lists where the next post is stored;
#'   passed on unchanged to `extractInfo()`.
#' @return `NULL`, invisibly, when no cursor is available; otherwise the
#'   result of `extractInfo(index)`.
getNewPosts <- function(index){
    print("getNewPosts function called")

    # `end_cursor` is the global cursor left by the previous page.
    if(length(end_cursor) == 0 || is.na(end_cursor[1]) || !nzchar(end_cursor[1])){
        message("No further page cursor - crawl finished")
        return(invisible(NULL))
    }

    url_next <- str_glue("{url_start}&max_id={end_cursor}")
    page <- fromJSON(url_next)
    media <- page$graphql$hashtag$edge_hashtag_to_media

    # Replace the globals consumed by extractInfo() and by the next call here.
    assign("end_cursor", media$page_info$end_cursor, envir = .GlobalEnv)
    assign("posts", media$edges$node, envir = .GlobalEnv)

    print(index)
    # Pause between page requests.
    Sys.sleep(1)
    extractInfo(index)
}

# Start the crawl: extract the posts already loaded into the global `posts`,
# storing results from position `index` onward.
extractInfo(index)

ここまで実行すると、入力したハッシュタグが付帯している投稿をクロールしてきます。
コード内でも書いたように、実行しても自動で停止することはないので、必要なクロール数に達したらSTOPボタンを押すか、コードを書き換える必要があります。

jsonReader.R

# ---------------------------------------------
# Build the post data frame (optional CSV export)
# ---------------------------------------------
# Combine entry k of every result list into one vector per post, then stack
# those vectors as the rows of a data frame.
# NOTE(review): the name `table` masks base::table() for the rest of the
# script; it is kept because later code refers to it.
table <- do.call(
    rbind.data.frame,
    Map("c", post_id, post_url, post_img_url, post_likes, post_owner,
        post_text, post_time)
)
names(table) <- c("ID", "Post_URL", "Img_URL", "Likes", "Owner", "Text", "Date")

# Optional export -- uncomment to write the table to a CSV file:
# time <- Sys.time()
# filename <- str_glue("table-{hashtag}-{time}.csv")
# write.csv(table, filename, fileEncoding = "UTF-8")

すぐにCSVに保存する必要はありませんが、データフレームを作成しておくと便利です。
投稿者のIDはここのpost_ownerに入っています。

次にこのpost_ownerから投稿者名（いわゆるユーザー名）を取得し、投稿者のプロファイルを取得します。今回は投稿者名、バイオグラフィー、フォロー・フォロワー数、投稿数、ハイライトリール数を取得します。
コードとしては単純で、上記と同じfromJSONを使用します。

crawlUserName.R

# -----------------------------
# Extract User Name From User ID
# -----------------------------
# For every row of `table`, look up the user name that belongs to the owner
# id in column 5 (named "Owner") and store it in User_Name at the same
# position. Each distinct owner id is requested only once; rows that repeat
# an owner reuse the cached answer, which keeps the request count down.
User_Name <- list()
Biography <- list()
Follower <- list()
Following <- list()
Count_Post <- list()
Count_HighlightReel <- list()

maxrows <- nrow(table)

# Owner ids resolved so far: names are owner ids, values are user names.
resolved_names <- list()

for (j in seq_len(maxrows)) {

    Owner_id <- as.character(table[j, 5])

    if (!(Owner_id %in% names(resolved_names))) {
        url_username <- paste0("https://www.instagram.com/graphql/query/?query_hash=c9100bf9110dd6361671f113dd02e7d6&variables={%22user_id%22:%22", Owner_id, "%22,%22include_chaining%22:false,%22include_reel%22:true,%22include_suggested_users%22:false,%22include_logged_out_extras%22:false,%22include_highlight_reels%22:false,%22include_related_profiles%22:false}")
        json2 <- fromJSON(url_username)
        username <- json2[["data"]][["user"]][["reel"]][["user"]][["username"]]

        # A missing user name would otherwise be assigned as NULL, which
        # drops the list entry and breaks the row alignment; store NA instead.
        if (is.null(username)) {
            username <- NA_character_
        }
        resolved_names[[Owner_id]] <- username

        # Pause after each request to stay below the API request limit.
        Sys.sleep(1)
    }

    User_Name[[j]] <- resolved_names[[Owner_id]]
    print(j)
}

上下のコードは同一のforループ内で記述できます。
（あえて記述していない理由は後述します）

crawlProfile.R

 
# Fetch the public profile of every poster and store biography, follower and
# following counts, post count and highlight-reel count at the position of
# the corresponding row. Each distinct user name is requested only once;
# repeated users reuse the cached profile, which keeps the request count down.

# Replace a missing (NULL) JSON field by NA so every list keeps one entry
# per row.
na_if_null <- function(x) if (is.null(x)) NA else x

# Profiles downloaded so far, keyed by user name.
profile_cache <- list()

# Placeholder used for rows without a usable user name.
empty_profile <- list(biography = NA, follower = NA, following = NA,
                      count_post = NA, count_hr = NA)

for (k in seq_len(maxrows)) {
    # unlist() yields NULL for a missing or NULL entry instead of failing.
    name_k <- unlist(User_Name[k])
    has_name <- length(name_k) > 0 && !is.na(name_k[1]) && nzchar(name_k[1])

    if (!has_name) {
        # No user name to look up: record NAs rather than requesting a
        # meaningless URL.
        profile <- empty_profile
    } else {
        name_k <- as.character(name_k[1])

        if (!(name_k %in% names(profile_cache))) {
            url_profile <- paste0("https://www.instagram.com/", name_k, "/?__a=1")
            json3 <- fromJSON(url_profile)
            user <- json3[["graphql"]][["user"]]

            profile_cache[[name_k]] <- list(
                biography = na_if_null(user[["biography"]]),
                follower = na_if_null(user[["edge_followed_by"]][["count"]]),
                following = na_if_null(user[["edge_follow"]][["count"]]),
                count_post = na_if_null(user[["edge_owner_to_timeline_media"]][["count"]]),
                count_hr = na_if_null(user[["highlight_reel_count"]])
            )

            # Pause after each request to stay below the API request limit.
            Sys.sleep(1)
        }

        profile <- profile_cache[[name_k]]
    }

    Biography[[k]] <- profile$biography
    Follower[[k]] <- profile$follower
    Following[[k]] <- profile$following
    Count_Post[[k]] <- profile$count_post
    Count_HighlightReel[[k]] <- profile$count_hr

    print(k)
}

# Combine entry k of every profile list into one vector per row, then stack
# those vectors as the rows of a data frame.
table2 <- do.call(
    rbind.data.frame,
    Map("c", User_Name, Biography, Follower, Following, Count_Post,
        Count_HighlightReel)
)
names(table2) <- c("User_Name", "Bio", "Follower", "Following", "Count_Post", "Count_HR")

# Keep only the first row for each user name (drop repeated posters).
table2 <- table2[!duplicated(table2[["User_Name"]]), ]

未解決の問題点

コード内にも記述しましたが、実行するとAPIのリクエスト制限によるエラーが（どこかで）発生します。
私が行った場合だとcrawlUserName.Rは制限にかかりませんでしたが、crawlProfile.Rは200回ほど行ったところでエラーを吐きました。とほほ。

Sys.sleep()の引数（待機秒数）を調整するか、他の解決方法を見つけてくださるとうれしいです。

お世話になった参考文献

Instagram のあるアカウントの Follower をクロールする
https://qiita.com/kon_now/items/aa6fdd58c86664d55113

Instagram？__ a = 1 URLが機能しなくなり、データを取得するgraphql / queryの問題
https://www.it-swarm.dev/ja/javascript/instagram？-a-1-urlが機能しなくなり、データを取得するgraphql-queryの問題/837511091/

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up