この辺り
を参考に、Beebox + NUC のベアボーン PC のみで 3 ノードの Nutanix CE クラスタを構築した。
4 ノードに拡張 (Expand Cluster) する際にエラーになったが、無理やり拡張した時のメモ。
バージョンは ce-2017.02.23-stable
状況
Prism から Expand Cluster を実行すると、こうなる
Failure in pre expand-cluster tests. Errors: Failed to perform network validation!
と言われてもどうしたらいいのか分からない。
Forum を見ると ncli を使ってうまくいった人がいる様子。
- http://next.nutanix.com/t5/Discussion-Forum/Trouble-expanding-cluster/m-p/18034
- http://virtual-hike.com/add-new-node-nutanix-cluster-using-cli/
が、私の環境だとそれだけではダメだった。ncli を使うと、こうなる。
$ ncli cluster discover-nodes
Cluster Id :
Controller Vm Address : 192.168.37.54
Current Network Interface : eth0
Foundation Version : foundation-3.5-7ca0fb1f
Hypervisor : kvm
Hypervisor Address : 192.168.37.44
Hypervisor Version : el7.nutanix.20170222
Ip : fe80::526b:8dff:fef4:49e0%eth0
Ipmi Address :
Node Position : A
Node Uuid : 4baf4768-561d-4e49-ab01-c2eda8c2fb39
Nos Version : 2017.02.23
Rackable Unit Model : CommunityEdition
Rackable Unit Serial : 5452d712
Sed Node : false
Svm Ip : 192.168.37.54
$ ncli cluster add-node node-uuid=4baf4768-561d-4e49-ab01-c2eda8c2fb39
Error: Unknown error, received null response from Genesis
ログを見てみる。
DEBUG 0327 01:16:04.136 main GenesisProxy.postData:863 Complete JSON response: {".return": [{"rackable_unit_serial": "5452d712", "node_position": "A", "ip": "fe80::526b:8dff:fef4:49e0%eth0", "svm_ip": "192.168.37.54", "cluster_id": "", "hypervisor_version": "el7.nutanix.20170222", "rackable_unit_model": "CommunityEdition", "foundation_version": "foundation-3.5-7ca0fb1f", "node_uuid": "4baf4768-561d-4e49-ab01-c2eda8c2fb39", "current_network_interface": "eth0", "hypervisor": "kvm", "nos_version": "2017.02.23", "current_cvm_vlan_tag": null, "attributes": {"is_model_supported": true}}]}
DEBUG 0327 01:16:04.137 main GenesisProxy.discoverNodes:161 JSON response for discover nodes: [{"ip":"fe80::526b:8dff:fef4:49e0%eth0","current_network_interface":"eth0","cluster_id":"","svm_ip":"192.168.37.54","nos_version":"2017.02.23","hypervisor_version":"el7.nutanix.20170222","rackable_unit_model":"CommunityEdition","hypervisor":"kvm","attributes":{"is_model_supported":true},"rackable_unit_serial":"5452d712","node_position":"A","node_uuid":"4baf4768-561d-4e49-ab01-c2eda8c2fb39","foundation_version":"foundation-3.5-7ca0fb1f","current_cvm_vlan_tag":null}]
DEBUG 0327 01:16:04.138 main GenesisProxy.getSEDDriveStatus:927 Executing genesis rpc to query sed drive status: {".oid":"NodeManager", ".method":"disks_are_self_encrypting_drives", ".kwargs":{}}
DEBUG 0327 01:16:04.138 main GenesisProxy.postData:821 Rpc URL: http://127.0.0.1:2100/proxy/fe80::526b:8dff:fef4:49e0%eth0/jsonrpc
DEBUG 0327 01:16:04.257 main GenesisProxy.postData:863 Complete JSON response: {".return": {"S344J9GG101013": false, "160894441321": false}}
DEBUG 0327 01:16:04.257 main GenesisProxy.getSEDDriveStatus:935 Genesis rpc response for sed drive status: {"S344J9GG101013":false,"160894441321":false}
DEBUG 0327 01:16:04.258 main GenesisProxy.enrichDiscovedNodes:218 Getting IP addresses of node fe80::526b:8dff:fef4:49e0%eth0
DEBUG 0327 01:16:04.258 main GenesisProxy.enrichDiscovedNodes:224 JSON request to get node ip addresses: {".oid":"NodeManager", ".method":"get_ip", ".kwargs":{}}
DEBUG 0327 01:16:04.259 main GenesisProxy.postData:821 Rpc URL: http://127.0.0.1:2100/proxy/fe80::526b:8dff:fef4:49e0%eth0/jsonrpc
DEBUG 0327 01:16:04.330 main GenesisProxy.postData:863 Complete JSON response: {".return": [{"netmask": "255.255.255.0", "gateway": "192.168.37.254", "address": "192.168.37.54"}, {"netmask": "255.255.255.0", "gateway": "192.168.37.254", "address": "192.168.37.44"}, {"netmask": null, "gateway": null, "address": null}]}
DEBUG 0327 01:16:04.330 main GenesisProxy.enrichDiscovedNodes:232 JSON response for get node ip addresses: [{"address":"192.168.37.54","netmask":"255.255.255.0","gateway":"192.168.37.254"},{"address":"192.168.37.44","netmask":"255.255.255.0","gateway":"192.168.37.254"},{"address":null,"netmask":null,"gateway":null}]
DEBUG 0327 01:16:04.506 main GenesisProxy.configureNode:420 JSON request for configure node: {".oid":"NodeManager", ".method":"configure_ip", ".kwargs":{"svm_ip":{"address":"192.168.37.54","netmask":"255.255.255.0","gateway":"192.168.37.254"},"hypervisor_ip":{"address":"192.168.37.44","netmask":"255.255.255.0","gateway":"192.168.37.254"},"ipmi_ip":{"address":""}}}
DEBUG 0327 01:16:04.511 main GenesisProxy.postData:821 Rpc URL: http://127.0.0.1:2100/proxy/fe80::526b:8dff:fef4:49e0%eth0/jsonrpc
DEBUG 0327 01:16:04.666 main GenesisProxy.postData:863 Complete JSON response: {".error": "Caught exception: equal_to_parameters() takes exactly 4 non-keyword arguments (2 given)\nTraceback (most recent call last):\n File \"/home/afg/src/main/builds/build-ce-2017.02.23-stable-release/python-tree/bdist.linux-x86_64/egg/util/net/rpc.py\", line 316, in handle_jsonrpc\n File \"/home/afg/src/main/builds/build-ce-2017.02.23-stable-release/python-tree/bdist.linux-x86_64/egg/cluster/genesis/node_manager.py\", line 5396, in configure_ip\nTypeError: equal_to_parameters() takes exactly 4 non-keyword arguments (2 given)\n", ".exception": "equal_to_parameters() takes exactly 4 non-keyword arguments (2 given)"}
ERROR 0327 01:16:04.677 main GenesisProxy.postData:882 Unknown error, received null response from Genesis
com.nutanix.prism.cli.managers.genesis.GenesisException: Unknown error, received null response from Genesis
at com.nutanix.prism.cli.managers.genesis.GenesisProxy.postData(GenesisProxy.java:872)
at com.nutanix.prism.cli.managers.genesis.GenesisProxy.configureNode(GenesisProxy.java:421)
at com.nutanix.prism.cli.operations.ClusterHostCommonOps.addNode(ClusterHostCommonOps.java:83)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.nutanix.cli.base.execution.ExecutorImpl.executeCommand(ExecutorImpl.java:148)
at com.nutanix.cli.base.execution.NonInteractiveExecutor.execute(NonInteractiveExecutor.java:17)
at com.nutanix.cli.base.AbstractCli.execute(AbstractCli.java:65)
at com.nutanix.prism.cli.Ncli.main(Ncli.java:142)
ERROR 0327 01:16:04.678 main GenesisProxy.configureNode:433 Unknown error, received null response from Genesis
com.nutanix.prism.cli.managers.genesis.GenesisException: Unknown error, received null response from Genesis
at com.nutanix.prism.cli.managers.genesis.GenesisProxy.postData(GenesisProxy.java:883)
at com.nutanix.prism.cli.managers.genesis.GenesisProxy.configureNode(GenesisProxy.java:421)
at com.nutanix.prism.cli.operations.ClusterHostCommonOps.addNode(ClusterHostCommonOps.java:83)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.nutanix.cli.base.execution.ExecutorImpl.executeCommand(ExecutorImpl.java:148)
at com.nutanix.cli.base.execution.NonInteractiveExecutor.execute(NonInteractiveExecutor.java:17)
at com.nutanix.cli.base.AbstractCli.execute(AbstractCli.java:65)
at com.nutanix.prism.cli.Ncli.main(Ncli.java:142)
ERROR 0327 01:16:04.717 main PrintUtils.printException:5861 Unknown error, received null response from Genesis
com.nutanix.prism.cli.managers.genesis.GenesisException: Unknown error, received null response from Genesis
at com.nutanix.prism.cli.managers.genesis.GenesisProxy.configureNode(GenesisProxy.java:434)
at com.nutanix.prism.cli.operations.ClusterHostCommonOps.addNode(ClusterHostCommonOps.java:83)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.nutanix.cli.base.execution.ExecutorImpl.executeCommand(ExecutorImpl.java:148)
at com.nutanix.cli.base.execution.NonInteractiveExecutor.execute(NonInteractiveExecutor.java:17)
at com.nutanix.cli.base.AbstractCli.execute(AbstractCli.java:65)
at com.nutanix.prism.cli.Ncli.main(Ncli.java:142)
equal_to_parameters() takes exactly 4 non-keyword arguments (2 given)
えー、引数足りていないっぽいんですけど・・・
調査
Prism から実行した時と ncli でエラー内容が違いそうなので、何をやっているのかパケットを見てみる。
ncli は 自身 (127.0.0.1)
の genesis 宛てに HTTP POST したものが proxy されて 4ノード目の 2100/tcp
に届いていた。
アドレスは IPv6 link local で /jsonrpc
に以下のような JSON データが飛んでいる。
{
".oid": "NodeManager",
".method": "configure_ip",
".kwargs": {
"svm_ip": {
"address": "192.168.37.54",
"netmask": "255.255.255.0",
"gateway": "192.168.37.254"
},
"hypervisor_ip": {
"address": "192.168.37.44",
"netmask": "255.255.255.0",
"gateway": "192.168.37.254"
},
"ipmi_ip": {
"address": ""
}
}
}
対して Prism は 4ノード目に configure_ip
を叩きに行っておらず、自身 (127.0.0.1)
宛てに expand_cluster
を叩いていた。
(この後 configure_ip
も叩くのかもしれないが)
{
".oid": "ClusterManager",
".method": "expand_cluster",
".kwargs": {
"node_params": {
"node_list": [
{
"node_uuid": "4baf4768-561d-4e49-ab01-c2eda8c2fb39",
"block_id": "5452d712",
"node_position": "A",
"cvm_ip": "192.168.37.54",
"hypervisor_ip": "192.168.37.44",
"ipmi_ip": "0.0.0.0",
"digital_certificate_map_list": [],
"model": "CommunityEdition",
"is_light_compute": false,
"hypervisor_type": "kvm",
"hypervisor_version": "el7.nutanix.20170222",
"nos_version": "2017.02.23",
"current_network_interface": "eth0",
"robo_mixed_hypervisor": false
}
]
},
"extra_params": {},
"skip_add_node": false,
"skip_pre_expand_checks": false
}
}
そもそも呼んでいるメソッドが違うようだが、ipmi_ip
の値の違いが気になる。
ncli から飛んだ HTTP POST を curl
で再現してみる。
## genesis proxy 越しに飛ぶように投げたが、直接 4ノード目に投げても良かったはず
$ curl -v -XPOST --data "$(cat ncli.json)" 'http://127.0.0.1:2100/proxy/fe80::526b:8dff:fef4:49e0%eth0/jsonrpc'
* About to connect() to 127.0.0.1 port 2100 (#0)
* Trying 127.0.0.1... connected
* Connected to 127.0.0.1 (127.0.0.1) port 2100 (#0)
> POST /proxy/fe80::526b:8dff:fef4:49e0%eth0/jsonrpc HTTP/1.1
> User-Agent: curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.21 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2
> Host: 127.0.0.1:2100
> Accept: */*
> Content-Length: 376
> Content-Type: application/x-www-form-urlencoded
>
< HTTP/1.1 200 OK
< Server: SimpleHTTP/0.6 Python/2.6.6
< Date: Mon, 27 Mar 2017 08:22:11 GMT
< expires: Mon, 27 Mar 2017 08:22:10 GMT
< content-type: text/plain
< cache-control: no-cache
< Content-length: 629
<
* Connection #0 to host 127.0.0.1 left intact
* Closing connection #0
{".error": "Caught exception: equal_to_parameters() takes exactly 4 non-keyword arguments (2 given)\nTraceback (most recent call last):\n File \"/home/afg/src/main/builds/build-ce-2017.02.23-stable-release/python-tree/bdist.linux-x86_64/egg/util/net/rpc.py\", line 316, in handle_jsonrpc\n File \"/home/afg/src/main/builds/build-ce-2017.02.23-stable-release/python-tree/bdist.linux-x86_64/egg/cluster/genesis/node_manager.py\", line 5396, in configure_ip\nTypeError: equal_to_parameters() takes exactly 4 non-keyword arguments (2 given)\n", ".exception": "equal_to_parameters() takes exactly 4 non-keyword arguments (2 given)"}
同じエラーが再現したようだ。今度は ipmi_ip
を 0.0.0.0
にしてみる。
{
".oid": "NodeManager",
".method": "configure_ip",
".kwargs": {
"svm_ip": {
"address": "192.168.37.54",
"netmask": "255.255.255.0",
"gateway": "192.168.37.254"
},
"hypervisor_ip": {
"address": "192.168.37.44",
"netmask": "255.255.255.0",
"gateway": "192.168.37.254"
},
"ipmi_ip": {
"address": "0.0.0.0"
}
}
}
$ curl -v -XPOST --data "$(cat ncli.json)" 'http://127.0.0.1:2100/proxy/fe80::526b:8dff:fef4:49e0%eth0/jsonrpc'
* About to connect() to 127.0.0.1 port 2100 (#0)
* Trying 127.0.0.1... connected
* Connected to 127.0.0.1 (127.0.0.1) port 2100 (#0)
> POST /proxy/fe80::526b:8dff:fef4:49e0%eth0/jsonrpc HTTP/1.1
> User-Agent: curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.21 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2
> Host: 127.0.0.1:2100
> Accept: */*
> Content-Length: 383
> Content-Type: application/x-www-form-urlencoded
>
< HTTP/1.1 200 OK
< Server: SimpleHTTP/0.6 Python/2.6.6
< Date: Mon, 27 Mar 2017 08:22:59 GMT
< expires: Mon, 27 Mar 2017 08:22:59 GMT
< content-type: text/plain
< cache-control: no-cache
< Content-length: 25
<
* Connection #0 to host 127.0.0.1 left intact
* Closing connection #0
{".return": [true, null]}
どうやらエラーらしいものは出ていない。ipmi_ip
に関してはこっちが正しそうな気がするので、とりあえず間に割り込んで configure_ip
の時だけ ipmi_ip
の address を書き換えてみる。
対策
2100/tcp
を使っている genesis は listen port を変えられそうなので、genesis は別ポートで起動することにして、
2100/tcp
で待ち受けて特定の値だけを書き換える reverse proxy を作り、4ノード目で起動させる。
Go で以下のようなコードを書いて static link なバイナリを作って scp などで CVM に持っていく。
package main
import (
"bufio"
"bytes"
"encoding/json"
"io/ioutil"
"log"
"net/http"
"net/http/httputil"
)
// rp is the reverse proxy shared by all handlers. It is nil until main
// assigns it, which happens before the HTTP server starts accepting requests.
var rp *httputil.ReverseProxy
// ipconfig is a single address entry of a configure_ip request, i.e. the
// value stored under keys such as "svm_ip", "hypervisor_ip" or "ipmi_ip".
// Keys that are absent from the JSON leave the matching field as "".
type ipconfig struct {
	Address string `json:"address"`
	Netmask string `json:"netmask"`
	Gateway string `json:"gateway"`
}
// jsonrpc is the envelope of a JSON-RPC request as posted to /jsonrpc: the
// target object (".oid"), the method name (".method") and its keyword
// arguments (".kwargs").
//
// Kwargs is typed for configure_ip, where every argument is an address
// object. Decoding a request whose kwargs hold any other kind of value
// (for example a plain string) fails with an unmarshal error.
type jsonrpc struct {
	Oid    string              `json:".oid"`
	Method string              `json:".method"`
	Kwargs map[string]ipconfig `json:".kwargs"`
}
// rewriteaddr is the handler for /jsonrpc. It buffers the request body and,
// when the body is a NodeManager configure_ip call whose kwargs.ipmi_ip
// address is the empty string, replaces that address with "0.0.0.0" before
// passing the request on to the reverse proxy rp.
//
// All other requests are forwarded with their original body. This includes
// bodies that do not decode into jsonrpc (e.g. methods whose kwargs are not
// address objects): there is nothing to rewrite in them, so the proxy stays
// transparent instead of silently dropping the call.
//
// Failures that make forwarding impossible are reported to the client with
// an HTTP error status rather than an empty 200 response.
func rewriteaddr(w http.ResponseWriter, r *http.Request) {
	buf, err := ioutil.ReadAll(r.Body)
	if err != nil {
		// Without a complete body there is nothing meaningful to forward.
		log.Print(err)
		http.Error(w, err.Error(), http.StatusBadRequest)
		return
	}
	// ReadAll drained the body; put a fresh reader over the same bytes in
	// place so the reverse proxy can still send it upstream.
	r.Body = ioutil.NopCloser(bufio.NewReader(bytes.NewBuffer(buf)))

	var j jsonrpc
	if err := json.Unmarshal(buf, &j); err != nil {
		// Not a request shape we know how to rewrite: log it and pass the
		// untouched body through.
		log.Print(err)
		rp.ServeHTTP(w, r)
		return
	}

	if j.Oid == "NodeManager" && j.Method == "configure_ip" {
		if ipmi, ok := j.Kwargs["ipmi_ip"]; ok && ipmi.Address == "" {
			// Only the address is patched; any netmask/gateway sent along
			// with it is kept as received.
			ipmi.Address = "0.0.0.0"
			j.Kwargs["ipmi_ip"] = ipmi

			b, err := json.Marshal(j)
			if err != nil {
				log.Print(err)
				http.Error(w, err.Error(), http.StatusInternalServerError)
				return
			}
			r.Body = ioutil.NopCloser(bufio.NewReader(bytes.NewBuffer(b)))
			// The rewritten body has a different size than the one the
			// client announced, so the length must be updated to match.
			r.ContentLength = int64(len(b))
		}
	}

	rp.ServeHTTP(w, r)
}
// main builds the reverse proxy and serves it on port 2100. Every proxied
// request is redirected to http://:2101. Requests for /jsonrpc pass through
// rewriteaddr first; everything else goes to the proxy directly. The
// process exits through log.Fatal once ListenAndServe returns.
func main() {
	rp = &httputil.ReverseProxy{
		Director: func(req *http.Request) {
			req.URL.Host = ":2101"
			req.URL.Scheme = "http"
		},
	}

	http.Handle("/jsonrpc", http.HandlerFunc(rewriteaddr))
	http.Handle("/", rp)

	err := http.ListenAndServe(":2100", nil)
	log.Fatal(err)
}
## 4ノード目で genesis 起動ポートを変更
$ genesis stop
$ genesis --genesis_port=2101 start
## 別ホストでビルドした reverse proxy を起動
$ ./rewriter
rewriter プロセスを起動した状態で、既存クラスタ内の CVM から ncli によるノード追加を試す。
$ ncli cluster add-node node-uuid=4baf4768-561d-4e49-ab01-c2eda8c2fb39
Node added successfully
別のエラーが出るかと思っていたが、何と成功してしまった。
rewriter は 1回 unmarshal のエラーを出していたが、致命傷にはならずに済んだようだ。
(パケットを見ておくべきだった、失敗した・・・)
2017/03/27 04:08:06 json: cannot unmarshal string into Go value of type main.ipconfig
このまま rewriter を動かしていたくないので、後始末をする。
Ctrl-c で rewriter を止めて genesis を起動し直す。
$ genesis stop
$ genesis start
今のところ 4ノードクラスタとして普通に使えている。
そもそも IPMI があるハードでアドレスを振っていれば問題を踏まない気がする。
IPMI のないハードで 4ノードクラスタへの拡張をすること自体がレアケースかもしれない。