在OpenStack运维中,虚拟机热迁移、evacuate等操作有时会导致虚拟机脑裂,即同一个虚拟机同时在两个hypervisor上运行。在使用Ceph等共享存储时,这十有八九会造成虚拟机文件系统损坏:运气好时还能修复文件系统错误,严重时则数据混乱、虚拟机无法启动。为此,我写了一个Python脚本,用于检测OpenStack的hypervisor(KVM)上是否存在脑裂虚拟机。其原理是通过libvirt的API获取各个hypervisor上虚拟机的名字,再比较不同的hypervisor上是否有相同的虚拟机名字。脚本需要在控制节点运行(需要openrc环境变量文件),且该控制节点必须与计算节点配置好SSH密钥免密登录,并能通过/etc/hosts解析所有hypervisor的主机名。笔者测试在Liberty和Mitaka版本上能正常运行。
2018年10月25日更新:
1. 脚本内容变更如下
import re
import os
import libvirt
import json
from novaclient import client
from multiprocessing import Pool,Queue
from collections import Counter,defaultdict
# Queue used to collect results: getVM puts (node, vm_name) tuples on it and
# getVMList drains it.
q=Queue()
# Path of the openrc file that get_nova_creds parses for the OS_* settings.
EnvFile='/root/openrc'
def get_nova_creds():
    """Load the OpenStack credentials from the openrc file named by ``EnvFile``.

    Every ``export NAME=value`` line of the file is copied into ``os.environ``
    and the five settings passed to the nova client are returned.

    Returns:
        dict: the keys ``username``, ``api_key``, ``auth_url``, ``project_id``
        and ``region_name``, taken from ``OS_USERNAME``, ``OS_PASSWORD``,
        ``OS_AUTH_URL``, ``OS_TENANT_NAME`` and ``OS_REGION_NAME``.

    Raises:
        IOError: if ``EnvFile`` cannot be opened.
        KeyError: if one of the required ``OS_*`` variables is not defined.
    """
    # Capture name and value separately.  The previous str.strip("export")
    # removed any of the characters e/x/p/o/r/t from both ends (so an unquoted
    # value such as "support" became "su"), and splitting on every "=" cut
    # values that themselves contain "=".
    export_line = re.compile(r'^\s*export\s+([A-Za-z_][A-Za-z0-9_]*)=(.*)$')
    with open(EnvFile, 'r') as f:
        for line in f:
            match = export_line.match(line.rstrip('\r\n'))
            if not match:
                continue
            name, value = match.groups()
            value = value.strip()
            # Drop one surrounding pair of matching quotes, single or double.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            os.environ[name] = value
    return {
        'username': os.environ['OS_USERNAME'],
        'api_key': os.environ['OS_PASSWORD'],
        'auth_url': os.environ['OS_AUTH_URL'],
        'project_id': os.environ['OS_TENANT_NAME'],
        'region_name': os.environ['OS_REGION_NAME'],
    }
def getHypervisor():
    """Return the hostnames of the hypervisors that nova reports.

    Only hostnames containing ``node-<digits>.domain.tld`` are kept, and only
    the matching part of each hostname is returned.

    Returns:
        list: the matched hypervisor hostnames, in the order nova lists them.
    """
    # The pattern used to read 'node-d.domain.tld', which can only match the
    # literal letter "d" and therefore never matched a host like
    # node-4.domain.tld; the digit class and the literal dots are restored.
    pattern = re.compile(r'node-\d+\.domain\.tld')
    nova = client.Client('2', **get_nova_creds())
    hostnames = []
    for hypervisor in nova.hypervisors.list():
        match = pattern.search(hypervisor.hypervisor_hostname)
        if match:
            hostnames.append(match.group())
    return hostnames
def getVM(node):
    """Queue the name of every running domain found on one hypervisor.

    Connects to ``qemu+ssh://<node>/system`` and puts a ``(node, vm_name)``
    tuple on the module-level queue ``q`` for each domain whose state is
    ``VIR_DOMAIN_RUNNING``.

    Args:
        node: hostname of the hypervisor to inspect.

    Returns:
        None.  A connection failure is reported on stdout and the node is
        skipped.
    """
    try:
        virtcon = libvirt.open("qemu+ssh://%s/system" % node)
    except libvirt.libvirtError as e:
        # Report the unreachable node, then stop: there is no connection to
        # query (the code used to fall through to an unbound ``virtcon``).
        print("wrong to connect %s libvirt api" % node + ' ' + str(e))
        return
    try:
        for domain_id in virtcon.listDomainsID():
            try:
                vminfo = virtcon.lookupByID(domain_id)
                running = vminfo.state(0)[0] == libvirt.VIR_DOMAIN_RUNNING
                name = vminfo.name()
            except libvirt.libvirtError:
                # The domain went away between listing and lookup; skip it
                # instead of losing the remaining domains of this node.
                continue
            # Only running domains are queued.
            if running:
                q.put((node, name))
    finally:
        # Always release the connection, even if a query failed.
        virtcon.close()
def getVMList():
    """Drain the result queue ``q`` into a flat list and a per-node mapping.

    Returns:
        tuple: ``(names, by_node)`` where ``names`` is the list of all queued
        VM names (duplicates kept) and ``by_node`` is a ``defaultdict(list)``
        mapping each hypervisor hostname to the VM names queued for it.
    """
    names = []
    by_node = defaultdict(list)
    while not q.empty():
        node, vm = q.get()
        names.append(vm)
        by_node[node].append(vm)
    return names, by_node
def VMSplitCheck(instancelist, nodedict):
    """Find VM names that appear more than once and the nodes that hold them.

    Args:
        instancelist: list of VM names collected from all hypervisors,
            duplicates included.
        nodedict: mapping of hypervisor hostname to the list of VM names
            found on it.

    Returns:
        defaultdict: maps each VM name occurring at least twice in
        ``instancelist`` to the list of hostnames whose entry in ``nodedict``
        contains that name.  Empty when no name is duplicated.
    """
    # dict.items() exists on both Python 2 and 3, unlike iteritems().
    duplicated = [name for name, seen in Counter(instancelist).items() if seen >= 2]
    split = defaultdict(list)
    for name in duplicated:
        for node, vms in nodedict.items():
            if name in vms:
                split[name].append(node)
    return split
def main():
    """Scan all hypervisors in parallel and report split-brain VMs.

    One ``getVM`` task per hypervisor is submitted to a process pool.  When
    every task has finished, the queued results are collected and checked.
    A line ``found_split_vm:<json>`` is printed only when at least one VM is
    found on more than one hypervisor.
    """
    hypervisors = getHypervisor()
    pool = Pool()
    for node in hypervisors:
        pool.apply_async(getVM, args=(node,))
    pool.close()
    pool.join()
    instances, per_node = getVMList()
    vmsplit = VMSplitCheck(instances, per_node)
    if vmsplit:
        # Text is emitted only when a split-brain VM was found.
        print("found_split_vm" + ":" + json.dumps(vmsplit))


if __name__ == "__main__":
    main()
2 zabbix的item修改
item的Type of information改成text类型
3.zabbix触发器修改
触发器修改为:对item输出内容做关键字过滤,判断其中是否包含found_split_vm字段(即脚本输出的前缀),包含时触发告警。
4 测试情况
############################################此为分割线###############################################
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : VMSplitCheck.py
import re
import os
import sys
import libvirt
from novaclient import client
from multiprocessing import Pool,Queue
from collections import Counter,defaultdict
import pdb
# Queue used to collect results: getVM puts (node, vm_name) tuples on it and
# getVMList drains it.
q=Queue()
# Path of the openrc file that get_nova_creds parses for the OS_* settings.
EnvFile='/root/openrc'
# Load the nova credentials from the openrc file.
def get_nova_creds():
    """Load the OpenStack credentials from the openrc file named by ``EnvFile``.

    Every ``export NAME=value`` line of the file is copied into ``os.environ``
    and the five settings passed to the nova client are returned.

    Returns:
        dict: the keys ``username``, ``api_key``, ``auth_url``, ``project_id``
        and ``region_name``, taken from ``OS_USERNAME``, ``OS_PASSWORD``,
        ``OS_AUTH_URL``, ``OS_TENANT_NAME`` and ``OS_REGION_NAME``.

    Raises:
        SystemExit: with exit code 5 when the file cannot be read or one of
            the required ``OS_*`` variables is missing.
    """
    # Capture name and value separately.  The previous str.strip("export")
    # removed any of the characters e/x/p/o/r/t from both ends (so an unquoted
    # value such as "support" became "su"), and splitting on every "=" cut
    # values that themselves contain "=".
    export_line = re.compile(r'^\s*export\s+([A-Za-z_][A-Za-z0-9_]*)=(.*)$')
    try:
        with open(EnvFile, 'r') as f:
            for line in f:
                match = export_line.match(line.rstrip('\r\n'))
                if not match:
                    continue
                name, value = match.groups()
                value = value.strip()
                # Drop one surrounding pair of matching quotes, single or double.
                if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                    value = value[1:-1]
                os.environ[name] = value
        return {
            'username': os.environ['OS_USERNAME'],
            'api_key': os.environ['OS_PASSWORD'],
            'auth_url': os.environ['OS_AUTH_URL'],
            'project_id': os.environ['OS_TENANT_NAME'],
            'region_name': os.environ['OS_REGION_NAME'],
        }
    except (IOError, OSError, KeyError) as exc:
        # Only the expected failures are handled (was a bare ``except:``); the
        # cause goes to stderr, stdout and the exit code stay as before.
        sys.stderr.write("cannot load credentials from %s: %r\n" % (EnvFile, exc))
        print("error")
        sys.exit(5)
# List the hypervisors through novaclient; in the author's environment every
# hypervisor hostname has the form node-xxx.domain.tld.
def getHypervisor():
    """Return the hostnames of the hypervisors that nova reports.

    Only hostnames containing ``node-<digits>.domain.tld`` are kept, and only
    the matching part of each hostname is returned.

    Returns:
        list: the matched hypervisor hostnames, in the order nova lists them.
    """
    # The pattern used to read 'node-d.domain.tld', which can only match the
    # literal letter "d" and therefore never matched a host like
    # node-4.domain.tld; the digit class and the literal dots are restored.
    pattern = re.compile(r'node-\d+\.domain\.tld')
    nova = client.Client('2', **get_nova_creds())
    hostnames = []
    for hypervisor in nova.hypervisors.list():
        match = pattern.search(hypervisor.hypervisor_hostname)
        if match:
            hostnames.append(match.group())
    return hostnames
# Collect the names of the domains on one hypervisor and put them on the queue.
def getVM(node):
    """Queue the name of every active domain found on one hypervisor.

    Connects to ``qemu+ssh://<node>/system`` and puts a ``(node, vm_name)``
    tuple on the module-level queue ``q`` for each domain ID the hypervisor
    lists.

    Args:
        node: hostname of the hypervisor to inspect.

    Returns:
        None.  A connection failure is reported on stdout and the node is
        skipped.
    """
    try:
        virtcon = libvirt.open("qemu+ssh://%s/system" % node)
    except libvirt.libvirtError:
        # The exception class lives in the libvirt module; the bare name
        # ``libvirtError`` was never imported.  Stop here: there is no
        # connection to query.
        print("wrong to connect")
        return
    try:
        for domain_id in virtcon.listDomainsID():
            vminfo = virtcon.lookupByID(domain_id)
            q.put((node, vminfo.name()))
    finally:
        # Release the connection once the node has been scanned.
        virtcon.close()
# Build the list of all collected VM names, plus a dict keyed by hypervisor
# hostname whose values are the VMs found on that hypervisor.
def getVMList():
    """Drain the result queue ``q`` into a flat list and a per-node mapping.

    Returns:
        tuple: ``(names, by_node)`` where ``names`` is the list of all queued
        VM names (duplicates kept) and ``by_node`` is a ``defaultdict(list)``
        mapping each hypervisor hostname to the VM names queued for it.
    """
    names = []
    by_node = defaultdict(list)
    while not q.empty():
        node, vm = q.get()
        names.append(vm)
        by_node[node].append(vm)
    return names, by_node
# Detect split-brain VMs and, for each one, the hypervisors it is running on.
def VMSplitCheck(instancelist, nodedict):
    """Find VM names that appear more than once and the nodes that hold them.

    Prints ``no split vm`` when no name is duplicated.

    Args:
        instancelist: list of VM names collected from all hypervisors,
            duplicates included.
        nodedict: mapping of hypervisor hostname to the list of VM names
            found on it.

    Returns:
        defaultdict: maps each VM name occurring at least twice in
        ``instancelist`` to the list of hostnames whose entry in ``nodedict``
        contains that name.  Empty when no name is duplicated.
    """
    # dict.items() exists on both Python 2 and 3, unlike iteritems().
    duplicated = [name for name, seen in Counter(instancelist).items() if seen >= 2]
    split = defaultdict(list)
    if not duplicated:
        # NOTE(review): this goes to stdout alongside the count printed by
        # main; confirm that the consumer of the output tolerates it.
        print("no split vm")
        return split
    for name in duplicated:
        for node, vms in nodedict.items():
            if name in vms:
                split[name].append(node)
    return split
# Entry point.
def main():
    """Scan all hypervisors in parallel and print the number of split-brain VMs.

    One ``getVM`` task per hypervisor is submitted to a process pool.  When
    every task has finished, the queued results are collected and checked,
    and the number of VMs found on more than one hypervisor is printed.
    """
    hypervisors = getHypervisor()
    pool = Pool()
    for node in hypervisors:
        pool.apply_async(getVM, args=(node,))
    pool.close()
    pool.join()
    instances, per_node = getVMList()
    vmsplit = VMSplitCheck(instances, per_node)
    print(len(vmsplit))


if __name__ == "__main__":
    main()
笔者测试如下:
如下图,有2台虚拟机同时在node-4和node-6运行
运行脚本后,返回以脑裂虚拟机名字为key,同时运行的hypervisor主机名为value的字典.
2 增加zabbix报警设置
zabbix-agent增加item
zabbix-dashboard操作
测试item能否正常
增加一个触发器
表达式:{node-1.domain.tld:vm.split.status.last(0)}>0
参考:
https://www.ibm.com/developerworks/cn/cloud/library/cl-openstack-pythonapis/
http://blog.csdn.net/gzhouc/article/details/52915822