Skip to content

Commit

Permalink
feat(dbm-services): dbha should report double-check ID close #8009
Browse files Browse the repository at this point in the history
  • Loading branch information
xjxia authored and iSecloud committed Nov 18, 2024
1 parent df56542 commit 5e32db0
Show file tree
Hide file tree
Showing 16 changed files with 129 additions and 72 deletions.
31 changes: 18 additions & 13 deletions dbm-services/common/dbha/ha-module/client/hadb.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ type SwitchQueueRequest struct {

// SwitchQueueResponse switch queue response
type SwitchQueueResponse struct {
RowsAffected int `json:"rowsAffected"`
Uid uint `json:"uid"`
RowsAffected int `json:"rowsAffected"`
Uid int64 `json:"uid"`
}

// HaLogsRequest request ha_logs table
Expand All @@ -91,7 +91,8 @@ type HaLogsRequest struct {

// HaLogsResponse response for ha_logs
type HaLogsResponse struct {
RowsAffected int `json:"rowsAffected"`
RowsAffected int `json:"rowsAffected"`
Uid int64 `json:"uid"`
}

// SwitchLogRequest request switch log
Expand Down Expand Up @@ -238,9 +239,14 @@ func (c *HaDBClient) ReportDBStatus(app, agentIp, ip string, port int, dbType, s
return nil
}

// ReportHaLogRough reports an ha log on a best-effort basis.
//
// It forwards all arguments unchanged to ReportHaLog and drops the returned
// log uid, for callers that need neither the uid nor an error to act on.
// ReportHaLog returns its failures instead of logging them, so the error is
// logged here; otherwise a failed report would go completely unnoticed.
func (c *HaDBClient) ReportHaLogRough(monIP, app, ip string, port int, module, comment string) {
	if _, err := c.ReportHaLog(monIP, app, ip, port, module, comment); err != nil {
		// Best effort: record the failure but never propagate it to the caller.
		log.Logger.Errorf("report ha log failed, ignored. err:%s", err.Error())
	}
}

// ReportHaLog report ha logs
func (c *HaDBClient) ReportHaLog(monIP, app, ip string, port int, module, comment string) {
var result HaLogsRequest
func (c *HaDBClient) ReportHaLog(monIP, app, ip string, port int, module, comment string) (int64, error) {
var result HaLogsResponse
log.Logger.Infof("reporter log. ip:%s, port:%d, module:%s, comment:%s",
ip, port, module, comment)

Expand All @@ -264,18 +270,17 @@ func (c *HaDBClient) ReportHaLog(monIP, app, ip string, port int, module, commen
response, err := c.DoNew(http.MethodPost,
c.SpliceUrlByPrefix(c.Conf.UrlPre, constvar.HaLogsUrl, ""), req, nil)
if err != nil {
log.Logger.Errorf("reporter log failed. err:%s", err.Error())
return
return 0, fmt.Errorf("reporter ha log failed. err:%s", err.Error())
}
if response.Code != 0 {
err = fmt.Errorf("%s failed, return code:%d, msg:%s", util.AtWhere(), response.Code, response.Msg)
log.Logger.Errorf("reporter log failed. err:%s", err.Error())
return
return 0, fmt.Errorf("%s failed, return code:%d, msg:%s", util.AtWhere(), response.Code, response.Msg)
}
err = json.Unmarshal(response.Data, &result)
if err != nil {
log.Logger.Errorf("reporter log failed. err:%s", err.Error())
return 0, fmt.Errorf("reporter ha log failed. err:%s", err.Error())
}

return result.Uid, err
}

// RegisterDBHAInfo register agent info to ha_status table
Expand Down Expand Up @@ -612,7 +617,7 @@ func (c *HaDBClient) UpdateTimeDelay(ip string, port int, app string) error {
}

// InsertSwitchQueue insert pre-switch instance to switch queue
func (c *HaDBClient) InsertSwitchQueue(reqInfo *SwitchQueueRequest) (uint, error) {
func (c *HaDBClient) InsertSwitchQueue(reqInfo *SwitchQueueRequest) (int64, error) {
var result SwitchQueueResponse

log.Logger.Debugf("InsertSwitchQueue param:%#v", util.GraceStructString(reqInfo))
Expand Down Expand Up @@ -685,7 +690,7 @@ func (c *HaDBClient) UpdateSwitchQueue(reqInfo *SwitchQueueRequest) error {
}

// InsertSwitchLog insert switch log to hadb
func (c *HaDBClient) InsertSwitchLog(swId uint, ip string, port int, app, result,
func (c *HaDBClient) InsertSwitchLog(swId int64, ip string, port int, app, result,
comment string, switchFinishTime time.Time) error {
var res SwitchLogResponse
req := SwitchLogRequest{
Expand Down
14 changes: 9 additions & 5 deletions dbm-services/common/dbha/ha-module/constvar/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ const (
SuccessResult = "success"
)

// status in tb_mon_switch_queue(status field)
// status in ha_switch_queue(status field)
const (
SwitchStart = "doing"
SwitchFailed = "failed"
Expand All @@ -414,10 +414,14 @@ const (
SlaveIpKey = "slave_ip"
// SlavePortKey use to set slave port
SlavePortKey = "slave_port"
//BinlogFile consistent switch binlog file
BinlogFile = "binlog_file"
//BinlogPos consistent switch binlog pos
BinlogPos = "binlog_pos"
//NewMasterBinlogFile consistent switch binlog file
NewMasterBinlogFile = "new_master_binlog_file"
//NewMasterBinlogPos consistent switch binlog pos
NewMasterBinlogPos = "new_master_binlog_pos"
//NewMasterHost new master's host
NewMasterHost = "new_master_host"
//NewMasterPort new master's port
NewMasterPort = "new_master_port"
)

// checksum sql
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -681,8 +681,10 @@ func (ins *MySQLCommonSwitch) ResetSlaveExtend(slaveIp string, slavePort int) (s
return "", 0, fmt.Errorf("reset slave failed. err:%s", err.Error())
}
log.Logger.Infof("executed %s on %s:%d successed", resetSql, slaveIp, slavePort)
ins.SetInfo(constvar.BinlogFile, masterStatus.File)
ins.SetInfo(constvar.BinlogPos, masterStatus.Position)
ins.SetInfo(constvar.NewMasterBinlogFile, masterStatus.File)
ins.SetInfo(constvar.NewMasterBinlogPos, masterStatus.Position)
ins.SetInfo(constvar.NewMasterHost, slaveIp)
ins.SetInfo(constvar.NewMasterPort, slavePort)

return masterStatus.File, masterStatus.Position, nil
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ type MongosSwitch struct {
// # switch operation
// # step 1, check if mongos can switch
// # step 2, mark the current inst as can switch in sw_queue
// # step 3, mark the current inst to in_switch status in tb_mon_switch_queue
// # step 3, mark the current inst to in_switch status in ha_switch_queue
// # step 5, delete the instance from that dns, print the instances number before/after switch
// # step 6, update the dns_param table to make the dns change take effect
// # step 7, return
Expand Down
28 changes: 21 additions & 7 deletions dbm-services/common/dbha/ha-module/dbutil/db_switch.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@ type DataBaseSwitch interface {
GetApp() string
GetClusterType() string
GetMetaType() string
GetSwitchUid() uint
GetSwitchUid() int64
GetDoubleCheckId() int64
GetRole() string // proxy没有role
GetCluster() string

SetSwitchUid(uint)
SetSwitchUid(int64)
SetDoubleCheckId(int64)
SetInfo(infoKey string, infoValue interface{})
GetInfo(infoKey string) (bool, interface{})
ReportLogs(result string, comment string) bool
Expand All @@ -46,7 +48,7 @@ type PolarisInfo struct {
BindPort int `json:"bind_port"`
}

// CLBInfo clb detail info, response by cmdb api
// ClbInfo clb detail info, response by cmdb api
type ClbInfo struct {
Region string `json:"clb_region"`
LoadBalanceId string `json:"clb_id"`
Expand Down Expand Up @@ -91,8 +93,10 @@ type BaseSwitch struct {
App string
ClusterType string
//machine type in cmdb api response
MetaType string
SwitchUid uint
MetaType string
//double check id
CheckID int64
SwitchUid int64
//cluster domain
Cluster string
ClusterId int
Expand Down Expand Up @@ -136,15 +140,25 @@ func (ins *BaseSwitch) GetMetaType() string {
}

// GetSwitchUid TODO
func (ins *BaseSwitch) GetSwitchUid() uint {
func (ins *BaseSwitch) GetSwitchUid() int64 {
return ins.SwitchUid
}

// SetSwitchUid TODO
func (ins *BaseSwitch) SetSwitchUid(uid uint) {
func (ins *BaseSwitch) SetSwitchUid(uid int64) {
ins.SwitchUid = uid
}

// GetDoubleCheckId returns the gmm double check id of this switch instance,
// i.e. the current value of ins.CheckID.
func (ins *BaseSwitch) GetDoubleCheckId() int64 {
return ins.CheckID
}

// SetDoubleCheckId records uid as the gmm double check id of this switch
// instance by storing it in ins.CheckID.
func (ins *BaseSwitch) SetDoubleCheckId(uid int64) {
ins.CheckID = uid
}

// GetRole TODO
// override if needed
func (ins *BaseSwitch) GetRole() string {
Expand Down
5 changes: 3 additions & 2 deletions dbm-services/common/dbha/ha-module/gm/gcm.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ func (gcm *GCM) DoSwitchSingle(switchInstance dbutil.DataBaseSwitch) {
return
}

log.Logger.Infof("insert tb_mon_switch_queue. info:{%s}", switchInstance.ShowSwitchInstanceInfo())
log.Logger.Infof("insert ha_switch_queue. info:{%s}", switchInstance.ShowSwitchInstanceInfo())
err = gcm.InsertSwitchQueue(switchInstance)
if err != nil {
log.Logger.Errorf("insert switch queue failed. err:%s, info{%s}", err.Error(),
Expand Down Expand Up @@ -171,7 +171,7 @@ func (gcm *GCM) DoSwitchSingle(switchInstance dbutil.DataBaseSwitch) {
}
}

// InsertSwitchQueue insert switch info to tb_mon_switch_queue
// InsertSwitchQueue insert switch info to ha_switch_queue
func (gcm *GCM) InsertSwitchQueue(instance dbutil.DataBaseSwitch) error {
log.Logger.Debugf("switch instance info:%#v", instance)
ip, port := instance.GetAddress()
Expand All @@ -192,6 +192,7 @@ func (gcm *GCM) InsertSwitchQueue(instance dbutil.DataBaseSwitch) error {
BKCloudID: gcm.Conf.GetCloudId(),
Name: constvar.InsertSwitchQueue,
SetArgs: &model.HASwitchQueue{
CheckID: instance.GetDoubleCheckId(),
IP: ip,
Port: port,
IdcID: instance.GetIdcID(),
Expand Down
2 changes: 2 additions & 0 deletions dbm-services/common/dbha/ha-module/gm/gm.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ type DoubleCheckInstanceInfo struct {
ConfirmTime time.Time
//double check result
ResultInfo string
//gmm double check id
CheckID int64
}

// ModuleReportInfo module info
Expand Down
17 changes: 11 additions & 6 deletions dbm-services/common/dbha/ha-module/gm/gmm.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ func (gmm *GMM) Process(instance DoubleCheckInstanceInfo) {
{ // machine level switch never satisfy this condition, agent only report ssh failed instance.
ip, port := instance.db.GetAddress()
// no switch in machine level switch
gmm.HaDBClient.ReportHaLog(
gmm.HaDBClient.ReportHaLogRough(
gmIP,
instance.db.GetApp(),
ip,
Expand All @@ -86,7 +86,7 @@ func (gmm *GMM) Process(instance DoubleCheckInstanceInfo) {
err := doubleCheckInstance.db.Detection()
switch doubleCheckInstance.db.GetStatus() {
case constvar.DBCheckSuccess:
gmm.HaDBClient.ReportHaLog(
gmm.HaDBClient.ReportHaLogRough(
gmIP,
doubleCheckInstance.db.GetApp(),
ip,
Expand All @@ -97,7 +97,7 @@ func (gmm *GMM) Process(instance DoubleCheckInstanceInfo) {
case constvar.SSHCheckSuccess:
{
// no switch in machine level switch
gmm.HaDBClient.ReportHaLog(
gmm.HaDBClient.ReportHaLogRough(
gmIP,
doubleCheckInstance.db.GetApp(),
ip,
Expand All @@ -108,15 +108,20 @@ func (gmm *GMM) Process(instance DoubleCheckInstanceInfo) {
}
case constvar.SSHCheckFailed, constvar.SSHAuthFailed:
{
content := fmt.Sprintf("double check failed: ssh check failed. sshcheck err:%s", err)
gmm.HaDBClient.ReportHaLog(
content := fmt.Sprintf("double check failed: ssh check failed. sshcheck err:%s", err.Error())
checkId, err := gmm.HaDBClient.ReportHaLog(
gmIP,
doubleCheckInstance.db.GetApp(),
ip,
port,
"gmm",
content,
)
if err != nil {
log.Logger.Errorf(fmt.Sprintf("insert ha logs failed:%s", err.Error()))
return
}
doubleCheckInstance.CheckID = checkId
// ssh auth failed, report event also
if doubleCheckInstance.db.GetStatus() == constvar.SSHAuthFailed {
monitor.MonitorSendDetect(
Expand All @@ -136,7 +141,7 @@ func (gmm *GMM) Process(instance DoubleCheckInstanceInfo) {
{
content := fmt.Sprintf("database authenticate failed, err:%s", err.Error())
log.Logger.Errorf(content)
gmm.HaDBClient.ReportHaLog(
gmm.HaDBClient.ReportHaLogRough(
gmIP,
doubleCheckInstance.db.GetApp(),
ip,
Expand Down
25 changes: 13 additions & 12 deletions dbm-services/common/dbha/ha-module/gm/gqa.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func (gqa *GQA) PreProcess(instance DoubleCheckInstanceInfo) []dbutil.DataBaseSw
if err != nil {
errInfo := fmt.Sprintf("get idc failed. err:%s", err.Error())
log.Logger.Errorf(errInfo)
gqa.HaDBClient.ReportHaLog(gqa.Conf.GMConf.LocalIP, instance.db.GetApp(), ip, port, "gqa", errInfo)
gqa.HaDBClient.ReportHaLogRough(gqa.Conf.GMConf.LocalIP, instance.db.GetApp(), ip, port, "gqa", errInfo)
return nil
}
return cmdbInfos
Expand Down Expand Up @@ -111,9 +111,9 @@ func (gqa *GQA) Process(cmdbInfos []dbutil.DataBaseSwitch) {
if err != nil {
errInfo := fmt.Sprintf("delay switch failed. err:%s", err.Error())
log.Logger.Errorf(errInfo)
gqa.HaDBClient.ReportHaLog(gmIP, instanceInfo.GetApp(), ip, port, "gqa", errInfo)
gqa.HaDBClient.ReportHaLogRough(gmIP, instanceInfo.GetApp(), ip, port, "gqa", errInfo)
} else {
gqa.HaDBClient.ReportHaLog(gmIP, instanceInfo.GetApp(), ip, port, "gqa",
gqa.HaDBClient.ReportHaLogRough(gmIP, instanceInfo.GetApp(), ip, port, "gqa",
"single IDC switch too much, delay switch")
}
continue
Expand All @@ -122,7 +122,7 @@ func (gqa *GQA) Process(cmdbInfos []dbutil.DataBaseSwitch) {

// check status
if instanceInfo.GetStatus() != constvar.RUNNING && instanceInfo.GetStatus() != constvar.AVAILABLE {
gqa.HaDBClient.ReportHaLog(gmIP, instanceInfo.GetApp(), ip, port, "gqa",
gqa.HaDBClient.ReportHaLogRough(gmIP, instanceInfo.GetApp(), ip, port, "gqa",
fmt.Sprintf("status:%s not equal RUNNING or AVAILABLE", instanceInfo.GetStatus()))
continue
}
Expand All @@ -132,11 +132,11 @@ func (gqa *GQA) Process(cmdbInfos []dbutil.DataBaseSwitch) {
if err != nil {
errInfo := fmt.Sprintf("query single total failed. err:%s", err.Error())
log.Logger.Errorf(errInfo)
gqa.HaDBClient.ReportHaLog(gmIP, instanceInfo.GetApp(), ip, port, "gqa", errInfo)
gqa.HaDBClient.ReportHaLogRough(gmIP, instanceInfo.GetApp(), ip, port, "gqa", errInfo)
continue
}
if singleTotal >= gqa.SingleSwitchLimit {
gqa.HaDBClient.ReportHaLog(gmIP, instanceInfo.GetApp(), ip, port, "gqa", "reached single total.")
gqa.HaDBClient.ReportHaLogRough(gmIP, instanceInfo.GetApp(), ip, port, "gqa", "reached single total.")
continue
}

Expand All @@ -145,17 +145,17 @@ func (gqa *GQA) Process(cmdbInfos []dbutil.DataBaseSwitch) {
if err != nil {
errInfo := fmt.Sprintf("query interval total failed. err:%s", err.Error())
log.Logger.Errorf(errInfo)
gqa.HaDBClient.ReportHaLog(gmIP, instanceInfo.GetApp(), ip, port, "gqa", errInfo)
gqa.HaDBClient.ReportHaLogRough(gmIP, instanceInfo.GetApp(), ip, port, "gqa", errInfo)
continue
}
if intervalTotal >= gqa.AllSwitchLimit {
err = gqa.delaySwitch(instanceInfo)
if err != nil {
errInfo := fmt.Sprintf("delay switch failed. err:%s", err.Error())
log.Logger.Errorf(errInfo)
gqa.HaDBClient.ReportHaLog(gmIP, instanceInfo.GetApp(), ip, port, "gqa", errInfo)
gqa.HaDBClient.ReportHaLogRough(gmIP, instanceInfo.GetApp(), ip, port, "gqa", errInfo)
} else {
gqa.HaDBClient.ReportHaLog(gmIP, instanceInfo.GetApp(), ip, port, "gqa",
gqa.HaDBClient.ReportHaLogRough(gmIP, instanceInfo.GetApp(), ip, port, "gqa",
"dbha switch too much, delay switch")
}
continue
Expand All @@ -168,7 +168,7 @@ func (gqa *GQA) Process(cmdbInfos []dbutil.DataBaseSwitch) {
if err != nil {
errInfo := fmt.Sprintf("query single idc failed. err:%s", err.Error())
log.Logger.Errorf(errInfo)
gqa.HaDBClient.ReportHaLog(gmIP, instanceInfo.GetApp(), ip, port, "gqa", errInfo)
gqa.HaDBClient.ReportHaLogRough(gmIP, instanceInfo.GetApp(), ip, port, "gqa", errInfo)
continue
}
if idcTotal >= gqa.SingleSwitchIDCLimit {
Expand All @@ -180,9 +180,9 @@ func (gqa *GQA) Process(cmdbInfos []dbutil.DataBaseSwitch) {
if err != nil {
errInfo := fmt.Sprintf("delay switch failed. err:%s", err.Error())
log.Logger.Errorf(errInfo)
gqa.HaDBClient.ReportHaLog(gmIP, instanceInfo.GetApp(), ip, port, "gqa", errInfo)
gqa.HaDBClient.ReportHaLogRough(gmIP, instanceInfo.GetApp(), ip, port, "gqa", errInfo)
} else {
gqa.HaDBClient.ReportHaLog(gmIP, instanceInfo.GetApp(), ip, port, "gqa",
gqa.HaDBClient.ReportHaLogRough(gmIP, instanceInfo.GetApp(), ip, port, "gqa",
"single IDC switch too much, delay switch")
}
continue
Expand Down Expand Up @@ -230,6 +230,7 @@ func (gqa *GQA) getAllInstanceFromCMDB(
log.Logger.Errorf("need process instances detail:%#v", ret)

for _, sins := range ret {
sins.SetDoubleCheckId(instance.CheckID)
sins.SetInfo(constvar.DoubleCheckInfoKey, instance.ResultInfo)
sins.SetInfo(constvar.DoubleCheckTimeKey, instance.ConfirmTime)
}
Expand Down
Loading

0 comments on commit 5e32db0

Please sign in to comment.