diff --git a/src/backend/ci/core/common/common-api/src/main/kotlin/com/tencent/devops/common/api/exception/UniqueIdException.kt b/src/backend/ci/core/common/common-api/src/main/kotlin/com/tencent/devops/common/api/exception/UniqueIdException.kt index 4f4847b1c5f..c89fa5eaf52 100644 --- a/src/backend/ci/core/common/common-api/src/main/kotlin/com/tencent/devops/common/api/exception/UniqueIdException.kt +++ b/src/backend/ci/core/common/common-api/src/main/kotlin/com/tencent/devops/common/api/exception/UniqueIdException.kt @@ -38,5 +38,4 @@ import com.tencent.devops.common.api.util.UUIDUtil open class UniqueIdException( val msg: String?, val uniqueId: String? = UUIDUtil.generate() -) : - RuntimeException("[uniqueId=$uniqueId]$msg") +) : RuntimeException("[uniqueId=$uniqueId]$msg") diff --git a/src/backend/ci/core/common/common-dispatch-sdk/src/main/kotlin/com/tencent/devops/common/dispatch.sdk/listener/BuildListener.kt b/src/backend/ci/core/common/common-dispatch-sdk/src/main/kotlin/com/tencent/devops/common/dispatch.sdk/listener/BuildListener.kt index a714b7807a1..75774c70ed2 100644 --- a/src/backend/ci/core/common/common-dispatch-sdk/src/main/kotlin/com/tencent/devops/common/dispatch.sdk/listener/BuildListener.kt +++ b/src/backend/ci/core/common/common-dispatch-sdk/src/main/kotlin/com/tencent/devops/common/dispatch.sdk/listener/BuildListener.kt @@ -127,7 +127,7 @@ interface BuildListener { buildId = event.buildId, containerHashId = event.containerHashId, vmSeqId = event.vmSeqId, - message = "${I18nUtil.getCodeLanMessage("$BK_FAILED_START_BUILD_MACHINE")}- ${e.message}", + message = "${I18nUtil.getCodeLanMessage(BK_FAILED_START_BUILD_MACHINE)}- ${e.message}", executeCount = event.executeCount, jobId = event.jobId ) @@ -136,14 +136,14 @@ interface BuildListener { errorMessage = e.formatErrorMessage errorType = e.errorType - onFailure(dispatchService, event, e) + dispatchService.onFailure(event, e) } catch (t: Throwable) { logger.warn("Fail to handle the start up 
message - DispatchService($event)", t) dispatchService.logRed( buildId = event.buildId, containerHashId = event.containerHashId, vmSeqId = event.vmSeqId, - message = "${I18nUtil.getCodeLanMessage("$BK_FAILED_START_BUILD_MACHINE")} - ${t.message}", + message = "${I18nUtil.getCodeLanMessage(BK_FAILED_START_BUILD_MACHINE)} - ${t.message}", executeCount = event.executeCount, jobId = event.jobId ) @@ -152,8 +152,7 @@ interface BuildListener { errorMessage = "Fail to handle the start up message" errorType = ErrorType.SYSTEM - onFailure( - dispatchService = dispatchService, + dispatchService.onFailure( event = event, e = BuildFailureException( errorType = ErrorType.SYSTEM, @@ -361,15 +360,6 @@ interface BuildListener { private fun getClient() = SpringContextUtil.getBean(Client::class.java) - private fun onFailure( - dispatchService: DispatchService, - event: PipelineAgentStartupEvent, - e: BuildFailureException - ) { - dispatchService.onContainerFailure(event, e) - DispatchLogRedisUtils.removeRedisExecuteCount(event.buildId) - } - companion object { private val logger = LoggerFactory.getLogger(BuildListener::class.java) } diff --git a/src/backend/ci/core/common/common-dispatch-sdk/src/main/kotlin/com/tencent/devops/common/dispatch.sdk/service/DispatchService.kt b/src/backend/ci/core/common/common-dispatch-sdk/src/main/kotlin/com/tencent/devops/common/dispatch.sdk/service/DispatchService.kt index 2a5d734fb86..280ea7fa8e3 100644 --- a/src/backend/ci/core/common/common-dispatch-sdk/src/main/kotlin/com/tencent/devops/common/dispatch.sdk/service/DispatchService.kt +++ b/src/backend/ci/core/common/common-dispatch-sdk/src/main/kotlin/com/tencent/devops/common/dispatch.sdk/service/DispatchService.kt @@ -48,6 +48,7 @@ import com.tencent.devops.common.dispatch.sdk.pojo.docker.DockerConstants.ENV_KE import com.tencent.devops.common.dispatch.sdk.pojo.docker.DockerConstants.ENV_KEY_BUILD_ID import com.tencent.devops.common.dispatch.sdk.pojo.docker.DockerConstants.ENV_KEY_PROJECT_ID 
import com.tencent.devops.common.dispatch.sdk.utils.ChannelUtils +import com.tencent.devops.common.dispatch.sdk.utils.DispatchLogRedisUtils import com.tencent.devops.common.event.dispatcher.pipeline.PipelineEventDispatcher import com.tencent.devops.common.event.pojo.pipeline.IPipelineEvent import com.tencent.devops.common.log.utils.BuildLogPrinter @@ -153,10 +154,33 @@ class DispatchService constructor( } fun checkRunning(event: PipelineAgentStartupEvent): Boolean { - val (startBuildTask, buildContainer) = getContainerStartupInfo(event) + return checkRunning( + projectId = event.projectId, + buildId = event.buildId, + containerId = event.containerId, + retryTime = event.retryTime, + executeCount = event.executeCount, + logTag = "$event" + ) + } + + fun checkRunning( + projectId: String, + buildId: String, + containerId: String, + retryTime: Int, + executeCount: Int?, + logTag: String? + ): Boolean { + val (startBuildTask, buildContainer) = getContainerStartupInfo( + projectId = projectId, + buildId = buildId, + containerId = containerId, + logTag = logTag + ) var needStart = true - if (event.executeCount != startBuildTask.executeCount) { + if (executeCount != startBuildTask.executeCount) { // 如果已经重试过或执行次数不匹配则直接丢弃 needStart = false } else if (startBuildTask.status.isFinish() && buildContainer.status.isRunning()) { @@ -167,9 +191,9 @@ class DispatchService constructor( } if (!needStart) { - logger.warn("The build event($event) is not running") + logger.warn("The build event($logTag) is not running") // dispatch主动发起的重试或者用户已取消的流水线忽略异常报错 - if (event.retryTime > 1 || buildContainer.status.isCancel()) { + if (retryTime > 1 || buildContainer.status.isCancel()) { return false } @@ -184,26 +208,71 @@ class DispatchService constructor( return true } - fun onContainerFailure(event: PipelineAgentStartupEvent, e: BuildFailureException) { - logger.warn("[${event.buildId}|${event.vmSeqId}] Container startup failure") + fun onFailure( + event: PipelineAgentStartupEvent, + e: 
BuildFailureException + ) { + onFailure( + projectId = event.projectId, + pipelineId = event.pipelineId, + buildId = event.buildId, + vmSeqId = event.vmSeqId, + e = e, + logTag = "$event" + ) + } + + fun onFailure( + projectId: String, + pipelineId: String, + buildId: String, + vmSeqId: String, + e: BuildFailureException, + logTag: String? + ) { + onContainerFailure( + projectId = projectId, + pipelineId = pipelineId, + buildId = buildId, + vmSeqId = vmSeqId, + e = e, + logTag + ) + DispatchLogRedisUtils.removeRedisExecuteCount(buildId) + } + + private fun onContainerFailure( + projectId: String, + pipelineId: String, + buildId: String, + vmSeqId: String, + e: BuildFailureException, + logTag: String? + ) { + logger.warn("[$buildId|$vmSeqId] Container startup failure") try { - val (startBuildTask, buildContainer) = getContainerStartupInfo(event) + val (startBuildTask, buildContainer) = getContainerStartupInfo( + projectId = projectId, + buildId = buildId, + containerId = vmSeqId, + logTag = logTag + ) if (buildContainer.status.isCancel() || startBuildTask.status.isCancel()) { return } client.get(ServiceBuildResource::class).setVMStatus( - projectId = event.projectId, - pipelineId = event.pipelineId, - buildId = event.buildId, - vmSeqId = event.vmSeqId, + projectId = projectId, + pipelineId = pipelineId, + buildId = buildId, + vmSeqId = vmSeqId, status = BuildStatus.FAILED, errorType = e.errorType, errorCode = e.errorCode, errorMsg = e.formatErrorMessage ) } catch (ignore: ClientException) { - logger.error("SystemErrorLogMonitor|onContainerFailure|${event.buildId}|error=${e.message},${e.errorCode}") + logger.error("SystemErrorLogMonitor|onContainerFailure|$buildId|error=${e.message},${e.errorCode}") } } @@ -250,20 +319,23 @@ class DispatchService constructor( } private fun getContainerStartupInfo( - event: PipelineAgentStartupEvent + projectId: String, + buildId: String, + containerId: String, + logTag: String? 
): Pair { // 判断流水线当前container是否在运行中 val statusResult = client.get(ServicePipelineTaskResource::class).getContainerStartupInfo( - projectId = event.projectId, - buildId = event.buildId, - containerId = event.containerId, - taskId = VMUtils.genStartVMTaskId(event.containerId) + projectId = projectId, + buildId = buildId, + containerId = containerId, + taskId = VMUtils.genStartVMTaskId(containerId) ) val startBuildTask = statusResult.data?.startBuildTask val buildContainer = statusResult.data?.buildContainer if (statusResult.isNotOk() || startBuildTask == null || buildContainer == null) { logger.warn( - "The build event($event) fail to check if pipeline task is running " + + "The build event($logTag) fail to check if pipeline task is running " + "because of statusResult(${statusResult.message})" ) val errorMessage = I18nUtil.getCodeLanMessage(UNABLE_GET_PIPELINE_JOB_STATUS) diff --git a/src/backend/ci/core/common/common-event/src/main/kotlin/com/tencent/devops/common/event/dispatcher/pipeline/mq/MQ.kt b/src/backend/ci/core/common/common-event/src/main/kotlin/com/tencent/devops/common/event/dispatcher/pipeline/mq/MQ.kt index b05be424801..acd41b65bd1 100644 --- a/src/backend/ci/core/common/common-event/src/main/kotlin/com/tencent/devops/common/event/dispatcher/pipeline/mq/MQ.kt +++ b/src/backend/ci/core/common/common-event/src/main/kotlin/com/tencent/devops/common/event/dispatcher/pipeline/mq/MQ.kt @@ -149,6 +149,11 @@ object MQ { const val ROUTE_AGENT_SHUTDOWN = "r.engine.pipeline.agent.shutdown" const val QUEUE_AGENT_SHUTDOWN = "q.engine.pipeline.agent.shutdown" + // 第三方 AGENT 排队消息队列 ==================================== + const val EXCHANGE_THIRD_PARTY_AGENT_QUEUE = "e.dispatch.tp.agent.queue" + const val ROUTE_THIRD_PARTY_AGENT_QUEUE = "r.dispatch.tp.agent.queue" + const val QUEUE_THIRD_PARTY_AGENT_QUEUE = "q.dispatch.tp.agent.queue" + // 无构建环境的Docker构建机启停消息队列 ==================================== const val EXCHANGE_BUILD_LESS_AGENT_LISTENER_DIRECT = 
"e.engine.pipeline.bl.agent" const val ROUTE_BUILD_LESS_AGENT_STARTUP_DISPATCH = "r.engine.pipeline.bl.agent.dispatch.startup" diff --git a/src/backend/ci/core/common/common-pipeline/src/main/kotlin/com/tencent/devops/common/pipeline/type/agent/ThirdPartyAgentDispatch.kt b/src/backend/ci/core/common/common-pipeline/src/main/kotlin/com/tencent/devops/common/pipeline/type/agent/ThirdPartyAgentDispatch.kt index ecf25829ff2..4a693b14016 100644 --- a/src/backend/ci/core/common/common-pipeline/src/main/kotlin/com/tencent/devops/common/pipeline/type/agent/ThirdPartyAgentDispatch.kt +++ b/src/backend/ci/core/common/common-pipeline/src/main/kotlin/com/tencent/devops/common/pipeline/type/agent/ThirdPartyAgentDispatch.kt @@ -9,6 +9,9 @@ import com.tencent.devops.common.pipeline.type.DispatchType abstract class ThirdPartyAgentDispatch( override var value: String, open val agentType: AgentType, + open var workspace: String?, + // 第三方构建机用docker作为构建机 + open val dockerInfo: ThirdPartyAgentDockerInfo?, // 类型为REUSE_JOB时,被复用的job的value,防止同一个stage并发下拿不到agent,启动时填充 open var reusedInfo: ReusedInfo? 
) : DispatchType(value) { @@ -19,6 +22,9 @@ abstract class ThirdPartyAgentDispatch( // 是否在复用锁定链上 fun hasReuseMutex(): Boolean = this.agentType.isReuse() || this.reusedInfo != null + + fun isEnv() = this is ThirdPartyAgentEnvDispatchType + fun isSingle() = this is ThirdPartyAgentIDDispatchType } /** diff --git a/src/backend/ci/core/common/common-pipeline/src/main/kotlin/com/tencent/devops/common/pipeline/type/agent/ThirdPartyAgentEnvDispatchType.kt b/src/backend/ci/core/common/common-pipeline/src/main/kotlin/com/tencent/devops/common/pipeline/type/agent/ThirdPartyAgentEnvDispatchType.kt index ff9f9155eea..7cc444271c4 100644 --- a/src/backend/ci/core/common/common-pipeline/src/main/kotlin/com/tencent/devops/common/pipeline/type/agent/ThirdPartyAgentEnvDispatchType.kt +++ b/src/backend/ci/core/common/common-pipeline/src/main/kotlin/com/tencent/devops/common/pipeline/type/agent/ThirdPartyAgentEnvDispatchType.kt @@ -30,23 +30,21 @@ package com.tencent.devops.common.pipeline.type.agent import com.fasterxml.jackson.annotation.JsonProperty import com.tencent.devops.common.api.util.EnvUtils import com.tencent.devops.common.pipeline.type.BuildType -import io.swagger.v3.oas.annotations.media.Schema data class ThirdPartyAgentEnvDispatchType( @JsonProperty("value") var envName: String, - @get:Schema(title = "共享环境时必填,值为提供共享环境的项目id") + override var workspace: String?, + // 共享环境时必填,值为提供共享环境的项目id var envProjectId: String?, - @get:Schema(title = "工作空间") - var workspace: String?, - @get:Schema(title = "agent类型,默认NAME") override val agentType: AgentType = AgentType.NAME, - // 第三方构建机用docker作为构建机 - val dockerInfo: ThirdPartyAgentDockerInfo?, + override val dockerInfo: ThirdPartyAgentDockerInfo?, override var reusedInfo: ReusedInfo? 
) : ThirdPartyAgentDispatch( value = envName, + workspace = workspace, agentType = agentType, + dockerInfo = dockerInfo, reusedInfo = reusedInfo ) { override fun cleanDataBeforeSave() { diff --git a/src/backend/ci/core/common/common-pipeline/src/main/kotlin/com/tencent/devops/common/pipeline/type/agent/ThirdPartyAgentIDDispatchType.kt b/src/backend/ci/core/common/common-pipeline/src/main/kotlin/com/tencent/devops/common/pipeline/type/agent/ThirdPartyAgentIDDispatchType.kt index 9891f2f3ae2..dd35445f670 100644 --- a/src/backend/ci/core/common/common-pipeline/src/main/kotlin/com/tencent/devops/common/pipeline/type/agent/ThirdPartyAgentIDDispatchType.kt +++ b/src/backend/ci/core/common/common-pipeline/src/main/kotlin/com/tencent/devops/common/pipeline/type/agent/ThirdPartyAgentIDDispatchType.kt @@ -32,15 +32,17 @@ import com.tencent.devops.common.api.util.EnvUtils import com.tencent.devops.common.pipeline.type.BuildType data class ThirdPartyAgentIDDispatchType( - @JsonProperty("value") var displayName: String, - var workspace: String?, + @JsonProperty("value") + var displayName: String, + override var workspace: String?, override val agentType: AgentType = AgentType.NAME, - // 第三方构建机用docker作为构建机 - val dockerInfo: ThirdPartyAgentDockerInfo?, + override val dockerInfo: ThirdPartyAgentDockerInfo?, override var reusedInfo: ReusedInfo? 
) : ThirdPartyAgentDispatch( value = displayName, agentType = agentType, + workspace = workspace, + dockerInfo = dockerInfo, reusedInfo = reusedInfo ) { override fun cleanDataBeforeSave() { diff --git a/src/backend/ci/core/common/common-redis/src/main/kotlin/com/tencent/devops/common/redis/RedisLock.kt b/src/backend/ci/core/common/common-redis/src/main/kotlin/com/tencent/devops/common/redis/RedisLock.kt index dbf86123ec5..7a38eca44bc 100644 --- a/src/backend/ci/core/common/common-redis/src/main/kotlin/com/tencent/devops/common/redis/RedisLock.kt +++ b/src/backend/ci/core/common/common-redis/src/main/kotlin/com/tencent/devops/common/redis/RedisLock.kt @@ -37,9 +37,9 @@ open class RedisLock( private val redisOperation: RedisOperation, private val lockKey: String, private val expiredTimeInSeconds: Long, - private val sleepTime: Long = 100L + private val sleepTime: Long = 100L, + private var lockValue: String = UUID.randomUUID().toString() ) : AutoCloseable { - private val lockValue = UUID.randomUUID().toString() /** * 锁是否已经被占用 @@ -127,6 +127,12 @@ open class RedisLock( private fun getLocalLock(): Any = localLock.get(lockKey)!! 
+ fun getLockValue() = lockValue + + fun setLockValue(lockValue: String) { + this.lockValue = lockValue + } + override fun close() { unlock() } diff --git a/src/backend/ci/core/dispatch/api-dispatch/src/main/kotlin/com/tencent/devops/dispatch/api/OpAgentResource.kt b/src/backend/ci/core/dispatch/api-dispatch/src/main/kotlin/com/tencent/devops/dispatch/api/OpAgentResource.kt new file mode 100644 index 00000000000..945862c1dbc --- /dev/null +++ b/src/backend/ci/core/dispatch/api-dispatch/src/main/kotlin/com/tencent/devops/dispatch/api/OpAgentResource.kt @@ -0,0 +1,28 @@ +package com.tencent.devops.dispatch.api + +import io.swagger.v3.oas.annotations.Operation +import io.swagger.v3.oas.annotations.tags.Tag +import javax.ws.rs.Consumes +import javax.ws.rs.POST +import javax.ws.rs.Path +import javax.ws.rs.Produces +import javax.ws.rs.QueryParam +import javax.ws.rs.core.MediaType + +@Tag(name = "OP_AGENT", description = "agent相关") +@Path("/op/agent") +@Produces(MediaType.APPLICATION_JSON) +@Consumes(MediaType.APPLICATION_JSON) +interface OpAgentResource { + + @Operation(summary = "修改灰度排队功能的项目或者流水线") + @POST + @Path("/update_gray_queue") + fun updateGrayQueue( + @QueryParam("projectId") + projectId: String, + @QueryParam("operate") + operate: String, + pipelineIds: Set? 
+ ) +} \ No newline at end of file diff --git a/src/backend/ci/core/dispatch/api-dispatch/src/main/kotlin/com/tencent/devops/dispatch/constants/Constants.kt b/src/backend/ci/core/dispatch/api-dispatch/src/main/kotlin/com/tencent/devops/dispatch/constants/Constants.kt index b238bdbfd45..4cd2d70aea7 100644 --- a/src/backend/ci/core/dispatch/api-dispatch/src/main/kotlin/com/tencent/devops/dispatch/constants/Constants.kt +++ b/src/backend/ci/core/dispatch/api-dispatch/src/main/kotlin/com/tencent/devops/dispatch/constants/Constants.kt @@ -71,3 +71,9 @@ const val AGENT_REUSE_MUTEX_WAIT_REUSED_ENV = "agentReuseMuteXWaitReusedEnv" const val BK_ENV_NODE_DISABLE = "bkEnvNodeDisable" const val BK_THIRD_JOB_ENV_CURR = "bkThirdJobEnvCurr" // 当前环境下所有构建机并发{0}已经超过配置的{1},排队{2}分钟 const val BK_THIRD_JOB_NODE_CURR = "bkThirdJobNodeCurr" // 当前环境下所有节点运行任务都超过了配置的{0},排队{1}分钟 +// 构建机复用互斥,节点 {0} 已被 {1} 构建使用,剩余可调度空间不足,重新调度 +const val AGENT_REUSE_MUTEX_RESERVE_REDISPATCH = "agentReuseMutexReserveRedispatch" +// 构建环境调度结束,已选取节点 {0} +const val BK_ENV_DISPATCH_AGENT = "bkEnvDispatchAgent" +// 尝试下发任务至节点 {0} +const val TRY_AGENT_DISPATCH = "tryAgentDispatch" \ No newline at end of file diff --git a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/configuration/TPAQueueMqConf.kt b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/configuration/TPAQueueMqConf.kt new file mode 100644 index 00000000000..aa3926d6b2c --- /dev/null +++ b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/configuration/TPAQueueMqConf.kt @@ -0,0 +1,72 @@ +package com.tencent.devops.dispatch.configuration + +import com.tencent.devops.common.event.dispatcher.pipeline.mq.MQ +import com.tencent.devops.common.event.dispatcher.pipeline.mq.MQEventDispatcher +import com.tencent.devops.common.event.dispatcher.pipeline.mq.Tools +import com.tencent.devops.dispatch.listener.TPAQueueListener +import 
org.springframework.amqp.core.Binding +import org.springframework.amqp.core.BindingBuilder +import org.springframework.amqp.core.FanoutExchange +import org.springframework.amqp.core.Queue +import org.springframework.amqp.rabbit.connection.ConnectionFactory +import org.springframework.amqp.rabbit.core.RabbitAdmin +import org.springframework.amqp.rabbit.core.RabbitTemplate +import org.springframework.amqp.rabbit.listener.SimpleMessageListenerContainer +import org.springframework.amqp.rabbit.listener.adapter.MessageListenerAdapter +import org.springframework.amqp.support.converter.Jackson2JsonMessageConverter +import org.springframework.beans.factory.annotation.Autowired +import org.springframework.context.annotation.Bean +import org.springframework.context.annotation.Configuration + +@Configuration +class TPAQueueMqConf { + @Bean + fun rabbitAdmin(connectionFactory: ConnectionFactory): RabbitAdmin { + return RabbitAdmin(connectionFactory) + } + + @Bean + fun eventDispatcher(rabbitTemplate: RabbitTemplate) = MQEventDispatcher(rabbitTemplate) + + @Bean + fun tpAgentQueueExchange(): FanoutExchange { + val fanoutExchange = FanoutExchange(MQ.EXCHANGE_THIRD_PARTY_AGENT_QUEUE, true, false) + fanoutExchange.isDelayed = true + return fanoutExchange + } + + @Bean + fun tpAgentQueueQueue() = Queue(MQ.QUEUE_THIRD_PARTY_AGENT_QUEUE) + + @Bean + fun tpAgentQueueQueueBind( + @Autowired tpAgentQueueQueue: Queue, + @Autowired tpAgentQueueExchange: FanoutExchange + ): Binding { + return BindingBuilder.bind(tpAgentQueueQueue).to(tpAgentQueueExchange) + } + + @Bean + fun requestTriggerContainer( + @Autowired connectionFactory: ConnectionFactory, + @Autowired tpAgentQueueQueue: Queue, + @Autowired rabbitAdmin: RabbitAdmin, + @Autowired tpAgentQueueListener: TPAQueueListener, + @Autowired messageConverter: Jackson2JsonMessageConverter + ): SimpleMessageListenerContainer { + return Tools.createSimpleMessageListenerContainerByAdapter( + connectionFactory = connectionFactory, + queue = 
tpAgentQueueQueue, + rabbitAdmin = rabbitAdmin, + adapter = MessageListenerAdapter( + tpAgentQueueListener, + tpAgentQueueListener::listenTpAgentQueueEvent.name + ).also { it.setMessageConverter(messageConverter) }, + startConsumerMinInterval = 10000, + consecutiveActiveTrigger = 5, + concurrency = 50, + maxConcurrency = 100, + prefetchCount = 1 + ) + } +} \ No newline at end of file diff --git a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/controller/OpAgentResourceImpl.kt b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/controller/OpAgentResourceImpl.kt new file mode 100644 index 00000000000..5368a07ad66 --- /dev/null +++ b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/controller/OpAgentResourceImpl.kt @@ -0,0 +1,37 @@ +package com.tencent.devops.dispatch.controller + +import com.tencent.devops.common.redis.RedisOperation +import com.tencent.devops.common.web.RestResource +import com.tencent.devops.dispatch.api.OpAgentResource +import com.tencent.devops.dispatch.service.ThirdPartyDispatchService +import org.springframework.beans.factory.annotation.Autowired + +@RestResource +class OpAgentResourceImpl @Autowired constructor( + private val redisOperation: RedisOperation +) : OpAgentResource { + override fun updateGrayQueue(projectId: String, operate: String, pipelineIds: Set?) 
{ + val redisKey = ThirdPartyDispatchService.DISPATCH_QUEUE_GRAY_PROJECT_PIPELINE + val value = redisOperation.hget(redisKey, projectId) + if (operate == "ADD") { + if (value == null) { + redisOperation.hset(redisKey, projectId, pipelineIds?.joinToString(";") ?: "") + } else { + val pips = value.split(";").toMutableSet() + pips.addAll(pipelineIds ?: setOf()) + redisOperation.hset(redisKey, projectId, pips.joinToString(";")) + } + return + } + if (operate == "REMOVE") { + if (pipelineIds.isNullOrEmpty() || value == null) { + redisOperation.hdelete(redisKey, projectId) + } else { + val pips = value.split(";").toMutableSet() + pips.removeAll(pipelineIds) + redisOperation.hset(redisKey, projectId, pips.joinToString(";")) + } + return + } + } +} \ No newline at end of file diff --git a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/dao/TPAQueueDao.kt b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/dao/TPAQueueDao.kt new file mode 100644 index 00000000000..5f80e3dd466 --- /dev/null +++ b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/dao/TPAQueueDao.kt @@ -0,0 +1,186 @@ +package com.tencent.devops.dispatch.dao + +import com.fasterxml.jackson.core.type.TypeReference +import com.tencent.devops.common.api.util.JsonUtil +import com.tencent.devops.dispatch.pojo.ThirdPartyAgentDispatchData +import com.tencent.devops.dispatch.pojo.ThirdPartyAgentDispatchDataSqlJson +import com.tencent.devops.dispatch.pojo.ThirdPartyAgentSqlQueueType +import com.tencent.devops.model.dispatch.tables.TDispatchThirdpartyAgentQueue +import com.tencent.devops.model.dispatch.tables.records.TDispatchThirdpartyAgentQueueRecord +import org.jooq.DSLContext +import org.jooq.JSON +import org.jooq.RecordMapper +import org.springframework.stereotype.Repository +import java.time.LocalDateTime + +@Repository +class TPAQueueDao { + fun add( + dslContext: DSLContext, + projectId: String, + 
pipelineId: String, + buildId: String, + vmSeqId: String, + data: String, + dataType: ThirdPartyAgentSqlQueueType, + info: ThirdPartyAgentDispatchDataSqlJson, + retryTime: Int, + createTime: LocalDateTime, + updateTime: LocalDateTime + ) { + with(TDispatchThirdpartyAgentQueue.T_DISPATCH_THIRDPARTY_AGENT_QUEUE) { + dslContext.insertInto( + this, + PROJECT_ID, + PIPELINE_ID, + BUILD_ID, + VM_SEQ_ID, + DATA, + DATA_TYPE, + INFO, + RETRY_TIME, + CREATED_TIME, + UPDATE_TIME + ).values( + projectId, + pipelineId, + buildId, + vmSeqId, + data, + dataType.name, + JSON.json(JsonUtil.toJson(info)), + retryTime, + createTime, + updateTime + ).execute() + } + } + + fun fetchProjectData( + dslContext: DSLContext, + projectId: String, + pipelineId: String, + data: String, + dataType: ThirdPartyAgentSqlQueueType + ): List { + with(TDispatchThirdpartyAgentQueue.T_DISPATCH_THIRDPARTY_AGENT_QUEUE) { + return dslContext.selectFrom(this) + .where(PROJECT_ID.eq(projectId)) + .and(PIPELINE_ID.eq(pipelineId)) + .and(DATA.eq(data)) + .and(DATA_TYPE.eq(dataType.name)) + .orderBy(CREATED_TIME.asc()) + .fetch(queueDataMapper) + } + } + + fun fetchProjectDataCount( + dslContext: DSLContext, + projectId: String, + pipelineId: String, + data: String, + dataType: ThirdPartyAgentSqlQueueType + ): Long { + with(TDispatchThirdpartyAgentQueue.T_DISPATCH_THIRDPARTY_AGENT_QUEUE) { + return dslContext.selectCount().from(this) + .where(PROJECT_ID.eq(projectId)) + .and(PIPELINE_ID.eq(pipelineId)) + .and(DATA.eq(data)) + .and(DATA_TYPE.eq(dataType.name)) + .fetchOne(0, Long::class.java)!! 
+ } + } + + fun addRetryTimeByIds( + dslContext: DSLContext, + recordIds: Set + ) { + with(TDispatchThirdpartyAgentQueue.T_DISPATCH_THIRDPARTY_AGENT_QUEUE) { + dslContext.update(this) + .set(RETRY_TIME, RETRY_TIME.plus(1)) + .set(UPDATE_TIME, LocalDateTime.now()) + .where(ID.`in`(recordIds)) + .execute() + } + } + + fun delete( + dslContext: DSLContext, + id: Long + ) { + with(TDispatchThirdpartyAgentQueue.T_DISPATCH_THIRDPARTY_AGENT_QUEUE) { + dslContext.deleteFrom(this).where(ID.eq(id)).execute() + } + } + + fun deleteByIds( + dslContext: DSLContext, + recordIds: Set + ) { + with(TDispatchThirdpartyAgentQueue.T_DISPATCH_THIRDPARTY_AGENT_QUEUE) { + dslContext.deleteFrom(this).where(ID.`in`(recordIds)).execute() + } + } + + fun fetchTimeByBuild( + dslContext: DSLContext, + buildId: String, + vmSeqId: String? + ): List { + with(TDispatchThirdpartyAgentQueue.T_DISPATCH_THIRDPARTY_AGENT_QUEUE) { + val dsl = dslContext.selectFrom(this).where(BUILD_ID.eq(buildId)) + if (!vmSeqId.isNullOrBlank()) { + dsl.and(VM_SEQ_ID.eq(vmSeqId)) + } + return dsl.fetch(queueDataMapper) + } + } + + fun deleteByBuild( + dslContext: DSLContext, + buildId: String, + vmSeqId: String? + ) { + with(TDispatchThirdpartyAgentQueue.T_DISPATCH_THIRDPARTY_AGENT_QUEUE) { + val dsl = dslContext.deleteFrom(this).where(BUILD_ID.eq(buildId)) + if (!vmSeqId.isNullOrBlank()) { + dsl.and(VM_SEQ_ID.eq(vmSeqId)) + } + dsl.execute() + } + } + + companion object { + val queueDataMapper = ThirdPartyAgentDispatchDataMapper() + } +} + +class ThirdPartyAgentDispatchDataMapper : + RecordMapper { + override fun map(record: TDispatchThirdpartyAgentQueueRecord?): ThirdPartyAgentQueueSqlData? 
{ + return record?.let { + ThirdPartyAgentQueueSqlData( + recordId = it.id, + data = ThirdPartyAgentDispatchData( + projectId = it.projectId, + pipelineId = it.pipelineId, + buildId = it.buildId, + vmSeqId = it.vmSeqId, + infoData = JsonUtil.to( + it.info.data(), + object : TypeReference() {} + ) + ), + createTime = it.createdTime, + retryTime = it.retryTime + ) + } + } +} + +data class ThirdPartyAgentQueueSqlData( + val recordId: Long, + val data: ThirdPartyAgentDispatchData, + val createTime: LocalDateTime, + val retryTime: Int +) diff --git a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/dao/ThirdPartyAgentBuildDao.kt b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/dao/ThirdPartyAgentBuildDao.kt index 8fd737f172e..76d86086b72 100644 --- a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/dao/ThirdPartyAgentBuildDao.kt +++ b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/dao/ThirdPartyAgentBuildDao.kt @@ -248,27 +248,21 @@ class ThirdPartyAgentBuildDao { fun getRunningAndQueueBuilds( dslContext: DSLContext, - agentId: String - ): Result { - with(TDispatchThirdpartyAgentBuild.T_DISPATCH_THIRDPARTY_AGENT_BUILD) { - return dslContext.selectFrom(this.forceIndex("IDX_AGENTID_STATUS_UPDATE")) - .where(AGENT_ID.eq(agentId)) - .and(DOCKER_INFO.isNull) - .and(STATUS.`in`(PipelineTaskStatus.RUNNING.status, PipelineTaskStatus.QUEUE.status)) - .fetch() - } - } - - fun getDockerRunningAndQueueBuilds( - dslContext: DSLContext, - agentId: String - ): Result { + agentId: String, + hasDocker: Boolean + ): List> { with(TDispatchThirdpartyAgentBuild.T_DISPATCH_THIRDPARTY_AGENT_BUILD) { - return dslContext.selectFrom(this.forceIndex("IDX_AGENTID_STATUS_UPDATE")) + val dsl = dslContext.select(BUILD_ID, STATUS) + .from(this.forceIndex("IDX_AGENTID_STATUS_UPDATE")) .where(AGENT_ID.eq(agentId)) - .and(DOCKER_INFO.isNotNull) - 
.and(STATUS.`in`(PipelineTaskStatus.RUNNING.status, PipelineTaskStatus.QUEUE.status)) + if (hasDocker) { + dsl.and(DOCKER_INFO.isNotNull) + } else { + dsl.and(DOCKER_INFO.isNull) + } + return dsl.and(STATUS.`in`(PipelineTaskStatus.RUNNING.status, PipelineTaskStatus.QUEUE.status)) .fetch() + .map { Pair(it[BUILD_ID], it[STATUS]) } } } diff --git a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/exception/DispatchRetryMQException.kt b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/exception/DispatchRetryMQException.kt index 1e5ae7a92be..f61a60c1fe0 100644 --- a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/exception/DispatchRetryMQException.kt +++ b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/exception/DispatchRetryMQException.kt @@ -32,9 +32,8 @@ import com.tencent.devops.common.api.exception.ErrorCodeException class DispatchRetryMQException( errorCodeEnum: ErrorCodeEnum, errorMessage: String? 
-) : - ErrorCodeException( - errorCode = errorCodeEnum.errorCode.toString(), - errorType = errorCodeEnum.errorType, - defaultMessage = errorMessage - ) +) : ErrorCodeException( + errorCode = errorCodeEnum.errorCode.toString(), + errorType = errorCodeEnum.errorType, + defaultMessage = errorMessage +) diff --git a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/listener/TPAQueueListener.kt b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/listener/TPAQueueListener.kt new file mode 100644 index 00000000000..9c0a9da2e0b --- /dev/null +++ b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/listener/TPAQueueListener.kt @@ -0,0 +1,15 @@ +package com.tencent.devops.dispatch.listener + +import com.tencent.devops.dispatch.pojo.TPAQueueEvent +import com.tencent.devops.dispatch.service.tpaqueue.TPAQueueService +import org.springframework.beans.factory.annotation.Autowired +import org.springframework.stereotype.Service + +@Service +class TPAQueueListener @Autowired constructor( + private val tpaQueueService: TPAQueueService +) { + fun listenTpAgentQueueEvent(event: TPAQueueEvent) { + tpaQueueService.doQueue(event) + } +} \ No newline at end of file diff --git a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/listener/ThirdPartyBuildListener.kt b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/listener/ThirdPartyBuildListener.kt index 51dddeb767a..244fc86720d 100644 --- a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/listener/ThirdPartyBuildListener.kt +++ b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/listener/ThirdPartyBuildListener.kt @@ -27,20 +27,22 @@ package com.tencent.devops.dispatch.listener +import com.tencent.devops.common.api.util.timestampmilli import 
com.tencent.devops.common.dispatch.sdk.listener.BuildListener import com.tencent.devops.common.dispatch.sdk.pojo.DispatchMessage import com.tencent.devops.dispatch.exception.DispatchRetryMQException import com.tencent.devops.dispatch.pojo.enums.JobQuotaVmType -import com.tencent.devops.dispatch.service.ThirdPartyAgentService import com.tencent.devops.dispatch.service.ThirdPartyDispatchService +import com.tencent.devops.dispatch.utils.TPACommonUtil import com.tencent.devops.process.pojo.mq.PipelineAgentShutdownEvent import org.springframework.beans.factory.annotation.Autowired import org.springframework.stereotype.Service +import java.time.LocalDateTime @Service class ThirdPartyBuildListener @Autowired constructor( - private val thirdPartyAgentService: ThirdPartyAgentService, - private val thirdPartyDispatchService: ThirdPartyDispatchService + private val thirdPartyDispatchService: ThirdPartyDispatchService, + private val tpaCommonUtil: TPACommonUtil ) : BuildListener { override fun getStartupQueue(): String { @@ -56,6 +58,23 @@ class ThirdPartyBuildListener @Autowired constructor( } override fun onStartup(dispatchMessage: DispatchMessage) { + // 包一层用来计算耗时 + try { + doOnStartup(dispatchMessage) + } catch (e: Throwable) { + // 抓到了肯定是需要结束的异常 + if (dispatchMessage.event.dispatchQueueStartTimeMilliSecond != null) { + tpaCommonUtil.updateQueueTime( + event = dispatchMessage.event, + createTime = dispatchMessage.event.dispatchQueueStartTimeMilliSecond!!, + endTime = LocalDateTime.now().timestampmilli() + ) + } + throw e + } + } + + private fun doOnStartup(dispatchMessage: DispatchMessage) { try { thirdPartyDispatchService.startUp(dispatchMessage) } catch (e: DispatchRetryMQException) { @@ -73,7 +92,7 @@ class ThirdPartyBuildListener @Autowired constructor( } override fun onShutdown(event: PipelineAgentShutdownEvent) { - thirdPartyAgentService.finishBuild(event) + thirdPartyDispatchService.finishBuild(event) } override fun getVmType(): JobQuotaVmType? 
{ diff --git a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/pojo/TPADispatchData.kt b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/pojo/TPADispatchData.kt new file mode 100644 index 00000000000..ed971acc19a --- /dev/null +++ b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/pojo/TPADispatchData.kt @@ -0,0 +1,173 @@ +package com.tencent.devops.dispatch.pojo + +import com.tencent.devops.common.dispatch.sdk.pojo.DispatchMessage +import com.tencent.devops.common.pipeline.type.agent.ThirdPartyAgentDispatch +import com.tencent.devops.common.pipeline.type.agent.ThirdPartyAgentEnvDispatchType +import com.tencent.devops.common.pipeline.type.agent.ThirdPartyAgentIDDispatchType + +data class ThirdPartyAgentDispatchData( + val id: String, + val secretKey: String, + val userId: String, + val projectId: String, + val pipelineId: String, + val pipelineName: String, + val buildId: String, + val buildNo: Int, + val os: String, + val vmSeqId: String, + val taskName: String, + val channelCode: String, + val atoms: Map, + val containerHashId: String?, + val executeCount: Int?, + val jobId: String?, + val queueTimeoutMinutes: Int?, + val dispatchType: ThirdPartyAgentDispatch, + val ignoreEnvAgentIds: Set?, + val singleNodeConcurrency: Int?, + val allNodeConcurrency: Int? +) { + fun isEnv() = dispatchType.isEnv() + // 生成环境资源标识,需要添加共享项目信息 + fun genEnvWithProject(): String? { + if (dispatchType !is ThirdPartyAgentEnvDispatchType) { + return null + } + return dispatchType.envProjectId.takeIf { !it.isNullOrBlank() }?.let { "$it@${dispatchType.envName}" } + ?: dispatchType.envName + } + + fun genEnv(): String? { + if (dispatchType !is ThirdPartyAgentEnvDispatchType) { + return null + } + return dispatchType.envName + } + + fun isSingle() = dispatchType.isSingle() + // 生成Agent资源标识 + fun genAgent(): String? 
{ + if (dispatchType !is ThirdPartyAgentIDDispatchType) { + return null + } + return dispatchType.displayName + } + + // 方便打印日志 + fun toLog(): String { + var msg = "$userId|$projectId|$pipelineId|$buildId|$vmSeqId" + msg += when (dispatchType) { + is ThirdPartyAgentEnvDispatchType -> "|env=${this.genEnvWithProject()}" + is ThirdPartyAgentIDDispatchType -> "|agent=${this.genAgent()}" + else -> "" + } + return msg + } + + constructor( + dispatchMessage: DispatchMessage, + dispatchType: ThirdPartyAgentDispatch + ) : this( + id = dispatchMessage.id, + secretKey = dispatchMessage.secretKey, + userId = dispatchMessage.event.userId, + projectId = dispatchMessage.event.projectId, + pipelineId = dispatchMessage.event.pipelineId, + pipelineName = dispatchMessage.event.pipelineName, + buildId = dispatchMessage.event.buildId, + buildNo = dispatchMessage.event.buildNo, + os = dispatchMessage.event.os, + vmSeqId = dispatchMessage.event.vmSeqId, + taskName = dispatchMessage.event.taskName, + channelCode = dispatchMessage.event.channelCode, + atoms = dispatchMessage.event.atoms, + containerHashId = dispatchMessage.event.containerHashId, + executeCount = dispatchMessage.event.executeCount, + jobId = dispatchMessage.event.jobId, + queueTimeoutMinutes = dispatchMessage.event.queueTimeoutMinutes, + dispatchType = dispatchType, + ignoreEnvAgentIds = dispatchMessage.event.ignoreEnvAgentIds, + singleNodeConcurrency = dispatchMessage.event.singleNodeConcurrency, + allNodeConcurrency = dispatchMessage.event.allNodeConcurrency + ) + + constructor( + projectId: String, + pipelineId: String, + buildId: String, + vmSeqId: String, + infoData: ThirdPartyAgentDispatchDataSqlJson + ) : this( + id = infoData.id, + secretKey = infoData.secretKey, + userId = infoData.userId, + projectId = projectId, + pipelineId = pipelineId, + pipelineName = infoData.pipelineName, + buildId = buildId, + buildNo = infoData.buildNo, + os = infoData.os, + vmSeqId = vmSeqId, + taskName = infoData.taskName, + channelCode 
= infoData.channelCode, + atoms = infoData.atoms, + containerHashId = infoData.containerHashId, + executeCount = infoData.executeCount, + jobId = infoData.jobId, + queueTimeoutMinutes = infoData.queueTimeoutMinutes, + dispatchType = infoData.dispatchType, + ignoreEnvAgentIds = infoData.ignoreEnvAgentIds, + singleNodeConcurrency = infoData.singleNodeConcurrency, + allNodeConcurrency = infoData.allNodeConcurrency + ) + + fun genSqlJsonData(): ThirdPartyAgentDispatchDataSqlJson { + return ThirdPartyAgentDispatchDataSqlJson( + id = this.id, + secretKey = this.secretKey, + userId = this.userId, + pipelineName = this.pipelineName, + buildNo = this.buildNo, + os = this.os, + taskName = this.taskName, + channelCode = this.channelCode, + atoms = this.atoms, + containerHashId = this.containerHashId, + executeCount = this.executeCount, + jobId = this.jobId, + queueTimeoutMinutes = this.queueTimeoutMinutes, + dispatchType = this.dispatchType, + ignoreEnvAgentIds = this.ignoreEnvAgentIds, + singleNodeConcurrency = this.singleNodeConcurrency, + allNodeConcurrency = this.allNodeConcurrency + ) + } +} + +// 保存到数据库中的 json 类型 +data class ThirdPartyAgentDispatchDataSqlJson( + val id: String, + val secretKey: String, + val userId: String, + val pipelineName: String, + val buildNo: Int, + val os: String, + val taskName: String, + val channelCode: String, + val atoms: Map, + val containerHashId: String?, + val executeCount: Int?, + val jobId: String?, + val queueTimeoutMinutes: Int?, + val dispatchType: ThirdPartyAgentDispatch, + val ignoreEnvAgentIds: Set?, + val singleNodeConcurrency: Int?, + val allNodeConcurrency: Int? 
+) + +// 数据库使用的排队类型 +enum class ThirdPartyAgentSqlQueueType { + AGENT, + ENV; +} diff --git a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/pojo/TPAQueueEvent.kt b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/pojo/TPAQueueEvent.kt new file mode 100644 index 00000000000..24a3db5f0ca --- /dev/null +++ b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/pojo/TPAQueueEvent.kt @@ -0,0 +1,113 @@ +package com.tencent.devops.dispatch.pojo + +import com.tencent.devops.common.api.util.timestampmilli +import com.tencent.devops.common.event.annotation.Event +import com.tencent.devops.common.event.dispatcher.pipeline.mq.MQ +import com.tencent.devops.environment.pojo.thirdpartyagent.ThirdPartyAgent +import java.time.LocalDateTime + +@Event(MQ.EXCHANGE_THIRD_PARTY_AGENT_QUEUE, MQ.ROUTE_THIRD_PARTY_AGENT_QUEUE) +data class TPAQueueEvent( + val projectId: String, + val pipelineId: String, + val data: String, + val dataType: ThirdPartyAgentSqlQueueType, + // 发送这个事件的消息,只在第一次发送消息时携带,重放后置空,方便做一些日志输出或者记录 + var sendData: ThirdPartyAgentDispatchData?, + // 消息队列延迟消息时间 + val delayMills: Int, + // 拿到的锁的值,为了保证生产和消费始终只有一个消息,所以需要在生产和消费端共用一把锁 + val lockValue: String +) { + fun toLog() = "${this.projectId}|${this.pipelineId}|${this.data}|${this.dataType}" +} + +data class TPAQueueEventContext( + var context: EnvQueueContext? = null, + var needDeleteRecord: Pair? = null, + val needRetryRecord: MutableSet = mutableSetOf(), + val startTimeMilliSecond: Long = System.currentTimeMillis() +) { + fun setDelete(recordId: Long) { + needDeleteRecord = Pair(recordId, LocalDateTime.now().timestampmilli()) + } + + fun addRetry(recordId: Long) { + needRetryRecord.add(recordId) + } +} + +/** + * 用来存放构建机通用场景的上下文,非线程安全 + * 生命周期更随整个队列 + */ +open class QueueContext( + open var envId: Long? 
+) + +/** + * 用来存放构建机环境队列中所有消息检查共用的上下文,非线程安全。需要保证每个动态获取的校验数据在这一个校验组内都是相同的 + * 生命周期更随整个队列 + * @param envId 环境ID,调度环境类型时生效 + * @param agents 当前可以使用的 agent 列表,可能会随着不同消息经历的操作不同而发生改变 + * @param projectJobRunningAndQueueAllCount 流水线下这个Job上节点运行的所有任务数量,根据jobId区分缓存 + * @param agentsJobRunningAndQueueAllMap 流水线下这个Job上各个节点运行的任务数量,会随着 agent 列表发生改变而改变,根据jobId区分缓存 + * @param agentRunningCnt 每个节点上正在跑的任务数量,会随着 agent 列表发生改变而改变 + * @param dockerRunningCnt 每个节点上正在跑的docker任务数量,会随着 agent 列表发生改变而改变 + * @param hasTryAgents 已经尝试过不行的 agent,每个队列只尝试一次 + */ +data class EnvQueueContext( + override var envId: Long?, + var agents: List, + // AllNodeConcurrencyCheck + private val projectJobRunningAndQueueAllCount: MutableMap = mutableMapOf(), + // SingleNodeConcurrencyCheck + private val agentsJobRunningAndQueueAllMap: MutableMap> = mutableMapOf(), + // PickupAgentCheck + val agentRunningCnt: MutableMap = mutableMapOf(), + val dockerRunningCnt: MutableMap = mutableMapOf(), + val hasTryAgents: MutableSet = mutableSetOf() +) : QueueContext(envId = envId) { + fun projectJobRunningAndQueueAllCount(jobId: String): Long? { + return projectJobRunningAndQueueAllCount[jobId] + } + + fun setProjectJobRunningAndQueueAllCount(jobId: String, cnt: Long?) { + if (cnt == null) { + projectJobRunningAndQueueAllCount.remove(jobId) + return + } + projectJobRunningAndQueueAllCount[jobId] = cnt + } + + fun agentsJobRunningAndQueueAllMap(jobId: String): MutableMap? { + return agentsJobRunningAndQueueAllMap[jobId] + } + + fun setAgentsJobRunningAndQueueAllMap(jobId: String, agentId: String, cnt: Int?) 
{ + if (cnt == null) { + agentsJobRunningAndQueueAllMap[jobId]?.remove(agentId) + return + } + agentsJobRunningAndQueueAllMap[jobId]?.set(agentId, cnt) + } + + fun setAllAgentsJobRunningAndQueueAllMap(jobId: String, map: MutableMap) { + agentsJobRunningAndQueueAllMap[jobId] = map + } +} + +/** + * 队列中单独消息自己的上下文,生命周期跟随每次消息执行 + * @param data 消息详情 + * @param retryTime 重试次数 + * @param buildAgent 这个消息每次选择的尝试去下发任务的 agent + */ +data class QueueDataContext( + val data: ThirdPartyAgentDispatchData, + val retryTime: Int, + // GenAgentBuildCheck + var buildAgent: ThirdPartyAgent? = null +) { + fun retryLog(msg: String?) = " - retry $retryTime: $msg" +} diff --git a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/service/ThirdPartyAgentService.kt b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/service/ThirdPartyAgentService.kt index 71b0632f130..039bdb0595c 100644 --- a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/service/ThirdPartyAgentService.kt +++ b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/service/ThirdPartyAgentService.kt @@ -40,15 +40,16 @@ import com.tencent.devops.common.api.util.HashUtil import com.tencent.devops.common.api.util.JsonUtil import com.tencent.devops.common.api.util.PageUtil import com.tencent.devops.common.api.util.timestamp +import com.tencent.devops.common.api.util.timestampmilli import com.tencent.devops.common.auth.api.AuthResourceType import com.tencent.devops.common.client.Client import com.tencent.devops.common.client.ClientTokenService -import com.tencent.devops.common.dispatch.sdk.pojo.DispatchMessage import com.tencent.devops.common.notify.enums.NotifyType import com.tencent.devops.common.pipeline.type.agent.ThirdPartyAgentDockerInfoDispatch import com.tencent.devops.common.redis.RedisOperation import com.tencent.devops.common.service.utils.HomeHostUtil import 
com.tencent.devops.dispatch.dao.ThirdPartyAgentBuildDao +import com.tencent.devops.dispatch.pojo.ThirdPartyAgentDispatchData import com.tencent.devops.dispatch.pojo.enums.PipelineTaskStatus import com.tencent.devops.dispatch.pojo.thirdpartyagent.AgentBuildInfo import com.tencent.devops.dispatch.pojo.thirdpartyagent.BuildJobType @@ -57,6 +58,7 @@ import com.tencent.devops.dispatch.pojo.thirdpartyagent.ThirdPartyAskResp import com.tencent.devops.dispatch.pojo.thirdpartyagent.ThirdPartyBuildDockerInfo import com.tencent.devops.dispatch.pojo.thirdpartyagent.ThirdPartyBuildInfo import com.tencent.devops.dispatch.pojo.thirdpartyagent.ThirdPartyBuildWithStatus +import com.tencent.devops.dispatch.utils.TPACommonUtil import com.tencent.devops.dispatch.utils.ThirdPartyAgentLock import com.tencent.devops.dispatch.utils.ThirdPartyAgentUtils import com.tencent.devops.dispatch.utils.redis.ThirdPartyAgentBuildRedisUtils @@ -67,13 +69,13 @@ import com.tencent.devops.model.dispatch.tables.records.TDispatchThirdpartyAgent import com.tencent.devops.notify.api.service.ServiceNotifyMessageTemplateResource import com.tencent.devops.notify.pojo.SendNotifyMessageTemplateRequest import com.tencent.devops.process.api.service.ServiceBuildResource -import com.tencent.devops.process.pojo.mq.PipelineAgentShutdownEvent import org.jooq.DSLContext import org.slf4j.LoggerFactory import org.springframework.beans.factory.annotation.Autowired import org.springframework.beans.factory.annotation.Value import org.springframework.dao.DeadlockLoserDataAccessException import org.springframework.stereotype.Service +import java.time.LocalDateTime import java.util.concurrent.CancellationException import java.util.concurrent.CompletableFuture import java.util.concurrent.ExecutionException @@ -91,22 +93,19 @@ class ThirdPartyAgentService @Autowired constructor( private val redisOperation: RedisOperation, private val thirdPartyAgentBuildDao: ThirdPartyAgentBuildDao, private val thirdPartyAgentDockerService: 
ThirdPartyAgentDockerService, - private val tokenService: ClientTokenService + private val tokenService: ClientTokenService, + private val commonUtil: TPACommonUtil ) { @Value("\${thirdagent.workerErrorTemplate:#{null}}") val workerErrorRtxTemplate: String? = null fun queueBuild( agent: ThirdPartyAgent, - thirdPartyAgentWorkspace: String, - dispatchMessage: DispatchMessage, + dispatchData: ThirdPartyAgentDispatchData, retryCount: Int = 0, - dockerInfo: ThirdPartyAgentDockerInfoDispatch?, - envId: Long?, - ignoreEnvAgentIds: Set?, - jobId: String? + envId: Long? ) { - with(dispatchMessage.event) { + with(dispatchData) { try { thirdPartyAgentBuildDao.add( dslContext = dslContext, @@ -115,13 +114,19 @@ class ThirdPartyAgentService @Autowired constructor( pipelineId = pipelineId, buildId = buildId, vmSeqId = vmSeqId, - thirdPartyAgentWorkspace = thirdPartyAgentWorkspace, + thirdPartyAgentWorkspace = dispatchType.workspace ?: "", pipelineName = pipelineName, buildNum = buildNo, taskName = taskName, agentIp = agent.ip, nodeId = HashUtil.decodeIdToLong(agent.nodeId ?: ""), - dockerInfo = dockerInfo, + dockerInfo = dispatchType.dockerInfo?.let { + ThirdPartyAgentDockerInfoDispatch( + agentId = id, + secretKey = secretKey, + info = it + ) + }, executeCount = executeCount, containerHashId = containerHashId, envId = envId, @@ -133,13 +138,9 @@ class ThirdPartyAgentService @Autowired constructor( if (retryCount <= QUEUE_RETRY_COUNT) { queueBuild( agent = agent, - thirdPartyAgentWorkspace = thirdPartyAgentWorkspace, - dispatchMessage = dispatchMessage, + dispatchData = dispatchData, retryCount = retryCount + 1, - dockerInfo = dockerInfo, - envId = envId, - ignoreEnvAgentIds = ignoreEnvAgentIds, - jobId = jobId + envId = envId ) } else { throw OperationException("Fail to add the third party agent build") @@ -159,11 +160,17 @@ class ThirdPartyAgentService @Autowired constructor( } fun getRunningBuilds(agentId: String): Int { - return 
thirdPartyAgentBuildDao.getRunningAndQueueBuilds(dslContext, agentId).size + return thirdPartyAgentBuildDao.getRunningAndQueueBuilds(dslContext, agentId, false).size } fun getDockerRunningBuilds(agentId: String): Int { - return thirdPartyAgentBuildDao.getDockerRunningAndQueueBuilds(dslContext, agentId).size + return thirdPartyAgentBuildDao.getRunningAndQueueBuilds(dslContext, agentId, true).size + } + + fun checkRunningAndSize(agentId: String, buildId: String, docker: Boolean): Pair { + val records = thirdPartyAgentBuildDao.getRunningAndQueueBuilds(dslContext, agentId, docker) + val hasRun = records.any { it.first == buildId && it.second == PipelineTaskStatus.RUNNING.status } + return Pair(hasRun, records.size) } fun startBuild( @@ -198,7 +205,7 @@ class ThirdPartyAgentService @Autowired constructor( if (agentResult.data!!.secretKey != secretKey) { logger.warn( "The secretKey($secretKey) is not match the expect one(${agentResult.data!!.secretKey} " + - "of project($projectId) and agent($agentId)" + "of project($projectId) and agent($agentId)" ) throw NotFoundException("Fail to get the agent") } @@ -231,7 +238,7 @@ class ThirdPartyAgentService @Autowired constructor( } catch (e: RemoteServiceException) { logger.warn( "notify agent task[$build.projectId|${build.buildId}|${build.vmSeqId}|$agentId]" + - " claim failed, cause: ${e.message} agent project($projectId)" + " claim failed, cause: ${e.message} agent project($projectId)" ) } @@ -249,9 +256,9 @@ class ThirdPartyAgentService @Autowired constructor( // 只有凭据ID的参与计算 if (dockerInfo != null) { if (( - dockerInfo.credential?.user.isNullOrBlank() && - dockerInfo.credential?.password.isNullOrBlank() - ) && + dockerInfo.credential?.user.isNullOrBlank() && + dockerInfo.credential?.password.isNullOrBlank() + ) && !(dockerInfo.credential?.credentialId.isNullOrBlank()) ) { val (userName, password) = try { @@ -379,21 +386,39 @@ class ThirdPartyAgentService @Autowired constructor( } } - fun finishBuild(event: 
PipelineAgentShutdownEvent) { - val buildId = event.buildId - val vmSeqId = event.vmSeqId - val success = event.buildResult + fun finishBuild(buildId: String, vmSeqId: String?, buildResult: Boolean) { + val now = LocalDateTime.now().timestampmilli() if (vmSeqId.isNullOrBlank()) { val records = thirdPartyAgentBuildDao.list(dslContext, buildId) if (records.isEmpty()) { return } - records.forEach { - finishBuild(it, success) + records.forEach { record -> + // 取消时兜底结束时间 + commonUtil.updateQueueTime( + projectId = record.projectId, + pipelineId = record.pipelineId, + buildId = record.buildId, + vmSeqId = record.vmSeqId, + executeCount = record.executeCount, + createTime = null, + endTime = now + ) + finishBuild(record, buildResult) } } else { val record = thirdPartyAgentBuildDao.get(dslContext, buildId, vmSeqId) ?: return - finishBuild(record, success) + // 取消时兜底结束时间 + commonUtil.updateQueueTime( + projectId = record.projectId, + pipelineId = record.pipelineId, + buildId = record.buildId, + vmSeqId = record.vmSeqId, + executeCount = record.executeCount, + createTime = null, + endTime = now + ) + finishBuild(record, buildResult) } } @@ -445,7 +470,7 @@ class ThirdPartyAgentService @Autowired constructor( private fun finishBuild(record: TDispatchThirdpartyAgentBuildRecord, success: Boolean) { logger.info( "Finish the third party agent(${record.agentId}) build(${record.buildId}) " + - "of seq(${record.vmSeqId}) and status(${record.status})" + "of seq(${record.vmSeqId}) and status(${record.status})" ) val agentResult = client.get(ServiceThirdPartyAgentResource::class) .getAgentByIdGlobal(record.projectId, record.agentId) @@ -489,9 +514,9 @@ class ThirdPartyAgentService @Autowired constructor( // 有些并发情况可能会导致在finish时AgentBuild状态没有被置为Done在这里改一下 val buildRecord = thirdPartyAgentBuildDao.get(dslContext, buildInfo.buildId, buildInfo.vmSeqId) if (buildRecord != null && ( - buildRecord.status != PipelineTaskStatus.DONE.status || - buildRecord.status != 
PipelineTaskStatus.FAILURE.status - ) + buildRecord.status != PipelineTaskStatus.DONE.status || + buildRecord.status != PipelineTaskStatus.FAILURE.status + ) ) { thirdPartyAgentBuildDao.updateStatus( dslContext = dslContext, @@ -797,7 +822,8 @@ class ThirdPartyAgentService @Autowired constructor( "nodeDetail/$nodeHashId" ) ) - ) }.onFailure { + ) + }.onFailure { logger.warn("agentStartup|sendNotifyMessageByTemplate|$projectId|$agentId") } } diff --git a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/service/ThirdPartyDispatchService.kt b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/service/ThirdPartyDispatchService.kt index 5c44857c473..1f9957c75be 100644 --- a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/service/ThirdPartyDispatchService.kt +++ b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/service/ThirdPartyDispatchService.kt @@ -31,6 +31,7 @@ import com.tencent.devops.common.api.enums.AgentStatus import com.tencent.devops.common.api.exception.InvalidParamException import com.tencent.devops.common.api.exception.RemoteServiceException import com.tencent.devops.common.api.util.HashUtil +import com.tencent.devops.common.api.util.timestampmilli import com.tencent.devops.common.client.Client import com.tencent.devops.common.dispatch.sdk.BuildFailureException import com.tencent.devops.common.dispatch.sdk.pojo.DispatchMessage @@ -38,17 +39,12 @@ import com.tencent.devops.common.log.utils.BuildLogPrinter import com.tencent.devops.common.pipeline.container.AgentReuseMutex import com.tencent.devops.common.pipeline.enums.VMBaseOS import com.tencent.devops.common.pipeline.type.agent.AgentType -import com.tencent.devops.common.pipeline.type.agent.ThirdPartyAgentDockerInfo -import com.tencent.devops.common.pipeline.type.agent.ThirdPartyAgentDockerInfoDispatch +import 
com.tencent.devops.common.pipeline.type.agent.ThirdPartyAgentDispatch import com.tencent.devops.common.pipeline.type.agent.ThirdPartyAgentEnvDispatchType import com.tencent.devops.common.pipeline.type.agent.ThirdPartyAgentIDDispatchType import com.tencent.devops.common.pipeline.type.agent.ThirdPartyDevCloudDispatchType -import com.tencent.devops.common.redis.RedisLockByValue import com.tencent.devops.common.redis.RedisOperation -import com.tencent.devops.common.service.config.CommonConfig -import com.tencent.devops.common.service.utils.HomeHostUtil import com.tencent.devops.common.web.utils.I18nUtil -import com.tencent.devops.dispatch.constants.AGENT_REUSE_MUTEX_REDISPATCH import com.tencent.devops.dispatch.constants.AGENT_REUSE_MUTEX_WAIT_REUSED_ENV import com.tencent.devops.dispatch.constants.BK_AGENT_IS_BUSY import com.tencent.devops.dispatch.constants.BK_ENV_BUSY @@ -57,7 +53,6 @@ import com.tencent.devops.dispatch.constants.BK_ENV_WORKER_ERROR_IGNORE import com.tencent.devops.dispatch.constants.BK_MAX_BUILD_SEARCHING_AGENT import com.tencent.devops.dispatch.constants.BK_NO_AGENT_AVAILABLE import com.tencent.devops.dispatch.constants.BK_QUEUE_TIMEOUT_MINUTES -import com.tencent.devops.dispatch.constants.BK_SCHEDULING_SELECTED_AGENT import com.tencent.devops.dispatch.constants.BK_SEARCHING_AGENT import com.tencent.devops.dispatch.constants.BK_SEARCHING_AGENT_MOST_IDLE import com.tencent.devops.dispatch.constants.BK_SEARCHING_AGENT_PARALLEL_AVAILABLE @@ -65,22 +60,22 @@ import com.tencent.devops.dispatch.constants.BK_THIRD_JOB_ENV_CURR import com.tencent.devops.dispatch.constants.BK_THIRD_JOB_NODE_CURR import com.tencent.devops.dispatch.exception.DispatchRetryMQException import com.tencent.devops.dispatch.exception.ErrorCodeEnum +import com.tencent.devops.dispatch.pojo.ThirdPartyAgentDispatchData +import com.tencent.devops.dispatch.service.tpaqueue.TPAQueueService +import com.tencent.devops.dispatch.service.tpaqueue.TPASingleQueueService +import 
com.tencent.devops.dispatch.utils.TPACommonUtil import com.tencent.devops.dispatch.utils.ThirdPartyAgentEnvLock -import com.tencent.devops.dispatch.utils.ThirdPartyAgentLock -import com.tencent.devops.dispatch.utils.redis.ThirdPartyAgentBuildRedisUtils -import com.tencent.devops.dispatch.utils.redis.ThirdPartyRedisBuild import com.tencent.devops.environment.api.thirdpartyagent.ServiceThirdPartyAgentResource import com.tencent.devops.environment.pojo.thirdpartyagent.EnvNodeAgent import com.tencent.devops.environment.pojo.thirdpartyagent.ThirdPartyAgent -import com.tencent.devops.process.api.service.ServiceBuildResource import com.tencent.devops.process.api.service.ServiceVarResource import com.tencent.devops.process.engine.common.VMUtils -import com.tencent.devops.process.pojo.SetContextVarData -import com.tencent.devops.process.pojo.VmInfo +import com.tencent.devops.process.pojo.mq.PipelineAgentShutdownEvent import com.tencent.devops.process.pojo.mq.PipelineAgentStartupEvent import org.slf4j.LoggerFactory import org.springframework.beans.factory.annotation.Autowired import org.springframework.stereotype.Service +import java.time.LocalDateTime import javax.ws.rs.core.Response @Service @@ -89,27 +84,41 @@ class ThirdPartyDispatchService @Autowired constructor( private val client: Client, private val redisOperation: RedisOperation, private val buildLogPrinter: BuildLogPrinter, - private val commonConfig: CommonConfig, - private val thirdPartyAgentBuildRedisUtils: ThirdPartyAgentBuildRedisUtils, - private val thirdPartyAgentBuildService: ThirdPartyAgentService + private val commonUtil: TPACommonUtil, + private val thirdPartyAgentBuildService: ThirdPartyAgentService, + private val tpaQueueService: TPAQueueService, + private val tpaSingleQueueService: TPASingleQueueService ) { fun canDispatch(event: PipelineAgentStartupEvent) = event.dispatchType is ThirdPartyAgentIDDispatchType || event.dispatchType is ThirdPartyAgentEnvDispatchType || event.dispatchType is 
ThirdPartyDevCloudDispatchType + // 按 Redis 灰度使用新排队逻辑的项目或者流水线 + // project to pipeline1;pipeline2;..... + private fun useNewQueue(projectId: String, pipelineId: String): Boolean { + val v = redisOperation.hget( + DISPATCH_QUEUE_GRAY_PROJECT_PIPELINE, projectId + ) ?: return false + if (v.isBlank()) { + return true + } + return v.split(";").toSet().contains(pipelineId) + } + fun startUp(dispatchMessage: DispatchMessage) { when (dispatchMessage.event.dispatchType) { is ThirdPartyAgentIDDispatchType -> { val dispatchType = dispatchMessage.event.dispatchType as ThirdPartyAgentIDDispatchType + // 没有复用逻辑的直接调度 if (!dispatchType.agentType.isReuse()) { buildByAgentId(dispatchMessage, dispatchType) return } - // 只要是复用就先拿一下上下文,可能存在同stage但又先后的情况 + // 只要是复用就先拿一下上下文,可能存在同stage但被复用的已经跑完了 val agentId = dispatchMessage.getAgentReuseContextVar(dispatchType.displayName) - // 是复用,但是和被复用对象在同一stage且先后顺序未知 + // 是复用,但是和被复用对象在同一stage且先后顺序未知,且被复用对象还没有跑完,这里拿复用对象的资源调度 if (dispatchType.reusedInfo != null && agentId.isNullOrBlank()) { dispatchType.displayName = dispatchType.reusedInfo!!.value buildByAgentId(dispatchMessage, dispatchType) @@ -127,13 +136,25 @@ class ThirdPartyDispatchService @Autowired constructor( ) } - buildByAgentId(dispatchMessage, dispatchType.copy(displayName = agentId)) + // 到了这里就剩两种 + // 1、绝对复用有先后区别 + // 2、先后顺序未知,但是客观上被复用对象先跑完了,就按照绝对复用处理 + buildByAgentId( + dispatchMessage, + dispatchType.copy(displayName = agentId, agentType = AgentType.REUSE_JOB_ID, reusedInfo = null) + ) } is ThirdPartyAgentEnvDispatchType -> { val dispatchType = dispatchMessage.event.dispatchType as ThirdPartyAgentEnvDispatchType if (!dispatchType.agentType.isReuse()) { - buildByEnvId(dispatchMessage, dispatchType) + if (useNewQueue(dispatchMessage.event.projectId, dispatchMessage.event.pipelineId)) { + tpaQueueService.queue( + ThirdPartyAgentDispatchData(dispatchMessage, dispatchType) + ) + } else { + buildByEnvId(dispatchMessage, dispatchType) + } return } // 只要是复用就先拿一下上下文,可能存在同stage但又先后的情况 @@ 
-179,7 +200,7 @@ class ThirdPartyDispatchService @Autowired constructor( workspace = dispatchType.workspace, agentType = AgentType.REUSE_JOB_ID, dockerInfo = dispatchType.dockerInfo, - reusedInfo = dispatchType.reusedInfo + reusedInfo = null ) ) return @@ -209,6 +230,7 @@ class ThirdPartyDispatchService @Autowired constructor( dispatchMessage: DispatchMessage, dispatchType: ThirdPartyAgentIDDispatchType ) { + dispatchMessage.event.dispatchQueueStartTimeMilliSecond = LocalDateTime.now().timestampmilli() val agentResult = if (dispatchType.idType()) { client.get(ServiceThirdPartyAgentResource::class) .getAgentById(dispatchMessage.event.projectId, dispatchType.displayName) @@ -247,17 +269,7 @@ class ThirdPartyDispatchService @Autowired constructor( ) } - if (!doAgentInQueue( - dispatchMessage = dispatchMessage, - agent = agentResult.data!!, - workspace = dispatchType.workspace, - dockerInfo = dispatchType.dockerInfo, - envId = null, - ignoreEnvAgentIds = null, - hasReuseMutex = dispatchType.hasReuseMutex(), - jobId = dispatchMessage.event.jobId - ) - ) { + if (!agentInQueue(dispatchMessage, dispatchType, agent = agentResult.data!!, envId = null)) { logDebug( dispatchMessage.event, I18nUtil.getCodeLanMessage( @@ -273,264 +285,34 @@ class ThirdPartyDispatchService @Autowired constructor( ) ) } - } - private fun doAgentInQueue( - dispatchMessage: DispatchMessage, - agent: ThirdPartyAgent, - workspace: String?, - dockerInfo: ThirdPartyAgentDockerInfo?, - envId: Long?, - ignoreEnvAgentIds: Set?, - hasReuseMutex: Boolean, - jobId: String? 
- ): Boolean { - val event = dispatchMessage.event - val redisLock = ThirdPartyAgentLock(redisOperation, event.projectId, agent.agentId) - try { - if (redisLock.tryLock()) { - if (thirdPartyAgentBuildRedisUtils.isThirdPartyAgentUpgrading( - projectId = event.projectId, - agentId = agent.agentId - ) - ) { - logger.warn("The agent(${agent.agentId}) of project(${event.projectId}) is upgrading") - log( - dispatchMessage.event, - ErrorCodeEnum.BUILD_MACHINE_UPGRADE_IN_PROGRESS.getErrorMessage( - language = I18nUtil.getDefaultLocaleLanguage() - ) + " - ${agent.hostname}/${agent.ip}" - ) - return false - } - - // #10082 对于复用的机器和被复用的,需要加锁校验看看这台机器能不能使用 - val lockKey = AgentReuseMutex.genAgentReuseMutexLockKey(event.projectId, agent.agentId) - if (hasReuseMutex) { - val lock = RedisLockByValue( - redisOperation = redisOperation, - lockKey = lockKey, - lockValue = event.buildId, - expiredTimeInSeconds = AgentReuseMutex.AGENT_LOCK_TIMEOUT - ) - // 没有拿到锁说明现在这台机被复用互斥占用不能选 - if (!lock.tryLock()) { - logAgentReuse(lockKey, dispatchMessage, agent) - return false - } - try { - // # 10082 设置复用需要的关键字 jobs..container.agent_id,jobId需要为根节点id - // 只用给env类型的根节点设置,因为id类型的在引擎 AgentReuseMutexCmd 直接写入了 - val dispatch = dispatchMessage.event.dispatchType - if ((dispatch is ThirdPartyAgentEnvDispatchType) && - dispatch.reusedInfo != null && - dispatch.reusedInfo!!.jobId == null - ) { - client.get(ServiceVarResource::class).setContextVar( - SetContextVarData( - projectId = dispatchMessage.event.projectId, - pipelineId = dispatchMessage.event.pipelineId, - buildId = dispatchMessage.event.buildId, - contextName = AgentReuseMutex.genAgentContextKey(dispatchMessage.event.jobId!!), - contextVal = agent.agentId, - readOnly = true, - rewriteReadOnly = true - ) - ) - } - } catch (e: Exception) { - logger.error("inQueue|doAgentInQueue|error", e) - } - } else { - val lockedBuildId = redisOperation.get(lockKey) - if (!lockedBuildId.isNullOrBlank() && lockedBuildId != event.buildId) { - // 
没有复用逻辑的需要检查下如果这个机器剩一个可调度空间且有复用锁那么不能进行调度 - val checkRes = if (dockerInfo != null) { - ((agent.dockerParallelTaskCount ?: 4) - - thirdPartyAgentBuildService.getDockerRunningBuilds(agent.agentId)) <= 1 - } else { - ((agent.parallelTaskCount ?: 4) - - thirdPartyAgentBuildService.getRunningBuilds(agent.agentId)) <= 1 - } - if (checkRes) { - logAgentReuse(lockKey, dispatchMessage, agent) - return false - } - } - } - - // #5806 入库失败就不再写Redis - inQueue( - agent = agent, - dispatchMessage = dispatchMessage, - agentId = agent.agentId, - workspace = workspace, - dockerInfo = if (dockerInfo == null) { - null - } else { - ThirdPartyAgentDockerInfoDispatch( - agentId = dispatchMessage.id, - secretKey = dispatchMessage.secretKey, - info = dockerInfo - ) - }, - envId = envId, - ignoreEnvAgentIds = ignoreEnvAgentIds, - jobId = jobId - ) - - // 保存构建详情 - saveAgentInfoToBuildDetail(dispatchMessage = dispatchMessage, agent = agent) - - logger.info( - "${event.buildId}|START_AGENT_BY_ID|j(${event.vmSeqId})|agent=${agent.agentId}" - ) - log( - dispatchMessage.event, - I18nUtil.getCodeLanMessage( - messageCode = BK_SCHEDULING_SELECTED_AGENT, - params = arrayOf(agent.hostname, agent.ip), - language = I18nUtil.getDefaultLocaleLanguage() - ) - ) - return true - } else { - logWarn( - dispatchMessage.event, - ErrorCodeEnum.BUILD_MACHINE_BUSY.getErrorMessage( - language = I18nUtil.getDefaultLocaleLanguage() - ) + "(Agent is busy) - ${agent.hostname}/${agent.ip}" - ) - return false - } - } finally { - redisLock.unlock() - } - } - - private fun logAgentReuse( - lockKey: String, - dispatchMessage: DispatchMessage, - agent: ThirdPartyAgent - ) { - val lockedBuildId = redisOperation.get(lockKey) - if (lockedBuildId.isNullOrBlank()) { - log( - dispatchMessage.event, - I18nUtil.getCodeLanMessage( - messageCode = AGENT_REUSE_MUTEX_REDISPATCH, - language = I18nUtil.getDefaultLocaleLanguage(), - params = arrayOf( - "${agent.agentId}|${agent.hostname}/${agent.ip}", lockedBuildId ?: "" - ) - ) - ) - return 
- } - val msg = redisOperation.get(AgentReuseMutex.genAgentReuseMutexLinkTipKey(lockedBuildId))?.let { s -> - val endIndex = s.indexOf("_") - val pipelineId = s.substring(0, endIndex) - val linkTip = s.substring(endIndex + 1) - val link = "${ - HomeHostUtil.getHost( - commonConfig.devopsHostGateway!! - ) - }/console/pipeline/${dispatchMessage.event.projectId}/$pipelineId/detail/$lockedBuildId" - if (lockedBuildId != dispatchMessage.event.buildId) { - "$linkTip$lockedBuildId" - } else { - linkTip - } - } ?: "" - logWarn( - dispatchMessage.event, - I18nUtil.getCodeLanMessage( - messageCode = AGENT_REUSE_MUTEX_REDISPATCH, - language = I18nUtil.getDefaultLocaleLanguage(), - params = arrayOf( - "${agent.agentId}|${agent.hostname}/${agent.ip}", lockedBuildId ?: "" - ) - ) + msg + // 错误结束的在最外边有处理了,这里只管正常逻辑的 + commonUtil.updateQueueTime( + event = dispatchMessage.event, + createTime = dispatchMessage.event.dispatchQueueStartTimeMilliSecond ?: return, + endTime = LocalDateTime.now().timestampmilli() ) } - private fun inQueue( - agent: ThirdPartyAgent, + private fun agentInQueue( dispatchMessage: DispatchMessage, - agentId: String, - workspace: String?, - dockerInfo: ThirdPartyAgentDockerInfoDispatch?, - envId: Long?, - ignoreEnvAgentIds: Set?, - jobId: String? - ) { - thirdPartyAgentBuildService.queueBuild( + dispatchType: ThirdPartyAgentDispatch, + agent: ThirdPartyAgent, + envId: Long? 
+ ): Boolean { + return tpaSingleQueueService.doAgentInQueue( + data = ThirdPartyAgentDispatchData( + dispatchMessage = dispatchMessage, + dispatchType = dispatchType + ), agent = agent, - thirdPartyAgentWorkspace = workspace ?: "", - dispatchMessage = dispatchMessage, - retryCount = 0, - dockerInfo = dockerInfo, - envId = envId, - ignoreEnvAgentIds = ignoreEnvAgentIds, - jobId = jobId - ) - - thirdPartyAgentBuildRedisUtils.setThirdPartyBuild( - agent.secretKey, - ThirdPartyRedisBuild( - projectId = dispatchMessage.event.projectId, - pipelineId = dispatchMessage.event.pipelineId, - buildId = dispatchMessage.event.buildId, - agentId = agentId, - vmSeqId = dispatchMessage.event.vmSeqId, - vmName = agent.hostname, - channelCode = dispatchMessage.event.channelCode, - atoms = dispatchMessage.event.atoms - ) - ) - - // 添加上下文关键字 jobs..container.node_alias - if (dispatchMessage.event.jobId.isNullOrBlank()) { - return - } - try { - val detail = client.get(ServiceThirdPartyAgentResource::class).getAgentDetail( - userId = dispatchMessage.event.userId, - projectId = dispatchMessage.event.projectId, - agentHashId = agentId - ).data - if (detail == null) { - logger.warn("inQueue|setContextVar|getAgentDetail $agentId is null") - return - } - client.get(ServiceVarResource::class).setContextVar( - SetContextVarData( - projectId = dispatchMessage.event.projectId, - pipelineId = dispatchMessage.event.pipelineId, - buildId = dispatchMessage.event.buildId, - contextName = "jobs.${dispatchMessage.event.jobId}.container.node_alias", - contextVal = detail.displayName, - readOnly = true, - rewriteReadOnly = true - ) - ) - } catch (e: Exception) { - logger.error("inQueue|setContextVar|error", e) - } - } - - private fun saveAgentInfoToBuildDetail(dispatchMessage: DispatchMessage, agent: ThirdPartyAgent) { - client.get(ServiceBuildResource::class).saveBuildVmInfo( - projectId = dispatchMessage.event.projectId, - pipelineId = dispatchMessage.event.pipelineId, - buildId = 
dispatchMessage.event.buildId, - vmSeqId = dispatchMessage.event.vmSeqId, - vmInfo = VmInfo(ip = agent.ip, name = agent.ip) + envId = envId ) } @Suppress("ComplexMethod", "LongMethod", "NestedBlockDepth", "MagicNumber") private fun buildByEnvId(dispatchMessage: DispatchMessage, dispatchType: ThirdPartyAgentEnvDispatchType) { + dispatchMessage.event.dispatchQueueStartTimeMilliSecond = LocalDateTime.now().timestampmilli() val agentsResult = try { if (dispatchType.idType()) { client.get(ServiceThirdPartyAgentResource::class) @@ -682,14 +464,12 @@ class ThirdPartyDispatchService @Autowired constructor( var jobEnvActiveAgents = activeAgents if (!dispatchMessage.event.ignoreEnvAgentIds.isNullOrEmpty()) { - logWarn( - dispatchMessage.event, - I18nUtil.getCodeLanMessage( - messageCode = BK_ENV_WORKER_ERROR_IGNORE, - params = arrayOf(dispatchMessage.event.ignoreEnvAgentIds!!.joinToString(",")), - language = I18nUtil.getDefaultLocaleLanguage() - ) - ) + val data = ThirdPartyAgentDispatchData(dispatchMessage, dispatchType) + val agentMap = activeAgents.associateBy { it.agentId } + dispatchMessage.event.ignoreEnvAgentIds?.forEach { + val a = agentMap[it] + commonUtil.logWithAgentUrl(data, BK_ENV_WORKER_ERROR_IGNORE, arrayOf(it), a?.nodeId, a?.agentId) + } jobEnvActiveAgents = activeAgents.filter { it.agentId !in dispatchMessage.event.ignoreEnvAgentIds!! 
} if (jobEnvActiveAgents.isEmpty()) { throw BuildFailureException( @@ -731,6 +511,12 @@ class ThirdPartyDispatchService @Autowired constructor( envId = envId ) ) { + // 错误结束的在最外边有处理了,这里只管正常逻辑的 + commonUtil.updateQueueTime( + event = dispatchMessage.event, + createTime = dispatchMessage.event.dispatchQueueStartTimeMilliSecond ?: return, + endTime = LocalDateTime.now().timestampmilli() + ) return } } else { @@ -1074,16 +860,7 @@ class ThirdPartyDispatchService @Autowired constructor( return false } hasTryAgents.add(agent.agentId) - return doAgentInQueue( - dispatchMessage = dispatchMessage, - agent = agent, - workspace = dispatchType.workspace, - dockerInfo = dispatchType.dockerInfo, - envId = envId, - ignoreEnvAgentIds = dispatchMessage.event.ignoreEnvAgentIds, - hasReuseMutex = dispatchType.hasReuseMutex(), - jobId = dispatchMessage.event.jobId - ) + return agentInQueue(dispatchMessage, dispatchType, agent, envId) } private fun getRunningCnt(agentId: String, runningBuildsMapper: HashMap): Int { @@ -1206,9 +983,15 @@ class ThirdPartyDispatchService @Autowired constructor( ).data?.get(AgentReuseMutex.genAgentContextKey(jobId)) } + fun finishBuild(event: PipelineAgentShutdownEvent) { + tpaQueueService.finishQueue(event.buildId, event.vmSeqId) + thirdPartyAgentBuildService.finishBuild(event.buildId, event.vmSeqId, event.buildResult) + } + companion object { private val logger = LoggerFactory.getLogger(ThirdPartyDispatchService::class.java) private val availableAgentMatcher = AvailableAgent() private val idleAgentMatcher = IdleAgent() + const val DISPATCH_QUEUE_GRAY_PROJECT_PIPELINE = "DISPATCH_REDIS_QUEUE_GRAY_PROJECT_PIPELINE" } } diff --git a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/service/tpaqueue/TPAEnvQueueService.kt b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/service/tpaqueue/TPAEnvQueueService.kt new file mode 100644 index 00000000000..bb4387daabc --- /dev/null +++ 
b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/service/tpaqueue/TPAEnvQueueService.kt @@ -0,0 +1,552 @@ +package com.tencent.devops.dispatch.service.tpaqueue + +import com.tencent.devops.common.api.enums.AgentStatus +import com.tencent.devops.common.api.exception.RemoteServiceException +import com.tencent.devops.common.api.util.HashUtil +import com.tencent.devops.common.client.Client +import com.tencent.devops.common.pipeline.enums.VMBaseOS +import com.tencent.devops.common.pipeline.type.agent.ThirdPartyAgentEnvDispatchType +import com.tencent.devops.common.web.utils.I18nUtil +import com.tencent.devops.dispatch.constants.BK_AGENT_IS_BUSY +import com.tencent.devops.dispatch.constants.BK_ENV_DISPATCH_AGENT +import com.tencent.devops.dispatch.constants.BK_ENV_NODE_DISABLE +import com.tencent.devops.dispatch.constants.BK_ENV_WORKER_ERROR_IGNORE +import com.tencent.devops.dispatch.constants.BK_MAX_BUILD_SEARCHING_AGENT +import com.tencent.devops.dispatch.constants.BK_NO_AGENT_AVAILABLE +import com.tencent.devops.dispatch.constants.BK_QUEUE_TIMEOUT_MINUTES +import com.tencent.devops.dispatch.constants.BK_SEARCHING_AGENT +import com.tencent.devops.dispatch.constants.BK_SEARCHING_AGENT_MOST_IDLE +import com.tencent.devops.dispatch.constants.BK_SEARCHING_AGENT_PARALLEL_AVAILABLE +import com.tencent.devops.dispatch.constants.BK_THIRD_JOB_ENV_CURR +import com.tencent.devops.dispatch.constants.BK_THIRD_JOB_NODE_CURR +import com.tencent.devops.dispatch.exception.ErrorCodeEnum +import com.tencent.devops.dispatch.pojo.EnvQueueContext +import com.tencent.devops.dispatch.pojo.QueueDataContext +import com.tencent.devops.dispatch.pojo.ThirdPartyAgentDispatchData +import com.tencent.devops.dispatch.service.ThirdPartyAgentService +import com.tencent.devops.dispatch.utils.TPACommonUtil +import com.tencent.devops.dispatch.utils.TPACommonUtil.Companion.tagError +import 
com.tencent.devops.environment.api.thirdpartyagent.ServiceThirdPartyAgentResource +import com.tencent.devops.environment.pojo.thirdpartyagent.EnvNodeAgent +import com.tencent.devops.environment.pojo.thirdpartyagent.ThirdPartyAgent +import org.slf4j.LoggerFactory +import org.springframework.beans.factory.annotation.Autowired +import org.springframework.stereotype.Service +import javax.ws.rs.core.Response + +/** + * 存放第三方构建机环境相关逻辑 + */ +@Suppress("ComplexMethod") +@Service +class TPAEnvQueueService @Autowired constructor( + private val client: Client, + private val commonUtil: TPACommonUtil, + private val thirdPartyAgentService: ThirdPartyAgentService, + private val tpaSingleQueueService: TPASingleQueueService +) { + fun initEnvContext( + dataContext: QueueDataContext + ): EnvQueueContext { + val data = dataContext.data + + val (envId, envAgents) = fetchEnvIdAndAgents(dataContext, data.genEnvWithProject()!!) + logDisableAgents(data, envAgents) + + val agents = envAgents.filter { + it.agent.status == AgentStatus.IMPORT_OK && (data.os == it.agent.os || data.os == VMBaseOS.ALL.name) && + it.enableNode + }.map { it.agent } + if (agents.isNotEmpty()) { + return EnvQueueContext(envId, agents) + } + + commonUtil.logWarnI18n(data, BK_NO_AGENT_AVAILABLE) + throw TPACommonUtil.queueRetry( + errorCode = ErrorCodeEnum.LOAD_BUILD_AGENT_FAIL, + errMsg = "${data.buildId}|${data.vmSeqId} " + I18nUtil.getCodeLanMessage( + messageCode = BK_QUEUE_TIMEOUT_MINUTES, + language = I18nUtil.getDefaultLocaleLanguage(), + params = arrayOf("${data.queueTimeoutMinutes}") + ) + ) + } + + fun inEnvQueue( + context: EnvQueueContext, + dataContext: QueueDataContext + ) { + ignoreAgentCheck(context, dataContext) + allNodeConcurrencyCheck(context, dataContext) + singleNodeConcurrencyCheck(context, dataContext) + if (context.agents.isNotEmpty() && pickupAgent(context, dataContext)) { + afterGenAgentBuild(context, dataContext) + return + } + + // 没有可用构建机列表进入下一次重试 + val data = dataContext.data + 
logger.info("START_AGENT|${data.toLog()}|Not Found, Retry!") + commonUtil.logWarnI18n(data, BK_AGENT_IS_BUSY, suffixMsg = dataContext.retryLog("env agents not found")) + throw TPACommonUtil.queueRetry( + errorCode = ErrorCodeEnum.LOAD_BUILD_AGENT_FAIL, + errMsg = "${data.buildId}|${data.vmSeqId} " + I18nUtil.getCodeLanMessage( + messageCode = BK_QUEUE_TIMEOUT_MINUTES, + language = I18nUtil.getDefaultLocaleLanguage(), + params = arrayOf("${data.queueTimeoutMinutes}") + ) + ) + } + + private fun logDisableAgents(data: ThirdPartyAgentDispatchData, agents: List) { + val disableIds = + agents.filter { !it.enableNode }.associate { it.agent.agentId to it.nodeDisplayName }.ifEmpty { return } + val disableStr = disableIds.map { "[${it.key}][${it.value}]" }.joinToString(",") + commonUtil.logI18n(data, BK_ENV_NODE_DISABLE, arrayOf(disableStr)) + } + + private fun fetchEnvIdAndAgents( + dataContext: QueueDataContext, + env: String + ): Pair> { + val data = dataContext.data + + val agentsResult = try { + if (data.dispatchType.idType()) { + client.get(ServiceThirdPartyAgentResource::class).getAgentsByEnvId(data.projectId, env) + } else { + client.get(ServiceThirdPartyAgentResource::class).getAgentsByEnvNameWithId(data.projectId, env) + } + } catch (e: Exception) { + throw TPACommonUtil.queueFailure( + ErrorCodeEnum.GET_VM_ERROR, if (e is RemoteServiceException) { + e.errorMessage + } else { + e.message ?: (ErrorCodeEnum.GET_VM_ERROR.getErrorMessage() + "($env)") + } + ) + } + + if (agentsResult.status == Response.Status.FORBIDDEN.statusCode) { + logger.warn("fetchEnvIdAndAgents|START_AGENT_FAILED_FORBIDDEN|${data.toLog()}|err=${agentsResult.message}") + throw TPACommonUtil.queueFailure(ErrorCodeEnum.GET_VM_ERROR, agentsResult.message ?: "") + } + + if (agentsResult.isNotOk()) { + logger.warn("fetchEnvIdAndAgents|START_AGENT_FAILED|${data.toLog()}|err=${agentsResult.message}") + commonUtil.logDebugI18n(data, BK_AGENT_IS_BUSY, suffixMsg = 
dataContext.retryLog(agentsResult.message)) + throw TPACommonUtil.queueRetry( + ErrorCodeEnum.GET_BUILD_AGENT_ERROR, + suffixMsg = "(System Error) - $env: ${agentsResult.message}" + ) + } + + if (agentsResult.data == null) { + logger.warn("fetchEnvIdAndAgents|START_AGENT_FAILED|${data.toLog()}|err=null agents") + commonUtil.logDebugI18n(data, BK_AGENT_IS_BUSY, suffixMsg = dataContext.retryLog("null agents")) + throw TPACommonUtil.queueRetry( + ErrorCodeEnum.FOUND_AGENT_ERROR, + suffixMsg = "System Error - $env: agent is null" + ) + } + + val (envId, agentResData) = if (data.dispatchType.idType()) { + Pair( + HashUtil.decodeIdToLong((data.dispatchType as ThirdPartyAgentEnvDispatchType).envName), + (agentsResult.data as List) + ) + } else { + (agentsResult.data as Pair>) + } + + if (agentResData.isEmpty()) { + logger.warn("fetchEnvIdAndAgents|START_AGENT_FAILED|${data.toLog()}|err=empty agents") + throw TPACommonUtil.queueRetry( + ErrorCodeEnum.VM_NODE_NULL, + errMsg = ErrorCodeEnum.BUILD_NODE_IS_EMPTY.getErrorMessage(arrayOf(env)), + suffixMsg = "build cluster: $env is empty)" + ) + } + + return Pair(envId, agentResData) + } + + private fun ignoreAgentCheck(context: EnvQueueContext, dataContext: QueueDataContext) { + val data = dataContext.data + + if (data.ignoreEnvAgentIds.isNullOrEmpty() || !data.isEnv()) { + return + } + + val agentMap = context.agents.associateBy { it.agentId } + data.ignoreEnvAgentIds.forEach { + val a = agentMap[it] + commonUtil.logWithAgentUrl(data, BK_ENV_WORKER_ERROR_IGNORE, arrayOf(it), a?.nodeId, a?.agentId) + } + + val activeAgents = context.agents.filter { it.agentId !in data.ignoreEnvAgentIds } + if (activeAgents.isEmpty()) { + throw TPACommonUtil.queueFailureI18n( + ErrorCodeEnum.BK_ENV_WORKER_ERROR_IGNORE_ALL_ERROR, + param = arrayOf(data.ignoreEnvAgentIds.joinToString(",")) + ) + } + + context.agents = activeAgents + return + } + + private fun allNodeConcurrencyCheck(context: EnvQueueContext, dataContext: QueueDataContext) { + val 
data = dataContext.data + + if (data.allNodeConcurrency == null || !data.isEnv()) { + return + } + + val envId = context.envId + val jobId = data.jobId + if (envId == null || jobId.isNullOrBlank()) { + logger.warn( + "allNodeConcurrencyCheck|${data.toLog()}|has ${data.allNodeConcurrency} but env $envId job $jobId null" + ) + return + } + + if (context.projectJobRunningAndQueueAllCount(jobId) == null) { + context.setProjectJobRunningAndQueueAllCount( + jobId = jobId, + cnt = thirdPartyAgentService.countProjectJobRunningAndQueueAll( + pipelineId = data.pipelineId, + envId = envId, + jobId = jobId, + projectId = data.projectId + ) + ) + } + + val c = context.projectJobRunningAndQueueAllCount(jobId)!! + if (c < data.allNodeConcurrency) { + return + } + + commonUtil.logI18n( + data, BK_THIRD_JOB_ENV_CURR, arrayOf( + c.toString(), data.allNodeConcurrency.toString(), (data.queueTimeoutMinutes ?: 10).toString() + ) + ) + throw TPACommonUtil.queueRetry(ErrorCodeEnum.GET_BUILD_RESOURCE_ERROR) + } + + private fun singleNodeConcurrencyCheck(context: EnvQueueContext, dataContext: QueueDataContext) { + val data = dataContext.data + + if (data.singleNodeConcurrency == null || !data.isEnv()) { + return + } + + val envId = context.envId + val jobId = data.jobId + if (envId == null || jobId.isNullOrBlank()) { + logger.warn( + "${data.toLog()}|has singleNodeConcurrency ${data.singleNodeConcurrency} but env $envId job $jobId null" + ) + return + } + + val activeAgents = mutableListOf() + + val agentIds = context.agents.map { it.agentId }.toSet() + // 可能存在某些条件导致前面最后选剩下的 agent 在不同的编排中不同,这里需要补缺的 + if (context.agentsJobRunningAndQueueAllMap(jobId) == null) { + context.setAllAgentsJobRunningAndQueueAllMap( + jobId, + thirdPartyAgentService.countAgentsJobRunningAndQueueAll( + pipelineId = data.pipelineId, + envId = envId, + jobId = jobId, + agentIds = agentIds, + projectId = data.projectId + ).toMutableMap() + ) + } else if (context.agentsJobRunningAndQueueAllMap(jobId)!!.keys != agentIds) 
{ + val newMap = thirdPartyAgentService.countAgentsJobRunningAndQueueAll( + pipelineId = data.pipelineId, + envId = envId, + jobId = jobId, + agentIds = agentIds, + projectId = data.projectId + ) + newMap.forEach { (k, v) -> + if (!context.agentsJobRunningAndQueueAllMap(jobId)!!.containsKey(k)) { + context.setAgentsJobRunningAndQueueAllMap(jobId, k, v) + } + } + } + val m = context.agentsJobRunningAndQueueAllMap(jobId)!!.toMap() + + context.agents.forEach { agent -> + // 为空说明当前节点没有记录就是没有任务直接加,除非并发是0的情况 + val agentCount = m[agent.agentId] ?: if (data.singleNodeConcurrency > 0) { + activeAgents.add(agent) + return@forEach + } else { + commonUtil.logDebug(data, "singleNodeConcurrency: ${data.singleNodeConcurrency} == 0") + return@forEach + } + if (agentCount < data.singleNodeConcurrency) { + activeAgents.add(agent) + return@forEach + } + commonUtil.logDebug( + data, + "singleNodeConcurrency: ${agent.agentId}:$agentCount > ${data.singleNodeConcurrency}" + ) + } + + // 没有一个节点满足则进入排队机制 + if (activeAgents.isEmpty()) { + commonUtil.logI18n( + data, BK_THIRD_JOB_NODE_CURR, + arrayOf(data.singleNodeConcurrency.toString(), (data.queueTimeoutMinutes ?: 10).toString()) + ) + throw TPACommonUtil.queueRetry(ErrorCodeEnum.GET_BUILD_RESOURCE_ERROR) + } + + context.agents = activeAgents + return + } + + private fun pickupAgent(context: EnvQueueContext, dataContext: QueueDataContext): Boolean { + val data = dataContext.data + + if (!data.dispatchType.isEnv()) { + // 理论上不可能但是逻辑上有可能所以打印日志切不进行选取 + logger.tagError("PickupAgentCheck|not env|${data.toLog()}") + return false + } + + val activeAgents = context.agents + // 这里拿之前构建过的 agent 需要动态的去拿因为不同编排的 agents,因为前面可能的 check 而导致最后选择结果不同导致 agents 不同 + // 同时因为不存在说是之前构建过的机器突然不构建了所以每次即使去拿也不会对同一组的构建机器产生不同结果 + val agentMaps = activeAgents.associateBy { it.agentId } + val preBuildAgents = ArrayList(agentMaps.size) + thirdPartyAgentService.getPreBuildAgentIds( + projectId = data.projectId, + pipelineId = data.pipelineId, + vmSeqId = data.vmSeqId, + 
size = activeAgents.size.coerceAtLeast(1) + ).forEach { agentId -> agentMaps[agentId]?.let { agent -> preBuildAgents.add(agent) } } + + val pbAgents = sortAgent( + data = data, + agents = preBuildAgents, + context = context + ) + + /** + * 1. 最高优先级的agent: + * a. 最近构建机中使用过这个构建机,并且 + * b. 当前没有任何构建机任务 + * 2. 次高优先级的agent: + * a. 最近构建机中使用过这个构建机,并且 + * b. 当前有构建任务,选当前正在运行任务最少的构建机(没有达到当前构建机的最大并发数) + * 3. 第三优先级的agent: + * a. 当前没有任何构建机任务 + * 4. 第四优先级的agent: + * a. 当前有构建任务,选当前正在运行任务最少的构建机(没有达到当前构建机的最大并发数) + */ + + val retryMsg = "retry: ${dataContext.retryTime} | " + + /** + * 最高优先级的agent: 根据哪些agent没有任何任务并且是在最近构建中使用到的Agent + */ + commonUtil.logDebugI18n(data, BK_SEARCHING_AGENT, preMsg = retryMsg) + if (matchAgents(context, dataContext, pbAgents, idleAgentMatcher)) { + return true + } + + /** + * 次高优先级的agent: 最近构建机中使用过这个构建机,并且当前有构建任务,选当前正在运行任务最少的构建机(没有达到当前构建机的最大并发数) + */ + commonUtil.logDebugI18n(data, BK_MAX_BUILD_SEARCHING_AGENT, preMsg = retryMsg) + if (matchAgents(context, dataContext, pbAgents, availableAgentMatcher)) { + return true + } + + val allAgents = sortAgent( + data = data, + agents = activeAgents, + context = context + ) + + /** + * 第三优先级的agent: 当前没有任何构建机任务 + */ + commonUtil.logDebugI18n(data, BK_SEARCHING_AGENT_MOST_IDLE, preMsg = retryMsg) + if (matchAgents(context, dataContext, allAgents, idleAgentMatcher)) { + return true + } + + /** + * 第四优先级的agent: 当前有构建任务,选当前正在运行任务最少的构建机(没有达到当前构建机的最大并发数) + */ + commonUtil.logDebugI18n(data, BK_SEARCHING_AGENT_PARALLEL_AVAILABLE, preMsg = retryMsg) + if (matchAgents(context, dataContext, allAgents, availableAgentMatcher)) { + return true + } + + commonUtil.logWarnI18n(data, BK_NO_AGENT_AVAILABLE) + + return false + } + + private fun matchAgents( + context: EnvQueueContext, + dataContext: QueueDataContext, + agents: Collection, + agentMatcher: AgentMatcher + ): Boolean { + if (agents.isEmpty()) { + return false + } + + val data = dataContext.data + agents.forEach { + val agent = it.agent + if 
(context.hasTryAgents.contains(agent.agentId)) { + return@forEach + } + + val matchOk = agentMatcher.match( + agent = agent, + runningCnt = it.runningCnt, + dockerBuilder = data.dispatchType.dockerInfo != null, + dockerRunningCnt = it.dockerRunningCnt + ) + if (!matchOk) { + return@forEach + } + + dataContext.buildAgent = agent + if (!tpaSingleQueueService.genAgentBuild(context, dataContext)) { + context.hasTryAgents.add(agent.agentId) + return@forEach + } + + commonUtil.logWithAgentUrl( + data = data, + messageCode = BK_ENV_DISPATCH_AGENT, + param = arrayOf("[${agent.agentId}]${agent.hostname}/${agent.ip}"), + nodeHashId = agent.nodeId, + agentHashId = agent.agentId + ) + return true + } + + return false + } + + private fun sortAgent( + data: ThirdPartyAgentDispatchData, + agents: Collection, + context: EnvQueueContext + ): MutableList { + val sortQ = mutableListOf() + agents.forEach { + val runningCnt = getRunningCnt(context, it.agentId) + val dockerRunningCnt = if (data.dispatchType.dockerInfo == null) { + 0 + } else { + getDockerRunningCnt(context, it.agentId) + } + sortQ.add(AgentAndCount(it, runningCnt, dockerRunningCnt)) + commonUtil.logDebug( + data, + "[${it.agentId}]${it.hostname}/${it.ip}, Jobs:$runningCnt, DockerJobs:$dockerRunningCnt" + ) + } + // 这里应该根据不同的构建使用不同的排序 + if (data.dispatchType.dockerInfo == null) { + sortQ.sortBy { it.runningCnt } + } else { + sortQ.sortBy { it.dockerRunningCnt } + } + return sortQ + } + + // runningCnt,每次拿取的都是指定 agent 范围的,所以即使不裁剪也不会影响拿取结果 + private fun getRunningCnt(context: EnvQueueContext, agentId: String): Int { + var runningCnt = context.agentRunningCnt[agentId] + if (runningCnt == null) { + runningCnt = thirdPartyAgentService.getRunningBuilds(agentId) + context.agentRunningCnt[agentId] = runningCnt + } + return runningCnt + } + + private fun getDockerRunningCnt(context: EnvQueueContext, agentId: String): Int { + var dockerRunningCnt = context.dockerRunningCnt[agentId] + if (dockerRunningCnt == null) { + 
dockerRunningCnt = thirdPartyAgentService.getDockerRunningBuilds(agentId) + context.dockerRunningCnt[agentId] = dockerRunningCnt + } + return dockerRunningCnt + } + + fun afterGenAgentBuild(context: EnvQueueContext, dataContext: QueueDataContext) { + dataContext.data.jobId?.let { jobId -> + context.setProjectJobRunningAndQueueAllCount(jobId, null) + } + + val agentId = dataContext.buildAgent?.agentId ?: return + dataContext.data.jobId?.let { jobId -> + context.setAgentsJobRunningAndQueueAllMap(jobId, agentId, null) + } + context.agentRunningCnt.remove(agentId) + context.dockerRunningCnt.remove(agentId) + context.hasTryAgents.remove(agentId) + } + + companion object { + private val logger = LoggerFactory.getLogger(TPAEnvQueueService::class.java) + private val availableAgentMatcher = AvailableAgent() + private val idleAgentMatcher = IdleAgent() + } +} + +data class AgentAndCount( + val agent: ThirdPartyAgent, + val runningCnt: Int, + val dockerRunningCnt: Int +) + +interface AgentMatcher { + fun match(runningCnt: Int, agent: ThirdPartyAgent, dockerBuilder: Boolean, dockerRunningCnt: Int): Boolean +} + +class IdleAgent : AgentMatcher { + override fun match( + runningCnt: Int, + agent: ThirdPartyAgent, + dockerBuilder: Boolean, + dockerRunningCnt: Int + ): Boolean = if (dockerBuilder) { + dockerRunningCnt == 0 + } else { + runningCnt == 0 + } +} + +class AvailableAgent : AgentMatcher { + override fun match( + runningCnt: Int, + agent: ThirdPartyAgent, + dockerBuilder: Boolean, + dockerRunningCnt: Int + ) = if (dockerBuilder) { + agent.dockerParallelTaskCount == 0 || (agent.dockerParallelTaskCount?.let { it > dockerRunningCnt } ?: false) + } else { + agent.parallelTaskCount == 0 || (agent.parallelTaskCount?.let { it > runningCnt } ?: false) + } +} \ No newline at end of file diff --git a/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/service/tpaqueue/TPAQueueService.kt 
b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/service/tpaqueue/TPAQueueService.kt new file mode 100644 index 00000000000..8a93aebe121 --- /dev/null +++ b/src/backend/ci/core/dispatch/biz-dispatch/src/main/kotlin/com/tencent/devops/dispatch/service/tpaqueue/TPAQueueService.kt @@ -0,0 +1,328 @@ +package com.tencent.devops.dispatch.service.tpaqueue + +import com.tencent.devops.common.api.constant.CommonMessageCode.BK_FAILED_START_BUILD_MACHINE +import com.tencent.devops.common.api.exception.InvalidParamException +import com.tencent.devops.common.api.pojo.ErrorType +import com.tencent.devops.common.api.util.timestampmilli +import com.tencent.devops.common.dispatch.sdk.BuildFailureException +import com.tencent.devops.common.dispatch.sdk.DispatchSdkErrorCode +import com.tencent.devops.common.dispatch.sdk.service.DispatchService +import com.tencent.devops.common.dispatch.sdk.service.JobQuotaService +import com.tencent.devops.common.event.annotation.Event +import com.tencent.devops.common.pipeline.type.agent.ThirdPartyAgentEnvDispatchType +import com.tencent.devops.common.redis.RedisOperation +import com.tencent.devops.common.web.utils.I18nUtil +import com.tencent.devops.dispatch.dao.TPAQueueDao +import com.tencent.devops.dispatch.dao.ThirdPartyAgentQueueSqlData +import com.tencent.devops.dispatch.exception.DispatchRetryMQException +import com.tencent.devops.dispatch.pojo.QueueDataContext +import com.tencent.devops.dispatch.pojo.ThirdPartyAgentDispatchData +import com.tencent.devops.dispatch.pojo.TPAQueueEvent +import com.tencent.devops.dispatch.pojo.TPAQueueEventContext +import com.tencent.devops.dispatch.pojo.ThirdPartyAgentSqlQueueType +import com.tencent.devops.dispatch.utils.TPACommonUtil +import com.tencent.devops.dispatch.utils.TPACommonUtil.Companion.tagError +import com.tencent.devops.dispatch.utils.ThirdPartyAgentQueueEnvLock +import org.jooq.DSLContext +import org.slf4j.LoggerFactory +import 
/**
 * Queue entry point for third-party build agent (TPA) dispatch requests.
 *
 * Requests are persisted through [TPAQueueDao] and driven by [TPAQueueEvent] messages sent with
 * [RabbitTemplate]. Sending and consuming share one [ThirdPartyAgentQueueEnvLock] (keyed by project id
 * and queue data) so that only one message per queue is being consumed at a time.
 */
@Service
class TPAQueueService @Autowired constructor(
    private val dslContext: DSLContext,
    private val redisOperation: RedisOperation,
    private val rabbitTemplate: RabbitTemplate,
    private val commonUtil: TPACommonUtil,
    private val tpaQueueDao: TPAQueueDao,
    private val tpaEnvQueueService: TPAEnvQueueService,
    private val dispatchService: DispatchService,
    private val jobQuotaService: JobQuotaService
) {
    /**
     * Persists [data] as a new queue record and sends the first queue message for it.
     *
     * @throws InvalidParamException when the dispatch type is not [ThirdPartyAgentEnvDispatchType]
     */
    fun queue(data: ThirdPartyAgentDispatchData) {
        logger.info("queue|${data.toLog()}")
        val (sqlData, dataType) = when (data.dispatchType) {
            // Only environment queueing is supported for now.
            is ThirdPartyAgentEnvDispatchType -> Pair(data.genEnv()!!, ThirdPartyAgentSqlQueueType.ENV)
            else -> throw InvalidParamException("Unknown agent type - ${data.dispatchType}")
        }
        val now = LocalDateTime.now()
        tpaQueueDao.add(
            dslContext = dslContext,
            projectId = data.projectId,
            pipelineId = data.pipelineId,
            buildId = data.buildId,
            vmSeqId = data.vmSeqId,
            data = sqlData,
            dataType = dataType,
            info = data.genSqlJsonData(),
            retryTime = 0,
            createTime = now,
            updateTime = now
        )
        // Record the queue start time right away, in case the build is cancelled while queueing.
        commonUtil.updateQueueTime(data, now.timestampmilli(), null)
        val event = TPAQueueEvent(
            projectId = data.projectId,
            pipelineId = data.pipelineId,
            data = sqlData,
            dataType = dataType,
            sendData = data,
            delayMills = 5000,
            lockValue = UUID.randomUUID().toString()
        )
        dispatch(event)
    }

    /**
     * Sends [event] to the message queue while holding the queue lock.
     *
     * The number of consumer threads is limited; when a consumer is slow the other messages are not lost,
     * they just wait. Producer and consumer therefore share one lock, so that only one message per queue
     * is consumed at a time. When the lock cannot be obtained nothing is sent.
     */
    private fun dispatch(event: TPAQueueEvent) {
        logger.info("queue_dispatch|${event.toLog()}")
        // The longest a queue can live is the job queueing time, 7 days. Only ENV exists for now.
        val lock = queueLock(event)
        if (!lock.tryLock(timeout = 5000, interval = 1000)) {
            logFirstDispatch(event, "no lock wait other queue")
            return
        }
        try {
            logFirstDispatch(event, "get lock in queue")
            logger.info("queue_dispatch|${event.toLog()}")
            send(event)
        } catch (e: Throwable) {
            // Can only be a send failure or a bug in the handling above; unlock so the lock is not left held.
            logger.tagError("dispatch|send|${event.toLog()}|error", e)
            lock.unlock()
        }
    }

    /** Builds the lock shared by the sending and the consuming side of [event]'s queue. */
    private fun queueLock(event: TPAQueueEvent) = ThirdPartyAgentQueueEnvLock(
        redisOperation = redisOperation,
        projectId = event.projectId,
        queueKey = event.data,
        expiredTimeInSeconds = ENV_LOCK_TIME_OUT_7D,
        lockValue = event.lockValue
    )

    /**
     * Writes the lock outcome to the build log and the service log, then clears `sendData`.
     * Does nothing when `sendData` is already null, so each event is reported at most once.
     */
    private fun logFirstDispatch(event: TPAQueueEvent, reason: String) {
        val firstData = event.sendData ?: return
        commonUtil.logDebug(firstData, "do queue $reason")
        logger.info("doQueue|${firstData.toLog()}|$reason")
        event.sendData = null
    }

    /**
     * Publishes [event] to the exchange and route key declared by its [Event] annotation.
     *
     * @throws IllegalStateException when the event class carries no [Event] annotation
     */
    private fun send(event: TPAQueueEvent) {
        val eventType = event::class.java.annotations.filterIsInstance<Event>().firstOrNull()
            ?: error("${event::class.java.name} has no @Event annotation, exchange/routeKey unknown")
        rabbitTemplate.convertAndSend(eventType.exchange, eventType.routeKey, event) { message ->
            when {
                // Delay specified on the event instance.
                event.delayMills > 0 -> message.messageProperties.setHeader("x-delay", event.delayMills)
                // Default delay fixed on the event type.
                eventType.delayMills > 0 ->
                    message.messageProperties.setHeader("x-delay", eventType.delayMills)
                // Non-delayed messages expire after 8 hours, so messages that no consumer ACKs
                // cannot pile up and break the MQ.
                else -> message.messageProperties.expiration = "28800000"
            }
            message
        }
    }

    /**
     * Runs one queue round for [event], releases the queue lock and re-sends the event when records remain.
     *
     * Every change of the SQL data should hold the lock and send a message (the remaining records are
     * counted again after the round, which acts as the message refresh), so the queue keeps existing
     * until all data has been handled. Queue state lives in SQL and is updated at the end of each round,
     * so it is persisted no matter which message arrives.
     */
    fun doQueue(event: TPAQueueEvent) {
        inQueue(event)
        queueLock(event).unlock()
        // After unlocking, look for records that are still not dispatched and send another message if any.
        try {
            val count = tpaQueueDao.fetchProjectDataCount(
                dslContext = dslContext,
                projectId = event.projectId,
                pipelineId = event.pipelineId,
                data = event.data,
                dataType = event.dataType
            )
            if (count > 0) {
                dispatch(event)
            }
        } catch (e: Throwable) {
            // Can only be a SQL error or a bug in the handling above; re-send so the message is not lost.
            logger.tagError("doQueue|fetchProjectDataCount|${event.toLog()}|error", e)
            dispatch(event)
        }
    }

    /** Fetches the queued records of [event] and tries to dispatch each of them once. */
    private fun inQueue(event: TPAQueueEvent) {
        try {
            val records = tpaQueueDao.fetchProjectData(
                dslContext = dslContext,
                projectId = event.projectId,
                pipelineId = event.pipelineId,
                data = event.data,
                dataType = event.dataType
            )
            // The round starts here: creating the context after the fetch keeps the DB query out of the timing.
            val eventContext = TPAQueueEventContext()
            // Only environment queueing for now; the check does not depend on the record, so do it once.
            if (event.dataType == ThirdPartyAgentSqlQueueType.ENV) {
                records.forEachIndexed { index, sqlData ->
                    inEnvQueue(eventContext, sqlData, records.size, index + 1)
                }
            }
            // End of the round: bump the retry counter of the records that need another try.
            tpaQueueDao.addRetryTimeByIds(dslContext, eventContext.needRetryRecord)
        } catch (e: Throwable) {
            // Can only be a SQL error or a bug in the handling above; re-send so the message is not lost.
            // NOTE(review): the queue lock is presumably still held by this event here - confirm that
            // dispatch() can actually obtain it at this point.
            logger.tagError("inQueue|fetchProjectData|${event.toLog()}|error", e)
            dispatch(event)
        }
    }

    /**
     * Wrapper around [TPAEnvQueueService.inEnvQueue] that consolidates exception handling
     * (for example finish and retry exceptions) for a single queued record.
     */
    private fun inEnvQueue(
        eventContext: TPAQueueEventContext,
        sqlData: ThirdPartyAgentQueueSqlData,
        queueSize: Int,
        queueIndex: Int
    ) {
        try {
            val dataContext = QueueDataContext(sqlData.data, sqlData.retryTime)
            if (!checkRunning(dataContext)) {
                eventContext.setDelete(sqlData.recordId)
                return
            }

            val costMilliSecond = System.currentTimeMillis() - eventContext.startTimeMilliSecond
            commonUtil.logDebug(
                dataContext.data, "env queue size:$queueSize index:$queueIndex cost ${costMilliSecond}ms"
            )
            // The context is initialised only once, but initialisation itself may fail. Doing it per record
            // charges the failure to that record instead of losing the whole queue.
            if (eventContext.context == null) {
                eventContext.context = tpaEnvQueueService.initEnvContext(dataContext)
            }
            tpaEnvQueueService.inEnvQueue(eventContext.context!!, dataContext)
            // Only reached when the dispatch succeeded: mark the record for deletion.
            eventContext.setDelete(sqlData.recordId)
        } catch (e: Throwable) {
            queueEnd(eventContext, sqlData, e)
        } finally {
            // Write the queue time as soon as the record leaves the queue and delete the record,
            // so a later cancel by the user cannot produce a wrong duration.
            if (eventContext.needDeleteRecord?.first == sqlData.recordId) {
                tpaQueueDao.delete(dslContext, sqlData.recordId)
                commonUtil.updateQueueTime(
                    data = sqlData.data,
                    createTime = sqlData.createTime.timestampmilli(),
                    endTime = eventContext.needDeleteRecord?.second
                )
            }
        }
    }

    /**
     * Asks [DispatchService.checkRunning] whether the build behind [dataContext] is still running.
     * For a retried request that is no longer running, the running-job quota record is removed.
     */
    private fun checkRunning(dataContext: QueueDataContext): Boolean {
        val data = dataContext.data
        val running = dispatchService.checkRunning(
            projectId = data.projectId,
            buildId = data.buildId,
            containerId = data.vmSeqId,
            retryTime = dataContext.retryTime,
            executeCount = data.executeCount,
            logTag = data.toLog()
        )
        if (!running && dataContext.retryTime > 1) {
            // A retried request whose pipeline has already ended: proactively drop its quota record.
            jobQuotaService.removeRunningJob(
                projectId = data.projectId,
                pipelineId = data.pipelineId,
                buildId = data.buildId,
                vmSeqId = data.vmSeqId,
                executeCount = data.executeCount
            )
        }
        return running
    }

    /**
     * Handles the exception [e] raised while dispatching [sqlData].
     *
     * A [DispatchRetryMQException] inside the queue timeout marks the record for retry. Everything else
     * is turned into a [BuildFailureException], the record is marked for deletion and the failure is
     * reported.
     */
    fun queueEnd(
        eventContext: TPAQueueEventContext,
        sqlData: ThirdPartyAgentQueueSqlData,
        e: Throwable
    ) {
        val data = sqlData.data
        val failureE = when (e) {
            is DispatchRetryMQException -> {
                // Decide by wall-clock time, which avoids arithmetic on retryTime.
                val timeOut = data.queueTimeoutMinutes ?: 10
                if (sqlData.createTime.plusMinutes(timeOut.toLong()) > LocalDateTime.now()) {
                    eventContext.addRetry(sqlData.recordId)
                    return
                }
                // Timed out: the queueing ends as a failure.
                val message = e.message ?: "Fail to start up the job after $timeOut minutes"
                BuildFailureException(
                    errorType = ErrorType.SYSTEM,
                    errorCode = DispatchSdkErrorCode.RETRY_STARTUP_FAIL,
                    formatErrorMessage = message,
                    errorMessage = message
                )
            }

            is BuildFailureException -> e

            else -> {
                logger.tagError("queueEnd|unKnowError|${data.toLog()}", e)
                val message = e.message ?: "Fail to handle the start up message"
                BuildFailureException(
                    errorType = ErrorType.SYSTEM,
                    errorCode = DispatchSdkErrorCode.SDK_SYSTEM_ERROR,
                    formatErrorMessage = message,
                    errorMessage = message
                )
            }
        }
        eventContext.setDelete(sqlData.recordId)
        onFailure(data, failureE)
    }

    /** Writes the failure to the build log and forwards it to [DispatchService.onFailure]. */
    private fun onFailure(
        data: ThirdPartyAgentDispatchData,
        exception: BuildFailureException
    ) {
        commonUtil.logError(
            data = data,
            message = "${I18nUtil.getCodeLanMessage(BK_FAILED_START_BUILD_MACHINE)}-${exception.message}"
        )
        dispatchService.onFailure(
            projectId = data.projectId,
            pipelineId = data.pipelineId,
            buildId = data.buildId,
            vmSeqId = data.vmSeqId,
            e = exception,
            logTag = data.toLog()
        )
    }

    /**
     * Closes the queue records of [buildId] (optionally limited to [vmSeqId]): writes "now" as their
     * queue end time and deletes them. Returns without doing anything when no record exists.
     */
    fun finishQueue(buildId: String, vmSeqId: String?) {
        val now = LocalDateTime.now().timestampmilli()
        val records = tpaQueueDao.fetchTimeByBuild(dslContext, buildId, vmSeqId).ifEmpty { return }
        // Fallback end time when the build is cancelled.
        records.forEach { record ->
            commonUtil.updateQueueTime(record.data, record.createTime.timestampmilli(), now)
        }
        tpaQueueDao.deleteByIds(dslContext, records.map { it.recordId }.toSet())
    }

    companion object {
        private const val ENV_LOCK_TIME_OUT_7D = 60 * 60 * 24 * 7L
        private val logger = LoggerFactory.getLogger(TPAQueueService::class.java)
    }
}
/**
 * Single-node logic for third-party build agents.
 * The agent itself does not take part in queueing; this code is shared by the old and the new logic.
 */
@Suppress("ComplexMethod", "NestedBlockDepth")
@Service
class TPASingleQueueService @Autowired constructor(
    private val redisOperation: RedisOperation,
    private val client: Client,
    private val commonUtil: TPACommonUtil,
    private val thirdPartyAgentBuildRedisUtils: ThirdPartyAgentBuildRedisUtils,
    private val thirdPartyAgentService: ThirdPartyAgentService
) {
    /**
     * Tries to start the build on the agent stored in [dataContext].
     *
     * @return true when the build was queued on the agent, false otherwise (including a missing agent)
     */
    fun genAgentBuild(context: QueueContext, dataContext: QueueDataContext): Boolean {
        val data = dataContext.data
        val agent = dataContext.buildAgent ?: run {
            // Should not happen in theory, but is possible logically, hence the check.
            logger.tagError("genAgentBuild|build agent is null|${data.toLog()}")
            return false
        }

        return doAgentInQueue(data, agent, context.envId)
    }

    /**
     * Tries to queue [data] on [agent] while holding the per-agent [ThirdPartyAgentLock].
     *
     * @return true when the build was queued; false when the agent lock is busy or the agent
     * cannot take the build
     */
    fun doAgentInQueue(
        data: ThirdPartyAgentDispatchData,
        agent: ThirdPartyAgent,
        envId: Long?
    ): Boolean {
        if (data.dispatchType.isEnv()) {
            commonUtil.logWithAgentUrl(
                data = data,
                messageCode = TRY_AGENT_DISPATCH,
                param = arrayOf("[${agent.agentId}]${agent.hostname}/${agent.ip}"),
                nodeHashId = agent.nodeId,
                agentHashId = agent.agentId
            )
        }
        val redisLock = ThirdPartyAgentLock(redisOperation, data.projectId, agent.agentId)
        if (!redisLock.tryLock()) {
            commonUtil.logWarnI18n(
                data, ErrorCodeEnum.BUILD_MACHINE_BUSY.errorCode.toString(),
                suffixMsg = "(Agent is busy) - ${agent.hostname}/${agent.ip}"
            )
            // The lock was not acquired, so there is nothing to release.
            return false
        }
        try {
            return agentInQueue(data, agent, envId)
        } finally {
            redisLock.unlock()
        }
    }

    /**
     * Checks whether [agent] may take the build (not upgrading, reuse-mutex rules satisfied) and,
     * if so, queues it and records the agent in the build detail.
     */
    private fun agentInQueue(
        data: ThirdPartyAgentDispatchData,
        agent: ThirdPartyAgent,
        envId: Long?
    ): Boolean {
        if (thirdPartyAgentBuildRedisUtils.isThirdPartyAgentUpgrading(data.projectId, agent.agentId)) {
            commonUtil.logWarnI18n(
                data, ErrorCodeEnum.BUILD_MACHINE_UPGRADE_IN_PROGRESS.errorCode.toString(),
                suffixMsg = " - ${agent.hostname}/${agent.ip}"
            )
            return false
        }

        // #10082 For agents that reuse or are reused, check whether this machine can be used.
        val lockKey = AgentReuseMutex.genAgentReuseMutexLockKey(data.projectId, agent.agentId)
        val usable = if (data.dispatchType.hasReuseMutex()) {
            acquireReuseMutex(data, agent, lockKey)
        } else {
            !isReservedForReuse(data, agent, lockKey)
        }
        if (!usable) {
            return false
        }

        // #5806 If the DB insert fails, Redis is not written.
        inQueue(data, agent, envId)

        // Save the agent into the build detail.
        saveAgentInfoToBuildDetail(data, agent)

        logger.info("${data.buildId}|START_AGENT_BY_ID|j(${data.vmSeqId})|agent=${agent.agentId}")
        commonUtil.logWithAgentUrl(
            data = data,
            messageCode = BK_SCHEDULING_SELECTED_AGENT,
            param = arrayOf(agent.hostname, agent.ip),
            nodeHashId = agent.nodeId,
            agentHashId = agent.agentId
        )
        return true
    }

    /**
     * Takes the agent reuse mutex for this build. The mutex is not released here.
     *
     * @return false when the mutex is held by another value, meaning the agent cannot be chosen now
     */
    private fun acquireReuseMutex(
        data: ThirdPartyAgentDispatchData,
        agent: ThirdPartyAgent,
        lockKey: String
    ): Boolean {
        val lock = RedisLockByValue(
            redisOperation = redisOperation,
            lockKey = lockKey,
            lockValue = data.buildId,
            expiredTimeInSeconds = AgentReuseMutex.AGENT_LOCK_TIMEOUT
        )
        // No lock means the machine is currently occupied by a reuse mutex and cannot be selected.
        if (!lock.tryLock()) {
            logAgentReuse(data, agent, AGENT_REUSE_MUTEX_REDISPATCH, null)
            return false
        }
        try {
            // #10082 Set the context variable needed for reuse; the jobId must be the root node's id.
            // Only env-type root nodes need it, id-type ones are written by the engine's AgentReuseMutexCmd.
            val dispatch = data.dispatchType
            val reusedInfo = dispatch.reusedInfo
            if (dispatch.isEnv() && reusedInfo != null && reusedInfo.jobId == null) {
                client.get(ServiceVarResource::class).setContextVar(
                    SetContextVarData(
                        projectId = data.projectId,
                        pipelineId = data.pipelineId,
                        buildId = data.buildId,
                        // A root node always has a jobId; this is checked on the engine side.
                        contextName = AgentReuseMutex.genAgentContextKey(data.jobId!!),
                        contextVal = agent.agentId,
                        readOnly = true,
                        rewriteReadOnly = true
                    )
                )
                // Write the linkTip so that blocked builds can print it in their log.
                redisOperation.set(
                    key = AgentReuseMutex.genAgentReuseMutexLinkTipKey(data.buildId),
                    value = "${data.pipelineId}_Job[${data.vmSeqId}|${data.jobId}]",
                    expiredInSecond = AgentReuseMutex.AGENT_LOCK_TIMEOUT
                )
            }
        } catch (e: Exception) {
            // Best effort: a failure to write the context does not stop the dispatch.
            logger.tagError("agentInQueue|${data.toLog()}|setContextVar|error", e)
        }
        return true
    }

    /**
     * For a build without reuse logic: checks whether the remaining capacity of [agent] has to be kept
     * for the build that holds the reuse mutex.
     *
     * If the mutex holder already has a task running on the agent nothing is reserved; otherwise one
     * slot is kept free for it.
     *
     * @return true when this build must not be dispatched to the agent
     */
    private fun isReservedForReuse(
        data: ThirdPartyAgentDispatchData,
        agent: ThirdPartyAgent,
        lockKey: String
    ): Boolean {
        val lockedBuildId = redisOperation.get(lockKey)
        if (lockedBuildId.isNullOrBlank() || lockedBuildId == data.buildId) {
            return false
        }
        val useDocker = data.dispatchType.dockerInfo != null
        val (hasRun, cnt) = thirdPartyAgentService.checkRunningAndSize(agent.agentId, lockedBuildId, useDocker)
        val capacity = (if (useDocker) agent.dockerParallelTaskCount else agent.parallelTaskCount)
            ?: DEFAULT_PARALLEL_TASK_COUNT
        val reservedSlots = if (hasRun) 0 else 1
        if (capacity - cnt > reservedSlots) {
            return false
        }
        logAgentReuse(data, agent, AGENT_REUSE_MUTEX_RESERVE_REDISPATCH, lockedBuildId)
        return true
    }

    /**
     * Logs why [agent] was skipped because of a reuse mutex. When the linkTip of the locking build is
     * available the message is logged through [TPACommonUtil.logWithBuildUrl], otherwise as plain I18n text.
     *
     * @param lockBuildId id of the build holding the mutex; read from Redis when null
     */
    private fun logAgentReuse(
        data: ThirdPartyAgentDispatchData,
        agent: ThirdPartyAgent,
        messageCode: String,
        lockBuildId: String?
    ) {
        val lockedBuildId = lockBuildId
            ?: redisOperation.get(AgentReuseMutex.genAgentReuseMutexLockKey(data.projectId, agent.agentId))

        val params = arrayOf("${agent.agentId}|${agent.hostname}/${agent.ip}", lockedBuildId ?: "")
        if (lockedBuildId.isNullOrBlank()) {
            commonUtil.logI18n(data, messageCode, params)
            return
        }
        val linkTip = redisOperation.get(AgentReuseMutex.genAgentReuseMutexLinkTipKey(lockedBuildId))
        if (linkTip.isNullOrBlank()) {
            commonUtil.logI18n(data, messageCode, params)
            return
        }
        // The linkTip is written as "<pipelineId>_<tip>" by acquireReuseMutex.
        commonUtil.logWithBuildUrl(
            data,
            messageCode,
            params,
            linkTip.substringBefore("_"),
            lockedBuildId,
            linkTip.substringAfter("_")
        )
    }

    /**
     * Queues the build on [agent] (DB first, then Redis) and, when the job has an id, writes the
     * `jobs.<jobId>.container.node_alias` context variable on a best-effort basis.
     */
    private fun inQueue(
        data: ThirdPartyAgentDispatchData,
        agent: ThirdPartyAgent,
        envId: Long?
    ) {
        thirdPartyAgentService.queueBuild(
            agent = agent,
            dispatchData = data,
            envId = envId
        )

        thirdPartyAgentBuildRedisUtils.setThirdPartyBuild(
            agent.secretKey,
            ThirdPartyRedisBuild(
                projectId = data.projectId,
                pipelineId = data.pipelineId,
                buildId = data.buildId,
                agentId = agent.agentId,
                vmSeqId = data.vmSeqId,
                vmName = agent.hostname,
                channelCode = data.channelCode,
                atoms = data.atoms
            )
        )

        // Add the node alias context variable; it is keyed by job id, so skip jobs without one.
        if (data.jobId.isNullOrBlank()) {
            return
        }
        try {
            val detail = client.get(ServiceThirdPartyAgentResource::class).getAgentDetail(
                userId = data.userId,
                projectId = data.projectId,
                agentHashId = agent.agentId
            ).data
            if (detail == null) {
                logger.warn("inQueue|${data.toLog()}|setContextVar|getAgentDetail ${agent.agentId} is null")
                return
            }
            client.get(ServiceVarResource::class).setContextVar(
                SetContextVarData(
                    projectId = data.projectId,
                    pipelineId = data.pipelineId,
                    buildId = data.buildId,
                    contextName = "jobs.${data.jobId}.container.node_alias",
                    contextVal = detail.displayName,
                    readOnly = true,
                    rewriteReadOnly = true
                )
            )
        } catch (e: Exception) {
            logger.tagError("inQueue|${data.toLog()}|setContextVar|error", e)
        }
    }

    /** Stores the agent ip as both ip and name of the VM info of the build container. */
    private fun saveAgentInfoToBuildDetail(data: ThirdPartyAgentDispatchData, agent: ThirdPartyAgent) {
        client.get(ServiceBuildResource::class).saveBuildVmInfo(
            projectId = data.projectId,
            pipelineId = data.pipelineId,
            buildId = data.buildId,
            vmSeqId = data.vmSeqId,
            vmInfo = VmInfo(ip = agent.ip, name = agent.ip)
        )
    }

    companion object {
        /** Capacity assumed when the agent does not configure a parallel task count. */
        private const val DEFAULT_PARALLEL_TASK_COUNT = 4
        private val logger = LoggerFactory.getLogger(TPASingleQueueService::class.java)
    }
}
/**
 * Shared helpers for the third-party agent (TPA) dispatch queue: build-log printing, queue-time
 * reporting and factories for the retry and failure exceptions used by the queue.
 */
@Component
class TPACommonUtil @Autowired constructor(
    private val client: Client,
    private val commonConfig: CommonConfig,
    private val buildLogPrinter: BuildLogPrinter
) {
    /** Prints the I18n message of [messageCode] followed by [suffixMsg] as a normal build-log line. */
    fun logI18n(
        dispatchData: ThirdPartyAgentDispatchData,
        messageCode: String,
        param: Array<String>? = null,
        suffixMsg: String = ""
    ) {
        log(dispatchData, i18nMessage(messageCode, param) + suffixMsg)
    }

    /** Prints [logMessage] as a normal build-log line under the start-VM task of the job. */
    fun log(dispatchData: ThirdPartyAgentDispatchData, logMessage: String) {
        buildLogPrinter.addLine(
            buildId = dispatchData.buildId,
            message = logMessage,
            tag = VMUtils.genStartVMTaskId(dispatchData.vmSeqId),
            containerHashId = dispatchData.containerHashId,
            executeCount = dispatchData.executeCount ?: 1,
            jobId = dispatchData.jobId,
            stepId = VMUtils.genStartVMTaskId(dispatchData.vmSeqId)
        )
    }

    /** Prints the I18n message of [messageCode] followed by [suffixMsg] as a yellow build-log line. */
    fun logWarnI18n(
        dispatchData: ThirdPartyAgentDispatchData,
        messageCode: String,
        param: Array<String>? = null,
        suffixMsg: String = ""
    ) {
        logWarn(dispatchData, i18nMessage(messageCode, param) + suffixMsg)
    }

    /** Prints [logMessage] as a yellow build-log line. */
    fun logWarn(dispatchData: ThirdPartyAgentDispatchData, logMessage: String) {
        // NOTE(review): unlike the other log helpers this one passes stepId = null - confirm it is intended.
        buildLogPrinter.addYellowLine(
            buildId = dispatchData.buildId,
            message = logMessage,
            tag = VMUtils.genStartVMTaskId(dispatchData.vmSeqId),
            containerHashId = dispatchData.containerHashId,
            executeCount = dispatchData.executeCount ?: 1,
            jobId = dispatchData.jobId,
            stepId = null
        )
    }

    /** Prints [preMsg] + the I18n message of [messageCode] + [suffixMsg] as a debug build-log line. */
    fun logDebugI18n(
        dispatchData: ThirdPartyAgentDispatchData,
        messageCode: String,
        param: Array<String>? = null,
        preMsg: String = "",
        suffixMsg: String = ""
    ) {
        logDebug(dispatchData, preMsg + i18nMessage(messageCode, param) + suffixMsg)
    }

    /** Prints [message] as a debug build-log line under the start-VM task of the job. */
    fun logDebug(dispatchData: ThirdPartyAgentDispatchData, message: String) {
        buildLogPrinter.addDebugLine(
            buildId = dispatchData.buildId,
            message = message,
            tag = VMUtils.genStartVMTaskId(dispatchData.vmSeqId),
            containerHashId = dispatchData.containerHashId,
            executeCount = dispatchData.executeCount ?: 1,
            jobId = dispatchData.jobId,
            stepId = VMUtils.genStartVMTaskId(dispatchData.vmSeqId)
        )
    }

    /**
     * Prints the I18n message of [messageCode] followed by a link to the build [lockedBuildId] of
     * [pipelineId]. When the locking build is the current build only [linkTip] is appended.
     */
    fun logWithBuildUrl(
        data: ThirdPartyAgentDispatchData,
        messageCode: String,
        param: Array<String>? = null,
        pipelineId: String,
        lockedBuildId: String,
        linkTip: String
    ) {
        val host = HomeHostUtil.getHost(commonConfig.devopsHostGateway!!)
        val link = "$host/console/pipeline/${data.projectId}/$pipelineId/detail/$lockedBuildId"
        // NOTE(review): `link` was built but never used in this copy of the file (markup looks stripped);
        // the anchor below is a reconstruction - confirm the exact markup the log viewer expects.
        val msg = if (lockedBuildId != data.buildId) {
            "<a href='$link' target='_blank'>$linkTip</a> $lockedBuildId"
        } else {
            linkTip
        }

        logI18n(data, messageCode, param, suffixMsg = msg)
    }

    /**
     * Prints the I18n message of [messageCode] followed by a link to the node detail page of the agent.
     * Nothing is appended when [nodeHashId] is null or blank.
     */
    fun logWithAgentUrl(
        data: ThirdPartyAgentDispatchData,
        messageCode: String,
        param: Array<String>? = null,
        nodeHashId: String?,
        agentHashId: String?
    ) {
        val host = HomeHostUtil.getHost(commonConfig.devopsHostGateway!!)
        // The agent may belong to another project (cross-project environment).
        val projectId = if (data.dispatchType is ThirdPartyAgentEnvDispatchType) {
            data.dispatchType.envProjectId?.ifBlank { data.projectId } ?: data.projectId
        } else {
            data.projectId
        }
        val link = "$host/console/environment/$projectId/nodeDetail/$nodeHashId"
        // NOTE(review): same reconstruction as in logWithBuildUrl - confirm the anchor markup.
        val msg = if (nodeHashId.isNullOrBlank()) {
            ""
        } else {
            " <a href='$link' target='_blank'>$agentHashId</a>"
        }

        logI18n(data, messageCode, param, suffixMsg = msg)
    }

    /** Prints [message] as a red build-log line under the start-VM task of the job. */
    fun logError(
        data: ThirdPartyAgentDispatchData,
        message: String
    ) {
        buildLogPrinter.addRedLine(
            buildId = data.buildId,
            message = message,
            tag = VMUtils.genStartVMTaskId(data.vmSeqId),
            containerHashId = data.containerHashId,
            executeCount = data.executeCount ?: 1,
            jobId = data.jobId,
            stepId = VMUtils.genStartVMTaskId(data.vmSeqId)
        )
    }

    /**
     * Reports the queue start/end time of [data] to the engine; times are millisecond timestamps.
     * Currently a no-op, because the delegate's body is disabled.
     */
    fun updateQueueTime(data: ThirdPartyAgentDispatchData, createTime: Long?, endTime: Long?) {
        updateQueueTime(
            projectId = data.projectId,
            pipelineId = data.pipelineId,
            buildId = data.buildId,
            vmSeqId = data.vmSeqId,
            executeCount = data.executeCount ?: 1,
            createTime = createTime,
            endTime = endTime
        )
    }

    /** Same as the [ThirdPartyAgentDispatchData] overload, taking the ids from a startup [event]. */
    fun updateQueueTime(event: PipelineAgentStartupEvent, createTime: Long?, endTime: Long?) {
        updateQueueTime(
            projectId = event.projectId,
            pipelineId = event.pipelineId,
            buildId = event.buildId,
            vmSeqId = event.vmSeqId,
            executeCount = event.executeCount ?: 1,
            createTime = createTime,
            endTime = endTime
        )
    }

    /** Intended to write the queue start/end time of a container to the engine; disabled for now. */
    fun updateQueueTime(
        projectId: String,
        pipelineId: String,
        buildId: String,
        vmSeqId: String,
        executeCount: Int,
        createTime: Long?,
        endTime: Long?
    ) {
        // TODO: #9897 needs frontend support, so phase one does not write the elapsed time yet;
        //  enable this once the frontend can display the queueing time.
//        try {
//            client.get(ServiceBuildResource::class).updateContainerTimeout(
//                projectId = projectId,
//                pipelineId = pipelineId,
//                buildId = buildId,
//                containerId = vmSeqId,
//                executeCount = executeCount ?: 1,
//                timestamps = mapOf(
//                    BuildTimestampType.JOB_THIRD_PARTY_QUEUE to BuildRecordTimeStamp(createTime, endTime)
//                )
//            )
//        } catch (e: Throwable) {
//            logger.error("updateQueueTime|$projectId|$pipelineId|$buildId|$vmSeqId|$executeCount" +
//                "|$createTime|$endTime|error", e)
//        }
    }

    /** Resolves [messageCode] in the default locale language, formatted with [param]. */
    private fun i18nMessage(messageCode: String, param: Array<String>?): String =
        I18nUtil.getCodeLanMessage(
            messageCode = messageCode,
            language = I18nUtil.getDefaultLocaleLanguage(),
            params = param
        )

    companion object {
        /**
         * Signals that the queued record should be retried.
         *
         * Despite the declared return type this function always THROWS the [DispatchRetryMQException]
         * and never returns; the signature is kept so existing call sites keep compiling.
         */
        fun queueRetry(
            errorCode: ErrorCodeEnum,
            errMsg: String? = null,
            suffixMsg: String = ""
        ): DispatchRetryMQException {
            throw DispatchRetryMQException(
                errorCodeEnum = errorCode,
                errorMessage = (errMsg ?: errorCode.getErrorMessage()) + suffixMsg
            )
        }

        /**
         * Builds (does not throw) a [BuildFailureException] whose message is the I18n text of
         * [messageCode], or of the error code itself when [messageCode] is null.
         */
        fun queueFailureI18n(
            errorCode: ErrorCodeEnum,
            messageCode: String? = null,
            param: Array<String>? = null
        ): BuildFailureException {
            return queueFailure(
                errorCode,
                I18nUtil.getCodeLanMessage(
                    messageCode = messageCode ?: errorCode.errorCode.toString(),
                    language = I18nUtil.getDefaultLocaleLanguage(),
                    params = param
                )
            )
        }

        /** Builds (does not throw) a [BuildFailureException] for [errorCode] with message [errMsg]. */
        fun queueFailure(
            errorCode: ErrorCodeEnum,
            errMsg: String
        ): BuildFailureException {
            return BuildFailureException(
                errorType = errorCode.errorType,
                errorCode = errorCode.errorCode,
                formatErrorMessage = errorCode.formatErrorMessage,
                errorMessage = errMsg
            )
        }

        private const val TPA_QUEUE_LOG_TAG = "tpa_queue_log_tag"

        // Error logging prefixed with a fixed tag.
        fun Logger.tagError(msg: String) = this.error("$TPA_QUEUE_LOG_TAG$msg")
        fun Logger.tagError(msg: String, o: Any) = this.error("$TPA_QUEUE_LOG_TAG$msg", o)

        private val logger = LoggerFactory.getLogger(TPACommonUtil::class.java)
    }
}
/**
 * Redis lock for one third-party agent environment queue.
 * The lock key is built from the project id and the queue key; expiry and lock value come from the caller.
 */
class ThirdPartyAgentQueueEnvLock(
    redisOperation: RedisOperation,
    projectId: String,
    queueKey: String,
    expiredTimeInSeconds: Long,
    lockValue: String
) : RedisLock(
    redisOperation = redisOperation,
    lockKey = "DISPATCH_REDIS_QUEUE_LOCK_ENV_${projectId}_$queueKey",
    expiredTimeInSeconds = expiredTimeInSeconds,
    lockValue = lockValue
) {

    /**
     * Tries to take the lock, polling until it is obtained or [timeout] milliseconds have elapsed.
     *
     * @param timeout maximum time to keep trying, in milliseconds; values <= 0 mean a single attempt
     * @param interval pause between attempts, in milliseconds (never longer than [timeout])
     * @return true when the lock was obtained
     */
    fun tryLock(timeout: Long = 1000, interval: Long = 100): Boolean {
        val pauseMillis = min(interval, timeout) // never sleep longer than the timeout
        val startedAt = System.currentTimeMillis()
        if (tryLock()) {
            return true
        }
        if (timeout <= 0) {
            return false
        }
        while (timeout > (System.currentTimeMillis() - startedAt)) {
            Thread.sleep(pauseMillis)
            if (tryLock()) {
                return true
            }
        }
        return false
    }
}
/**
 * Adds elapsed-time records to a container of a build.
 *
 * @param projectId project id (path parameter)
 * @param pipelineId pipeline id (path parameter)
 * @param buildId build id (path parameter)
 * @param containerId container id, i.e. the vmSeqId (query parameter)
 * @param executeCount execution count (query parameter)
 * @param timestamps time stamps to write, keyed by their [BuildTimestampType] (request body)
 */
// NOTE(review): executeCount is documented with `required = false` but validated with
// `@BkField(required = true)` - confirm which one is intended.
@Operation(summary = "添加构建的容器耗时")
@POST
@Path("projects/{projectId}/pipelines/{pipelineId}/builds/{buildId}/updateContainerTimeout")
fun updateContainerTimeout(
    @Parameter(description = "项目ID", required = true)
    @BkField(required = true)
    @PathParam("projectId")
    projectId: String,
    @Parameter(description = "流水线ID", required = true)
    @PathParam("pipelineId")
    @BkField(required = true)
    pipelineId: String,
    @Parameter(description = "构建ID", required = true)
    @PathParam("buildId")
    @BkField(required = true)
    buildId: String,
    @Parameter(description = "containerId/vmSeqId")
    @QueryParam("containerId")
    containerId: String,
    @Parameter(description = "执行次数", required = false)
    @QueryParam("executeCount")
    @BkField(required = true)
    executeCount: Int,
    @Parameter(description = "要写入的耗时", required = true)
    @BkField(required = true)
    timestamps: Map<BuildTimestampType, BuildRecordTimeStamp>
)
= null diff --git a/src/backend/ci/core/process/biz-base/src/main/kotlin/com/tencent/devops/process/engine/pojo/AgentReuseMutexTree.kt b/src/backend/ci/core/process/biz-base/src/main/kotlin/com/tencent/devops/process/engine/pojo/AgentReuseMutexTree.kt index d9c21e6fe2d..910c18a53d7 100644 --- a/src/backend/ci/core/process/biz-base/src/main/kotlin/com/tencent/devops/process/engine/pojo/AgentReuseMutexTree.kt +++ b/src/backend/ci/core/process/biz-base/src/main/kotlin/com/tencent/devops/process/engine/pojo/AgentReuseMutexTree.kt @@ -40,6 +40,7 @@ data class AgentReuseMutexTree( return addNode( jobId = container.jobId, dispatchType = dispatchType, + // 逻辑上可能需要dependOn复用树 existDep = (container.jobControlOption?.dependOnId?.contains(reuseId) == true) || (container.jobControlOption?.dependOnName == reuseId), stageIndex = stageIndex, diff --git a/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/atom/vm/DispatchVMStartupTaskAtom.kt b/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/atom/vm/DispatchVMStartupTaskAtom.kt index 7b5860440b5..ac685175ab8 100644 --- a/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/atom/vm/DispatchVMStartupTaskAtom.kt +++ b/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/atom/vm/DispatchVMStartupTaskAtom.kt @@ -31,7 +31,6 @@ import com.tencent.devops.common.api.check.Preconditions import com.tencent.devops.common.api.constant.CommonMessageCode.BK_ENV_NOT_YET_SUPPORTED import com.tencent.devops.common.api.pojo.ErrorCode import com.tencent.devops.common.api.pojo.ErrorType -import com.tencent.devops.common.api.pojo.Zone import com.tencent.devops.common.api.util.EnvUtils import com.tencent.devops.common.api.util.JsonUtil import com.tencent.devops.common.api.util.MessageUtil @@ -66,7 +65,6 @@ import com.tencent.devops.process.engine.service.detail.ContainerBuildDetailServ import 
com.tencent.devops.process.engine.service.record.ContainerBuildRecordService import com.tencent.devops.process.pojo.mq.PipelineAgentShutdownEvent import com.tencent.devops.process.pojo.mq.PipelineAgentStartupEvent -import com.tencent.devops.process.service.BuildVariableService import com.tencent.devops.process.service.PipelineAsCodeService import com.tencent.devops.process.service.PipelineContextService import com.tencent.devops.store.api.container.ServiceContainerAppResource @@ -81,7 +79,7 @@ import java.util.concurrent.TimeUnit * * @version 1.0 */ -@Suppress("UNUSED", "LongParameterList") +@Suppress("LongParameterList", "LongMethod", "MagicNumber") @Component @Scope(ConfigurableBeanFactory.SCOPE_PROTOTYPE) class DispatchVMStartupTaskAtom @Autowired constructor( @@ -90,7 +88,6 @@ class DispatchVMStartupTaskAtom @Autowired constructor( private val containerBuildDetailService: ContainerBuildDetailService, private val containerBuildRecordService: ContainerBuildRecordService, private val pipelineRuntimeService: PipelineRuntimeService, - private val buildVariableService: BuildVariableService, private val pipelineEventDispatcher: PipelineEventDispatcher, private val buildLogPrinter: BuildLogPrinter, private val dispatchTypeBuilder: DispatchTypeBuilder, @@ -337,14 +334,6 @@ class DispatchVMStartupTaskAtom @Autowired constructor( return true } - private fun getBuildZone(container: Container): Zone? 
{ - return when { - container !is VMBuildContainer -> null - container.enableExternal == true -> Zone.EXTERNAL - else -> null - } - } - override fun tryFinish( task: PipelineBuildTask, param: VMBuildContainer, @@ -401,9 +390,12 @@ class DispatchVMStartupTaskAtom @Autowired constructor( ignoreEnvAgentIds = retryThirdAgentEnv.split(",").filter { it.isNotBlank() }.toSet() ) } - // 发送后就将参数置空防止下次重复发送事件 - thirdPartyAgentMonitorPrint(task) + try { + thirdPartyAgentMonitorPrint(task) + } catch (ignore: Exception) { + // 忽略掉因调用打印接口出错而导致调度失败的问题 + } } AtomResponse( diff --git a/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/control/BuildEndControl.kt b/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/control/BuildEndControl.kt index d652f8e2265..4ea900e83c4 100644 --- a/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/control/BuildEndControl.kt +++ b/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/control/BuildEndControl.kt @@ -217,6 +217,8 @@ class BuildEndControl @Autowired constructor( lockValue = buildId, expiredTimeInSeconds = AgentReuseMutex.AGENT_LOCK_TIMEOUT ).unlock() + // 解锁的同时兜底删除 linkTip + redisOperation.delete(AgentReuseMutex.genAgentReuseMutexLinkTipKey(buildId)) val queueKey = AgentReuseMutex.genAgentReuseMutexQueueKey(projectId, agentId) redisOperation.hdelete(queueKey, buildId) } diff --git a/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/control/HeartbeatControl.kt b/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/control/HeartbeatControl.kt index 54725a4879b..3317f088e22 100644 --- a/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/control/HeartbeatControl.kt +++ 
b/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/control/HeartbeatControl.kt @@ -42,18 +42,16 @@ import com.tencent.devops.process.engine.pojo.event.PipelineBuildContainerEvent import com.tencent.devops.process.engine.pojo.event.PipelineContainerAgentHeartBeatEvent import com.tencent.devops.process.engine.service.PipelineContainerService import com.tencent.devops.process.engine.service.PipelineRuntimeService -import com.tencent.devops.process.engine.service.PipelineTaskService -import java.util.concurrent.TimeUnit import org.slf4j.LoggerFactory import org.springframework.beans.factory.annotation.Autowired import org.springframework.stereotype.Service +import java.util.concurrent.TimeUnit @Service class HeartbeatControl @Autowired constructor( private val buildLogPrinter: BuildLogPrinter, private val redisOperation: RedisOperation, private val pipelineEventDispatcher: PipelineEventDispatcher, - private val pipelineTaskService: PipelineTaskService, private val pipelineContainerService: PipelineContainerService, private val pipelineRuntimeService: PipelineRuntimeService ) { @@ -114,7 +112,6 @@ class HeartbeatControl @Autowired constructor( "executeCount(${event.executeCount} != ${container.executeCount})") return } - var found = false // # 5806 完善构建进程超时提示信息 val tipMessage = I18nUtil.getCodeLanMessage( @@ -122,37 +119,16 @@ class HeartbeatControl @Autowired constructor( params = arrayOf("${TimeUnit.MILLISECONDS.toSeconds(elapse)}") ) - // #2365 在运行中的插件中记录心跳超时信息 - val runningTask = pipelineTaskService.getRunningTask(container.projectId, container.buildId) - runningTask.forEach { taskMap -> - if (container.containerId == taskMap["containerId"] && taskMap["taskId"] != null) { - found = true - val executeCount = taskMap["executeCount"]?.toString()?.toInt() ?: 1 - val stepId = taskMap["stepId"]?.toString() ?: "" - buildLogPrinter.addRedLine( - buildId = container.buildId, - message = tipMessage, - tag = taskMap["taskId"].toString(), 
- containerHashId = container.containerHashId, - executeCount = executeCount, - jobId = null, - stepId = stepId - ) - } - } - - if (!found) { - // #2365 在Set Up Job位置记录心跳超时信息 - buildLogPrinter.addRedLine( - buildId = container.buildId, - message = tipMessage, - tag = VMUtils.genStartVMTaskId(container.containerId), - containerHashId = container.containerHashId, - executeCount = container.executeCount, - jobId = null, - stepId = VMUtils.genStartVMTaskId(container.containerId) - ) - } + // #2365 在Set Up Job位置记录心跳超时信息 + buildLogPrinter.addRedLine( + buildId = container.buildId, + message = tipMessage, + tag = VMUtils.genStartVMTaskId(container.containerId), + containerHashId = container.containerHashId, + executeCount = container.executeCount, + jobId = null, + stepId = VMUtils.genStartVMTaskId(container.containerId) + ) // 终止当前容器下的任务 pipelineEventDispatcher.dispatch( PipelineBuildContainerEvent( diff --git a/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/control/command/stage/impl/StartContainerStageCmd.kt b/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/control/command/stage/impl/StartContainerStageCmd.kt index fbdef7c056f..ecd9a680b1e 100644 --- a/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/control/command/stage/impl/StartContainerStageCmd.kt +++ b/src/backend/ci/core/process/biz-engine/src/main/kotlin/com/tencent/devops/process/engine/control/command/stage/impl/StartContainerStageCmd.kt @@ -249,6 +249,8 @@ class StartContainerStageCmd( expiredTimeInSeconds = AgentReuseMutex.AGENT_LOCK_TIMEOUT ) lock.unlock() + // 解锁的同时兜底删除 linkTip + redisOperation.delete(AgentReuseMutex.genAgentReuseMutexLinkTipKey(stage.buildId)) } } } diff --git a/src/backend/ci/core/process/biz-process/src/main/kotlin/com/tencent/devops/process/api/ServiceBuildResourceImpl.kt 
b/src/backend/ci/core/process/biz-process/src/main/kotlin/com/tencent/devops/process/api/ServiceBuildResourceImpl.kt index 83c96b73c46..06ccb7a04a6 100644 --- a/src/backend/ci/core/process/biz-process/src/main/kotlin/com/tencent/devops/process/api/ServiceBuildResourceImpl.kt +++ b/src/backend/ci/core/process/biz-process/src/main/kotlin/com/tencent/devops/process/api/ServiceBuildResourceImpl.kt @@ -34,15 +34,18 @@ import com.tencent.devops.common.api.pojo.ErrorType import com.tencent.devops.common.api.pojo.Result import com.tencent.devops.common.api.pojo.SimpleResult import com.tencent.devops.common.auth.api.ActionId +import com.tencent.devops.common.pipeline.enums.BuildRecordTimeStamp import com.tencent.devops.common.pipeline.enums.BuildStatus import com.tencent.devops.common.pipeline.enums.ChannelCode import com.tencent.devops.common.pipeline.enums.StartType import com.tencent.devops.common.pipeline.pojo.BuildFormProperty import com.tencent.devops.common.pipeline.pojo.BuildFormValue import com.tencent.devops.common.pipeline.pojo.StageReviewRequest +import com.tencent.devops.common.pipeline.pojo.time.BuildTimestampType import com.tencent.devops.common.web.RestResource import com.tencent.devops.process.api.service.ServiceBuildResource import com.tencent.devops.process.engine.service.PipelineRuntimeService +import com.tencent.devops.process.engine.service.record.ContainerBuildRecordService import com.tencent.devops.process.engine.service.vmbuild.EngineVMBuildService import com.tencent.devops.process.pojo.BuildBasicInfo import com.tencent.devops.process.pojo.BuildHistory @@ -70,7 +73,8 @@ class ServiceBuildResourceImpl @Autowired constructor( private val pipelineBuildFacadeService: PipelineBuildFacadeService, private val engineVMBuildService: EngineVMBuildService, private val pipelinePauseBuildFacadeService: PipelinePauseBuildFacadeService, - private val pipelineRuntimeService: PipelineRuntimeService + private val pipelineRuntimeService: PipelineRuntimeService, + 
private val containerBuildRecordService: ContainerBuildRecordService ) : ServiceBuildResource { override fun getPipelineIdFromBuildId(projectId: String, buildId: String): Result { if (buildId.isBlank()) { @@ -822,6 +826,26 @@ class ServiceBuildResourceImpl @Autowired constructor( ) } + override fun updateContainerTimeout( + projectId: String, + pipelineId: String, + buildId: String, + containerId: String, + executeCount: Int, + timestamps: Map + ) { + containerBuildRecordService.updateContainerRecord( + projectId = projectId, + pipelineId = pipelineId, + buildId = buildId, + containerId = containerId, + executeCount = executeCount, + containerVar = emptyMap(), + buildStatus = null, + timestamps = timestamps + ) + } + private fun checkParam(projectId: String, pipelineId: String) { if (pipelineId.isBlank()) { throw ParamBlankException("Invalid pipelineId") diff --git a/support-files/i18n/dispatch/message_en_US.properties b/support-files/i18n/dispatch/message_en_US.properties index 24caf3c1986..ff10f77fcb3 100644 --- a/support-files/i18n/dispatch/message_en_US.properties +++ b/support-files/i18n/dispatch/message_en_US.properties @@ -202,4 +202,7 @@ agentReuseMuteXRedispatch=AgentReuseMutex, agent {0} has been used by {1} to bui agentReuseMuteXWaitReusedEnv=AgentReuseMutex, wait for the dependent node {0} to be scheduled to a specific node before performing reuse scheduling. bkEnvNodeDisable=Environment node {0} has been disabled and is not scheduled for builds bkThirdJobEnvCurr=Total concurrency limit on all nodes, in the current environment, the concurrency of all build machines {0} has exceeded the configured {1} and has been queued for {2} minutes. -bkThirdJobNodeCurr=The concurrency limit on a single node, In the current environment, the running tasks of each node exceed the configured {0} and are queued for {1} minutes. 
\ No newline at end of file +bkThirdJobNodeCurr=The concurrency limit on a single node, In the current environment, the running tasks of each node exceed the configured {0} and are queued for {1} minutes. +agentReuseMutexReserveRedispatch=AgentReuseMutex, agent {0} has been used by {1} to build, no space to use, reschedule +bkEnvDispatchAgent=Build environment scheduling completed, Node {0} has been selected +tryAgentDispatch=Try to send the build to node {0} \ No newline at end of file diff --git a/support-files/i18n/dispatch/message_zh_CN.properties b/support-files/i18n/dispatch/message_zh_CN.properties index e395a005674..26ef291bbd0 100644 --- a/support-files/i18n/dispatch/message_zh_CN.properties +++ b/support-files/i18n/dispatch/message_zh_CN.properties @@ -202,4 +202,7 @@ taskStatus.unknown=未知状态 taskStatus.waiting=任务初始化 agentReuseMuteXRedispatch=构建机复用互斥,节点 {0} 已被 {1} 构建使用,重新调度 agentReuseMuteXWaitReusedEnv=构建机复用互斥,等待被依赖的节点 {0} 调度到具体节点后再进行复用调度 -bkEnvNodeDisable=环境节点 {0} 已被禁用,不进行构建调度 \ No newline at end of file +bkEnvNodeDisable=环境节点 {0} 已被禁用,不进行构建调度 +agentReuseMutexReserveRedispatch=构建机复用互斥,节点 {0} 已被 {1} 构建使用,剩余可调度空间不足,重新调度 +bkEnvDispatchAgent=构建环境调度结束,已选取节点 {0} +tryAgentDispatch=尝试下发任务至节点 {0} \ No newline at end of file diff --git a/support-files/sql/1001_ci_dispatch_ddl_mysql.sql b/support-files/sql/1001_ci_dispatch_ddl_mysql.sql index 49440a6a7b3..2a5f7b9c4aa 100644 --- a/support-files/sql/1001_ci_dispatch_ddl_mysql.sql +++ b/support-files/sql/1001_ci_dispatch_ddl_mysql.sql @@ -410,4 +410,22 @@ CREATE TABLE IF NOT EXISTS `T_DISPATCH_QUOTA_JOB_SYSTEM` UNIQUE INDEX `UNI_KEY` (`VM_TYPE`, `CHANNEL_CODE`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4 COMMENT '流水线JOB配额系统表'; +CREATE TABLE IF NOT EXISTS `T_DISPATCH_THIRDPARTY_AGENT_QUEUE` +( + `ID` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主键ID', + `PROJECT_ID` varchar(64) NOT NULL COMMENT '项目ID', + `PIPELINE_ID` varchar(34) NOT NULL COMMENT '流水线ID', + `BUILD_ID` varchar(34) NOT NULL COMMENT '构建ID', + `VM_SEQ_ID` 
varchar(34) NOT NULL COMMENT '构建序列号', + `DATA` varchar(256) NOT NULL COMMENT '排队资源,随着类型不同而不同', + `DATA_TYPE` varchar(64) NOT NULL COMMENT '排队资源类型', + `INFO` json NOT NULL COMMENT '额外的事件信息', + `RETRY_TIME` int(11) NOT NULL COMMENT '重试次数', + `CREATED_TIME` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', + `UPDATE_TIME` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '更新时间', + PRIMARY KEY (`ID`), + KEY `IDX_PROJECT_DATA` (`PROJECT_ID`,`PIPELINE_ID`,`DATA`,`DATA_TYPE`,`CREATED_TIME`) USING BTREE, + KEY `IDX_BUILD_VMSEQ` (`BUILD_ID`,`VM_SEQ_ID`) USING BTREE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '第三方构建机排队表'; + SET FOREIGN_KEY_CHECKS = 1;