From b5fe3c96f6be515493c539210e3e492a76556dac Mon Sep 17 00:00:00 2001 From: dengqichen Date: Mon, 8 Dec 2025 15:53:43 +0800 Subject: [PATCH] =?UTF-8?q?=E6=95=B4=E7=90=86=E4=B8=8B=E5=88=9D=E5=A7=8B?= =?UTF-8?q?=E5=8C=96=E6=95=B0=E6=8D=AE=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../backend/deploy/entity/ServerAlertLog.java | 6 +- .../repository/IServerAlertLogRepository.java | 25 +- .../service/impl/ServerAlertServiceImpl.java | 249 ++++++++++++++++-- .../db/changelog/changes/v1.0.0-data.sql | 2 +- 4 files changed, 254 insertions(+), 28 deletions(-) diff --git a/backend/src/main/java/com/qqchen/deploy/backend/deploy/entity/ServerAlertLog.java b/backend/src/main/java/com/qqchen/deploy/backend/deploy/entity/ServerAlertLog.java index 342aeb21..8e17446d 100644 --- a/backend/src/main/java/com/qqchen/deploy/backend/deploy/entity/ServerAlertLog.java +++ b/backend/src/main/java/com/qqchen/deploy/backend/deploy/entity/ServerAlertLog.java @@ -2,6 +2,7 @@ package com.qqchen.deploy.backend.deploy.entity; import com.qqchen.deploy.backend.framework.enums.MonitorAlertLevelEnum; import com.qqchen.deploy.backend.framework.enums.MonitorMetricEnum; +import com.qqchen.deploy.backend.framework.enums.ServerAlertStatusEnum; import jakarta.persistence.*; import lombok.AllArgsConstructor; import lombok.Builder; @@ -71,10 +72,11 @@ public class ServerAlertLog { private String alertMessage; /** - * 状态: ACTIVE/RESOLVED + * 状态: PENDING/ACTIVE/RESOLVED */ + @Enumerated(EnumType.STRING) @Column(name = "status", length = 20) - private String status = "ACTIVE"; + private ServerAlertStatusEnum status; /** * 告警时间 diff --git a/backend/src/main/java/com/qqchen/deploy/backend/deploy/repository/IServerAlertLogRepository.java b/backend/src/main/java/com/qqchen/deploy/backend/deploy/repository/IServerAlertLogRepository.java index 2313fa96..04eaa9e6 100644 --- a/backend/src/main/java/com/qqchen/deploy/backend/deploy/repository/IServerAlertLogRepository.java +++ b/backend/src/main/java/com/qqchen/deploy/backend/deploy/repository/IServerAlertLogRepository.java @@ -1,11 +1,16 @@ package com.qqchen.deploy.backend.deploy.repository; import com.qqchen.deploy.backend.deploy.entity.ServerAlertLog; +import com.qqchen.deploy.backend.framework.enums.MonitorMetricEnum; +import com.qqchen.deploy.backend.framework.enums.ServerAlertStatusEnum; import org.springframework.data.jpa.repository.JpaRepository; +import org.springframework.data.jpa.repository.Query; +import org.springframework.data.repository.query.Param; import org.springframework.stereotype.Repository; import java.time.LocalDateTime; import java.util.List; +import java.util.Optional; /** * 服务器告警记录Repository @@ -14,13 +19,29 @@ import java.util.List; public interface IServerAlertLogRepository extends JpaRepository { /** - * 查询指定服务器的活跃告警 + * 查询指定服务器的指定状态告警 */ - List findByServerIdAndStatus(Long serverId, String status); + List findByServerIdAndStatus(Long serverId, ServerAlertStatusEnum status); /** * 查询指定服务器在指定时间范围内的告警记录 */ List findByServerIdAndAlertTimeBetweenOrderByAlertTimeDesc( Long serverId, LocalDateTime startTime, LocalDateTime endTime); + + /** + * 查询指定服务器、规则、告警类型的活跃或待处理告警 + * 用于判断是否已存在未解决的告警 + */ + @Query("SELECT a FROM ServerAlertLog a WHERE a.serverId = :serverId " + + "AND a.ruleId = :ruleId " + + "AND a.alertType = :alertType " + + "AND a.status IN (com.qqchen.deploy.backend.framework.enums.ServerAlertStatusEnum.PENDING, " + + " com.qqchen.deploy.backend.framework.enums.ServerAlertStatusEnum.ACTIVE) " + + "ORDER BY a.alertTime DESC") + Optional findActiveOrPendingAlert( + @Param("serverId") Long serverId, + @Param("ruleId") Long ruleId, + @Param("alertType") MonitorMetricEnum alertType + ); } diff --git a/backend/src/main/java/com/qqchen/deploy/backend/deploy/service/impl/ServerAlertServiceImpl.java b/backend/src/main/java/com/qqchen/deploy/backend/deploy/service/impl/ServerAlertServiceImpl.java index 1b4b89d9..2517ca20 100644 --- a/backend/src/main/java/com/qqchen/deploy/backend/deploy/service/impl/ServerAlertServiceImpl.java +++ b/backend/src/main/java/com/qqchen/deploy/backend/deploy/service/impl/ServerAlertServiceImpl.java @@ -7,6 +7,7 @@ import com.qqchen.deploy.backend.deploy.entity.ServerAlertLog; import com.qqchen.deploy.backend.deploy.entity.ServerAlertRule; import com.qqchen.deploy.backend.framework.enums.MonitorAlertLevelEnum; import com.qqchen.deploy.backend.framework.enums.MonitorMetricEnum; +import com.qqchen.deploy.backend.framework.enums.ServerAlertStatusEnum; import com.qqchen.deploy.backend.deploy.repository.IServerAlertLogRepository; import com.qqchen.deploy.backend.deploy.repository.IServerAlertRuleRepository; import com.qqchen.deploy.backend.deploy.repository.IServerRepository; @@ -23,6 +24,7 @@ import java.time.LocalDateTime; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Optional; /** * 服务器告警服务实现 @@ -109,9 +111,18 @@ public class ServerAlertServiceImpl implements IServerAlertService { threshold = rule.getWarningThreshold(); } - // 触发告警 + // 查询是否已有活跃或待处理的告警 + Optional existingAlertOpt = alertLogRepository.findActiveOrPendingAlert( + serverId, rule.getId(), rule.getAlertType()); + if (alertLevel != null) { - triggerAlert(serverId, rule, alertLevel, currentValue, threshold, resourceInfo, config); + // 当前超过阈值 + handleAlertTriggered(existingAlertOpt, serverId, rule, alertLevel, currentValue, threshold, resourceInfo, config); + } else { + // 当前值正常,检查是否需要解除告警 + if (existingAlertOpt.isPresent()) { + resolveAlert(existingAlertOpt.get()); + } } } @@ -158,21 +169,109 @@ public class ServerAlertServiceImpl implements IServerAlertService { threshold = rule.getWarningThreshold(); } - // 触发告警 + // 查询是否已有活跃或待处理的告警 + Optional existingAlertOpt = alertLogRepository.findActiveOrPendingAlert( + serverId, rule.getId(), rule.getAlertType()); + if (alertLevel != null) { + // 当前超过阈值 String resourceInfo = String.format("总磁盘(%d个分区,总容量%.0fGB,已用%.0fGB)", diskUsageList.size(), totalCapacity.doubleValue(), totalUsed.doubleValue()); - triggerAlert(serverId, rule, alertLevel, totalUsagePercent, threshold, resourceInfo, config); + handleAlertTriggered(existingAlertOpt, serverId, rule, alertLevel, totalUsagePercent, threshold, resourceInfo, config); + } else { + // 当前值正常,检查是否需要解除告警 + if (existingAlertOpt.isPresent()) { + resolveAlert(existingAlertOpt.get()); + } } } /** - * 触发告警 + * 处理告警触发逻辑 */ - private void triggerAlert(Long serverId, ServerAlertRule rule, MonitorAlertLevelEnum level, - BigDecimal currentValue, BigDecimal threshold, String resourceInfo, - ServerMonitorNotificationConfig config) { - // 1. 记录告警日志到数据库 + private void handleAlertTriggered(Optional existingAlertOpt, Long serverId, + ServerAlertRule rule, MonitorAlertLevelEnum level, + BigDecimal currentValue, BigDecimal threshold, + String resourceInfo, ServerMonitorNotificationConfig config) { + if (!existingAlertOpt.isPresent()) { + // 首次检测到超过阈值 + if (rule.getDurationMinutes() == null || rule.getDurationMinutes() == 0) { + // 立即告警 + createActiveAlertAndNotify(serverId, rule, level, currentValue, threshold, resourceInfo, config); + } else { + // 创建 PENDING 状态,等待持续时间验证 + createPendingAlert(serverId, rule, level, currentValue, threshold, resourceInfo); + } + } else { + ServerAlertLog existingAlert = existingAlertOpt.get(); + + if (ServerAlertStatusEnum.PENDING.equals(existingAlert.getStatus())) { + // 已有 PENDING 告警,检查是否达到持续时间 + long durationMinutes = java.time.Duration.between( + existingAlert.getAlertTime(), LocalDateTime.now()).toMinutes(); + + if (durationMinutes >= rule.getDurationMinutes()) { + // 达到持续时间,激活并发送通知 + activateAndNotify(existingAlert, rule, config); + } else { + // 未达到持续时间,更新当前值(如果级别变化) + if (existingAlert.getAlertLevel() != level) { + updateAlertLevel(existingAlert, level, currentValue, threshold, resourceInfo); + } + log.debug("⏳ 告警持续时间不足: serverId={}, 已持续{}分钟/需要{}分钟", + serverId, durationMinutes, rule.getDurationMinutes()); + } + } else if (ServerAlertStatusEnum.ACTIVE.equals(existingAlert.getStatus())) { + // 已有 ACTIVE 告警 + if (existingAlert.getAlertLevel() != level && + level.ordinal() > existingAlert.getAlertLevel().ordinal()) { + // 级别升级(例如从 WARNING 升级到 CRITICAL),重新发送通知 + updateAlertLevelAndNotify(existingAlert, rule, level, currentValue, threshold, resourceInfo, config); + } else if (existingAlert.getAlertLevel() != level) { + // 级别变化但未升级,只更新数据不发通知 + updateAlertLevel(existingAlert, level, currentValue, threshold, resourceInfo); + } + // 否则不重复发送通知 + } + } + } + + /** + * 创建 PENDING 状态的告警(需要等待持续时间验证) + */ + private void createPendingAlert(Long serverId, ServerAlertRule rule, MonitorAlertLevelEnum level, + BigDecimal currentValue, BigDecimal threshold, String resourceInfo) { + String alertMessage = String.format("%s使用率达到%s级别: 当前值=%.2f%%, 阈值=%.2f%% (等待持续%d分钟验证)", + resourceInfo, level.getDescription(), currentValue, threshold, rule.getDurationMinutes()); + + ServerAlertLog alertLog = ServerAlertLog.builder() + .serverId(serverId) + .ruleId(rule.getId()) + .alertType(rule.getAlertType()) + .alertLevel(level) + .alertValue(currentValue) + .thresholdValue(threshold) + .alertMessage(alertMessage) + .status(ServerAlertStatusEnum.PENDING) + .alertTime(LocalDateTime.now()) + .notified(false) + .build(); + + try { + alertLogRepository.save(alertLog); + log.info("📝 PENDING 告警已创建: id={}, serverId={}, 需持续{}分钟", + alertLog.getId(), serverId, rule.getDurationMinutes()); + } catch (Exception e) { + log.error("创建 PENDING 告警失败", e); + } + } + + /** + * 创建 ACTIVE 状态的告警并立即发送通知 + */ + private void createActiveAlertAndNotify(Long serverId, ServerAlertRule rule, MonitorAlertLevelEnum level, + BigDecimal currentValue, BigDecimal threshold, + String resourceInfo, ServerMonitorNotificationConfig config) { String alertMessage = String.format("%s使用率达到%s级别: 当前值=%.2f%%, 阈值=%.2f%%", resourceInfo, level.getDescription(), currentValue, threshold); @@ -184,32 +283,136 @@ public class ServerAlertServiceImpl implements IServerAlertService { .alertValue(currentValue) .thresholdValue(threshold) .alertMessage(alertMessage) - .status("ACTIVE") + .status(ServerAlertStatusEnum.ACTIVE) .alertTime(LocalDateTime.now()) .notified(false) .build(); try { alertLogRepository.save(alertLog); - log.info("✅ 告警记录已保存: id={}, serverId={}, message={}", + log.warn("🚨 告警已触发: id={}, serverId={}, message={}", alertLog.getId(), serverId, alertMessage); } catch (Exception e) { log.error("保存告警记录失败", e); + return; } - // 2. 记录日志 - log.warn("⚠️ 服务器告警触发: serverId={}, ruleName={}, type={}, level={}, resource={}, " + - "current={}%, threshold={}%", - serverId, rule.getRuleName(), rule.getAlertType(), level, - resourceInfo, currentValue, threshold); + // 发送通知 + sendNotificationAndUpdateStatus(alertLog, rule, resourceInfo, config); + } + + /** + * 激活 PENDING 告警并发送通知 + */ + private void activateAndNotify(ServerAlertLog alertLog, ServerAlertRule rule, + ServerMonitorNotificationConfig config) { + alertLog.setStatus(ServerAlertStatusEnum.ACTIVE); + String resourceInfo = getResourceInfo(rule.getAlertType()); + alertLog.setAlertMessage(String.format("%s使用率达到%s级别: 当前值=%.2f%%, 阈值=%.2f%% (已持续%d分钟)", + resourceInfo, alertLog.getAlertLevel().getDescription(), + alertLog.getAlertValue(), alertLog.getThresholdValue(), rule.getDurationMinutes())); - // 3. 发送告警通知 - if (config != null && config.getNotificationChannelId() != null && config.getResourceAlertTemplateId() != null) { - try { - sendAlertNotification(serverId, rule, level, currentValue, threshold, resourceInfo, config); - } catch (Exception e) { - log.error("发送告警通知失败: serverId={}, error={}", serverId, e.getMessage(), e); - } + try { + alertLogRepository.save(alertLog); + log.warn("🚨 告警已激活: id={}, serverId={}, 已持续{}分钟", + alertLog.getId(), alertLog.getServerId(), rule.getDurationMinutes()); + } catch (Exception e) { + log.error("激活告警失败", e); + return; + } + + // 发送通知 + sendNotificationAndUpdateStatus(alertLog, rule, resourceInfo, config); + } + + /** + * 更新告警级别 + */ + private void updateAlertLevel(ServerAlertLog alertLog, MonitorAlertLevelEnum newLevel, + BigDecimal currentValue, BigDecimal threshold, String resourceInfo) { + alertLog.setAlertLevel(newLevel); + alertLog.setAlertValue(currentValue); + alertLog.setThresholdValue(threshold); + alertLog.setAlertMessage(String.format("%s使用率达到%s级别: 当前值=%.2f%%, 阈值=%.2f%%", + resourceInfo, newLevel.getDescription(), currentValue, threshold)); + + try { + alertLogRepository.save(alertLog); + log.info("📊 告警级别已更新: id={}, 新级别={}", alertLog.getId(), newLevel); + } catch (Exception e) { + log.error("更新告警级别失败", e); + } + } + + /** + * 更新告警级别并重新发送通知(级别升级时) + */ + private void updateAlertLevelAndNotify(ServerAlertLog alertLog, ServerAlertRule rule, + MonitorAlertLevelEnum newLevel, BigDecimal currentValue, + BigDecimal threshold, String resourceInfo, + ServerMonitorNotificationConfig config) { + MonitorAlertLevelEnum oldLevel = alertLog.getAlertLevel(); + updateAlertLevel(alertLog, newLevel, currentValue, threshold, resourceInfo); + + log.warn("⬆️ 告警级别升级: id={}, {} → {}, 重新发送通知", + alertLog.getId(), oldLevel, newLevel); + + // 重新发送通知 + sendNotificationAndUpdateStatus(alertLog, rule, resourceInfo, config); + } + + /** + * 解除告警 + */ + private void resolveAlert(ServerAlertLog alertLog) { + alertLog.setStatus(ServerAlertStatusEnum.RESOLVED); + alertLog.setResolveTime(LocalDateTime.now()); + + try { + alertLogRepository.save(alertLog); + log.info("✅ 告警已解除: id={}, serverId={}", alertLog.getId(), alertLog.getServerId()); + } catch (Exception e) { + log.error("解除告警失败", e); + } + } + + /** + * 发送通知并更新状态 + */ + private void sendNotificationAndUpdateStatus(ServerAlertLog alertLog, ServerAlertRule rule, + String resourceInfo, ServerMonitorNotificationConfig config) { + if (config == null || config.getNotificationChannelId() == null || + config.getResourceAlertTemplateId() == null) { + log.debug("通知配置不完整,跳过发送通知"); + return; + } + + try { + sendAlertNotification(alertLog.getServerId(), rule, alertLog.getAlertLevel(), + alertLog.getAlertValue(), alertLog.getThresholdValue(), resourceInfo, config); + + // 发送成功后,更新数据库通知状态 + alertLog.setNotified(true); + alertLog.setNotifyTime(LocalDateTime.now()); + alertLogRepository.save(alertLog); + + log.info("✅ 告警通知已发送: alertLogId={}, serverId={}", + alertLog.getId(), alertLog.getServerId()); + } catch (Exception e) { + log.error("发送告警通知失败: serverId={}, error={}", + alertLog.getServerId(), e.getMessage(), e); + } + } + + /** + * 获取资源信息描述 + */ + private String getResourceInfo(MonitorMetricEnum alertType) { + switch (alertType) { + case CPU: return "CPU"; + case MEMORY: return "内存"; + case DISK: return "磁盘"; + default: return "未知"; } } diff --git a/backend/src/main/resources/db/changelog/changes/v1.0.0-data.sql b/backend/src/main/resources/db/changelog/changes/v1.0.0-data.sql index 4dfcbaab..9994f796 100644 --- a/backend/src/main/resources/db/changelog/changes/v1.0.0-data.sql +++ b/backend/src/main/resources/db/changelog/changes/v1.0.0-data.sql @@ -765,7 +765,7 @@ INSERT INTO `deploy-ease-platform`.`schedule_job` (`id`, `create_by`, `create_ti INSERT INTO `deploy-ease-platform`.`schedule_job` (`id`, `create_by`, `create_time`, `update_by`, `update_time`, `version`, `deleted`, `job_name`, `job_description`, `category_id`, `bean_name`, `method_name`, `form_definition_id`, `method_params`, `cron_expression`, `status`, `concurrent`, `last_execute_time`, `next_execute_time`, `execute_count`, `success_count`, `fail_count`, `timeout_seconds`, `retry_count`, `alert_email`) VALUES (14, 'admin', NOW(), 'admin', NOW(), 26, b'0', '隆基Git仓库组同步', '定期同步Git仓库组信息,每天凌晨2点执行', 2, 'repositoryGroupServiceImpl', 'syncGroups', NULL, '{\"externalSystemId\": 4}', '0 0 3 * * ?', 'ENABLED', b'0', NOW(), NOW(), 0, 0, 0, 3600, 2, ''); INSERT INTO `deploy-ease-platform`.`schedule_job` (`id`, `create_by`, `create_time`, `update_by`, `update_time`, `version`, `deleted`, `job_name`, `job_description`, `category_id`, `bean_name`, `method_name`, `form_definition_id`, `method_params`, `cron_expression`, `status`, `concurrent`, `last_execute_time`, `next_execute_time`, `execute_count`, `success_count`, `fail_count`, `timeout_seconds`, `retry_count`, `alert_email`) VALUES (15, 'admin', NOW(), 'admin', NOW(), 1204, b'0', '隆基Git项目同步', '定期同步Git项目信息,每天凌晨3点执行', 2, 'repositoryProjectServiceImpl', 'syncProjects', NULL, '{\"externalSystemId\": 4}', '0 */5 * * * ?', 'ENABLED', b'0', NOW(), NOW(), 0, 0, 0, 3600, 2, ''); INSERT INTO `deploy-ease-platform`.`schedule_job` (`id`, `create_by`, `create_time`, `update_by`, `update_time`, `version`, `deleted`, `job_name`, `job_description`, `category_id`, `bean_name`, `method_name`, `form_definition_id`, `method_params`, `cron_expression`, `status`, `concurrent`, `last_execute_time`, `next_execute_time`, `execute_count`, `success_count`, `fail_count`, `timeout_seconds`, `retry_count`, `alert_email`) VALUES (16, 'admin', NOW(), 'admin', NOW(), 5719, b'0', '隆基Git分支同步', '定期同步Git仓库分支信息,每5分钟执行一次', 2, 'repositoryBranchServiceImpl', 'syncBranches', NULL, '{\"externalSystemId\": 4}', '0 */5 * * * ?', 'ENABLED', b'0', NOW(), NOW(), 0, 0, 0, 3600, 2, ''); -INSERT INTO `deploy-ease-platform`.`schedule_job` (`id`, `create_by`, `create_time`, `update_by`, `update_time`, `version`, `deleted`, `job_name`, `job_description`, `category_id`, `bean_name`, `method_name`, `form_definition_id`, `method_params`, `cron_expression`, `status`, `concurrent`, `last_execute_time`, `next_execute_time`, `execute_count`, `success_count`, `fail_count`, `timeout_seconds`, `retry_count`, `alert_email`) VALUES (17, 'admin', NOW(), 'admin', NOW(), 2, b'0', '服务器预警', '', 4, 'serverMonitorScheduler', 'collectServerMetrics', NULL, '{\n \"notificationChannelId\": 1,\n \"resourceAlertTemplateId\": 11\n \"serverOfflineTemplateId\": 12,\n}', '0 */5 * * * ?', 'DISABLED', b'0', NULL, NULL, 0, 0, 0, 300, 0, ''); +INSERT INTO `deploy-ease-platform`.`schedule_job` (`id`, `create_by`, `create_time`, `update_by`, `update_time`, `version`, `deleted`, `job_name`, `job_description`, `category_id`, `bean_name`, `method_name`, `form_definition_id`, `method_params`, `cron_expression`, `status`, `concurrent`, `last_execute_time`, `next_execute_time`, `execute_count`, `success_count`, `fail_count`, `timeout_seconds`, `retry_count`, `alert_email`) VALUES (17, 'admin', NOW(), 'admin', NOW(), 2, b'0', '服务器预警', '', 4, 'serverMonitorScheduler', 'collectServerMetrics', NULL, '{"notificationChannelId": 1, "serverOfflineTemplateId": 12, "resourceAlertTemplateId": 11}', '0 */5 * * * ?', 'DISABLED', b'0', NULL, NULL, 0, 0, 0, 300, 0, '');