【后端】

新增:服务器离线告警连续失败检测机制(避免临时网络波动误报)
    - 数据库:deploy_server_monitor 表增加 status 字段记录采集状态(SUCCESS/FAILURE)
    - 框架层:新增 StatusEnum 通用状态枚举类,MonitorMetricEnum 增加 SERVER_STATUS 监控类型
    - 实体层:ServerMonitor 实体增加 status 字段用于标识采集状态
    - Repository:IServerMonitorRepository 增加 findRecentMonitorRecords 方法查询最近N条监控记录
    - Service:IServerMonitorService 增加 saveMonitorRecord(保存单条记录)和 countConsecutiveFailures(统计连续失败次数)方法
    - 告警服务:IServerAlertService 增加 checkServerStatusAlert(检查状态告警)和 resolveServerStatusAlert(解除状态告警)方法
    - 告警实现:ServerAlertServiceImpl 实现服务器状态告警的创建、级别升级(WARNING→CRITICAL)、自动解除逻辑
    - 调度器:ServerMonitorScheduler 集成连续失败检测,连接成功时插入SUCCESS记录并解除告警,连接失败时插入FAILURE记录并触发告警检测
    - 数据初始化:增加全局服务器状态告警规则(连续3次失败触发警告,5次触发严重并标记离线)

优化:统一监控告警通知模板,简化配置参数
    - ServerMonitorNotificationConfig 删除 serverOfflineTemplateId 字段,所有监控告警(CPU/内存/磁盘/网络/服务器状态)统一使用 resourceAlertTemplateId
    - ServerMonitorScheduler.collectServerMetrics 方法删除 serverOfflineTemplateId 参数
    - ServerAlertServiceImpl.sendServerStatusNotification 改用统一的资源告警模板,模板参数与其他监控告警保持一致

修复:删除冗余代码和未使用的类
    - 删除 CollectionStatusEnum.java(已被 StatusEnum 取代)
    - 删除 ServerMonitorScheduler.sendServerOfflineNotification 方法(改由 ServerAlertService 统一处理)

优化:告警通知模板支持百分比和次数两种单位
    - 通知模板使用 FreeMarker 条件判断,根据 alertType 自动显示"连续失败X次"或"使用率X%"
    - 严重级别告警显示红色并提示"请立即处理",警告级别提示"请注意观察"

【前端】
修复:告警规则表单规则范围下拉框无法滚动选择问题(移除嵌套滚动容器干扰)
优化:规则范围选择器升级为带搜索的Popover组件(支持按服务器名称/IP搜索,固定高度可滚动)
优化:规则范围数据加载从分页接口改为列表接口
This commit is contained in:
dengqichen 2025-12-10 14:42:37 +08:00
parent ef7efe0497
commit 1e93fffecc
16 changed files with 546 additions and 96 deletions

View File

@ -20,12 +20,8 @@ public class ServerMonitorNotificationConfig {
private Long notificationChannelId;
/**
* 服务器离线通知模板ID
*/
private Long serverOfflineTemplateId;
/**
* 资源预警通知模板ID
* 资源告警通知模板ID
* 用于所有监控告警CPU内存磁盘网络服务器状态
*/
private Long resourceAlertTemplateId;
}

View File

@ -1,5 +1,6 @@
package com.qqchen.deploy.backend.deploy.entity;
import com.qqchen.deploy.backend.framework.enums.StatusEnum;
import jakarta.persistence.*;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -73,4 +74,11 @@ public class ServerMonitor {
*/
@Column(name = "collect_time", nullable = false)
private LocalDateTime collectTime;
/**
 * Collection status of this monitor record.
 * Persisted as the enum constant name in the non-null {@code status} column (max length 20)
 * and initialized to {@link StatusEnum#SUCCESS} at field declaration.
 * NOTE(review): callers in this change create instances through ServerMonitor.builder();
 * if that builder is Lombok-generated, this initializer is presumably ignored unless the
 * field also carries @Builder.Default — confirm against the class-level annotations.
 */
@Enumerated(EnumType.STRING)
@Column(name = "status", nullable = false, length = 20)
private StatusEnum status = StatusEnum.SUCCESS;
}

View File

@ -4,6 +4,7 @@ import com.qqchen.deploy.backend.deploy.entity.ServerAlertLog;
import com.qqchen.deploy.backend.framework.enums.MonitorMetricEnum;
import com.qqchen.deploy.backend.framework.enums.ServerAlertStatusEnum;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.jpa.repository.Modifying;
import org.springframework.data.jpa.repository.Query;
import org.springframework.data.repository.query.Param;
import org.springframework.stereotype.Repository;
@ -44,4 +45,11 @@ public interface IServerAlertLogRepository extends JpaRepository<ServerAlertLog,
@Param("ruleId") Long ruleId,
@Param("alertType") MonitorMetricEnum alertType
);
/**
 * Deletes all alert log records that belong to the given server, using one bulk JPQL DELETE.
 *
 * @param serverId ID of the server whose alert logs are removed
 * @return the result of the modifying query (the caller logs it as the number of deleted rows)
 */
@Modifying
@Query("DELETE FROM ServerAlertLog l WHERE l.serverId = :serverId")
int deleteByServerId(@Param("serverId") Long serverId);
}

View File

@ -2,6 +2,9 @@ package com.qqchen.deploy.backend.deploy.repository;
import com.qqchen.deploy.backend.deploy.entity.ServerAlertRule;
import com.qqchen.deploy.backend.framework.repository.IBaseRepository;
import org.springframework.data.jpa.repository.Modifying;
import org.springframework.data.jpa.repository.Query;
import org.springframework.data.repository.query.Param;
import org.springframework.stereotype.Repository;
/**
@ -9,4 +12,11 @@ import org.springframework.stereotype.Repository;
*/
@Repository
public interface IServerAlertRuleRepository extends IBaseRepository<ServerAlertRule, Long> {
/**
 * Deletes all alert rules that are bound to the given server, using one bulk JPQL DELETE.
 * Only rules whose {@code serverId} equals the argument are matched by the query.
 *
 * @param serverId ID of the server whose dedicated alert rules are removed
 * @return the result of the modifying query (the caller logs it as the number of deleted rows)
 */
@Modifying
@Query("DELETE FROM ServerAlertRule r WHERE r.serverId = :serverId")
int deleteByServerId(@Param("serverId") Long serverId);
}

View File

@ -33,4 +33,18 @@ public interface IServerMonitorRepository extends JpaRepository<ServerMonitor, L
@Modifying
@Query("DELETE FROM ServerMonitor m WHERE m.collectTime < :beforeTime")
int deleteByCollectTimeBefore(@Param("beforeTime") LocalDateTime beforeTime);
/**
 * Deletes all monitor records that belong to the given server, using one bulk JPQL DELETE.
 *
 * @param serverId ID of the server whose monitor records are removed
 * @return the result of the modifying query (the caller logs it as the number of deleted rows)
 */
@Modifying
@Query("DELETE FROM ServerMonitor m WHERE m.serverId = :serverId")
int deleteByServerId(@Param("serverId") Long serverId);
/**
 * Queries the monitor records of the given server ordered by collect time, newest first.
 * The number of rows returned is bounded by the supplied {@code pageable}.
 * Used to count consecutive collection failures.
 *
 * @param serverId ID of the server whose records are read
 * @param pageable paging information that limits how many of the latest records are fetched
 * @return the matching records in descending collect-time order
 */
@Query("SELECT m FROM ServerMonitor m WHERE m.serverId = :serverId ORDER BY m.collectTime DESC")
List<ServerMonitor> findRecentMonitorRecords(@Param("serverId") Long serverId, org.springframework.data.domain.Pageable pageable);
}

View File

@ -69,22 +69,19 @@ public class ServerMonitorScheduler {
* 此方法由定时任务管理系统调用
*
* @param notificationChannelId 通知渠道ID可选为null则不发送通知
* @param serverOfflineTemplateId 服务器离线通知模板ID可选
* @param resourceAlertTemplateId 资源告警通知模板ID可选
* @param resourceAlertTemplateId 资源告警通知模板ID可选用于所有监控告警
*/
public void collectServerMetrics(Long notificationChannelId,
Long serverOfflineTemplateId,
Long resourceAlertTemplateId) {
// 构建通知配置对象
ServerMonitorNotificationConfig config = null;
if (notificationChannelId != null) {
config = new ServerMonitorNotificationConfig();
config.setNotificationChannelId(notificationChannelId);
config.setServerOfflineTemplateId(serverOfflineTemplateId);
config.setResourceAlertTemplateId(resourceAlertTemplateId);
log.debug("开始采集服务器监控数据: channelId={}, offlineTemplateId={}, alertTemplateId={}",
notificationChannelId, serverOfflineTemplateId, resourceAlertTemplateId);
log.debug("开始采集服务器监控数据: channelId={}, alertTemplateId={}",
notificationChannelId, resourceAlertTemplateId);
} else {
log.debug("开始采集服务器监控数据(不发送通知)");
}
@ -201,30 +198,88 @@ public class ServerMonitorScheduler {
passphrase
);
// 3. 连接成功更新服务器状态
server.setStatus(ServerStatusEnum.ONLINE);
server.setLastConnectTime(LocalDateTime.now());
serverRepository.save(server);
// 3. 连接成功插入成功记录
ServerMonitor successRecord = ServerMonitor.builder()
.serverId(server.getId())
.status(com.qqchen.deploy.backend.framework.enums.StatusEnum.SUCCESS)
.collectTime(LocalDateTime.now())
.build();
monitorService.saveMonitorRecord(successRecord);
log.info("✓ 服务器连接成功: {} ({})", server.getServerName(), server.getHostIp());
// 4. 解除服务器状态告警如果存在
try {
alertService.resolveServerStatusAlert(server.getId());
} catch (Exception e) {
log.warn("解除服务器状态告警失败: serverId={}", server.getId(), e);
}
// 4. 采集监控数据复用同一个SSH连接
// 5. 更新服务器状态为ONLINE
if (server.getStatus() == ServerStatusEnum.OFFLINE) {
server.setStatus(ServerStatusEnum.ONLINE);
server.setLastConnectTime(LocalDateTime.now());
serverRepository.save(server);
log.info("✓ 服务器已恢复在线: {} ({})", server.getServerName(), server.getHostIp());
} else {
server.setStatus(ServerStatusEnum.ONLINE);
server.setLastConnectTime(LocalDateTime.now());
serverRepository.save(server);
log.info("✓ 服务器连接成功: {} ({})", server.getServerName(), server.getHostIp());
}
// 6. 采集监控数据复用同一个SSH连接
return collectServerMonitorData(server, sshClient, sshService);
} catch (Exception e) {
// 连接失败更新服务器状态为离线
log.error("✗ 服务器连接失败: {} ({}) - {}",
server.getServerName(), server.getHostIp(), e.getMessage());
// 连接失败插入失败记录
ServerMonitor failureRecord = ServerMonitor.builder()
.serverId(server.getId())
.status(com.qqchen.deploy.backend.framework.enums.StatusEnum.FAILURE)
.collectTime(LocalDateTime.now())
.build();
server.setStatus(ServerStatusEnum.OFFLINE);
serverRepository.save(server);
try {
monitorService.saveMonitorRecord(failureRecord);
} catch (Exception saveError) {
log.error("保存失败记录异常: serverId={}", server.getId(), saveError);
}
// 发送离线通知
if (config != null && config.getNotificationChannelId() != null && config.getServerOfflineTemplateId() != null) {
// 统计连续失败次数
int failureCount = 0;
try {
failureCount = monitorService.countConsecutiveFailures(server.getId(), 10);
log.error("✗ 服务器连接失败 [连续{}次]: {} ({}) - {}",
failureCount, server.getServerName(), server.getHostIp(), e.getMessage());
} catch (Exception countError) {
log.error("统计连续失败次数异常: serverId={}", server.getId(), countError);
log.error("✗ 服务器连接失败: {} ({}) - {}",
server.getServerName(), server.getHostIp(), e.getMessage());
}
// 检查服务器状态告警传入所有规则避免重复查询
if (failureCount > 0 && config != null) {
try {
sendServerOfflineNotification(server, config);
} catch (Exception notifyError) {
log.error("发送服务器离线通知失败: serverId={}", server.getId(), notifyError);
// 查询所有规则在外层已查询这里重新查询以确保最新
List<ServerAlertRule> allRules = alertRuleRepository.findAll();
alertService.checkServerStatusAlert(server.getId(), failureCount, allRules, config);
// 如果达到严重阈值标记服务器为OFFLINE
ServerAlertRule statusRule = allRules.stream()
.filter(rule -> rule.getAlertType() == com.qqchen.deploy.backend.framework.enums.MonitorMetricEnum.SERVER_STATUS)
.filter(rule -> Boolean.TRUE.equals(rule.getEnabled()))
.filter(rule -> rule.getServerId() == null || server.getId().equals(rule.getServerId()))
.findFirst()
.orElse(null);
if (statusRule != null && failureCount >= statusRule.getCriticalThreshold().intValue()) {
if (server.getStatus() != ServerStatusEnum.OFFLINE) {
server.setStatus(ServerStatusEnum.OFFLINE);
serverRepository.save(server);
log.warn("🔴 服务器已标记为离线: {} (连续失败{}次)",
server.getServerName(), failureCount);
}
}
} catch (Exception alertError) {
log.error("检查服务器状态告警失败: serverId={}", server.getId(), alertError);
}
}
@ -238,35 +293,6 @@ public class ServerMonitorScheduler {
}
}
/**
 * Sends the "server offline" notification for the given server.
 * <p>
 * The template receives the server name, its host IP and the current time formatted as
 * {@code yyyy-MM-dd HH:mm:ss}. Any failure is logged and then rethrown to the caller.
 *
 * @param server the server that could not be reached
 * @param config notification settings supplying the channel ID and the offline template ID
 */
private void sendServerOfflineNotification(Server server, ServerMonitorNotificationConfig config) {
    try {
        // Template variables consumed by the offline notification template.
        java.time.format.DateTimeFormatter timestampFormat =
                java.time.format.DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
        Map<String, Object> params = new HashMap<>();
        params.put("serverName", server.getServerName());
        params.put("serverIp", server.getHostIp());
        params.put("offlineTime", timestampFormat.format(LocalDateTime.now()));

        // Assemble the request: channel + template + variables.
        SendNotificationRequest offlineNotice = new SendNotificationRequest();
        offlineNotice.setChannelId(config.getNotificationChannelId());
        offlineNotice.setNotificationTemplateId(config.getServerOfflineTemplateId());
        offlineNotice.setTemplateParams(params);

        // Delegate delivery to the notification service.
        notificationService.send(offlineNotice);

        log.info("服务器离线通知已发送: serverId={}, serverName={}, ip={}",
                server.getId(), server.getServerName(), server.getHostIp());
    } catch (Exception e) {
        log.error("发送服务器离线通知异常: serverId={}", server.getId(), e);
        throw e;
    }
}
/**
* 采集服务器监控数据CPU内存磁盘网络使用率
*

View File

@ -32,4 +32,23 @@ public interface IServerAlertService {
void checkAlertRules(Long serverId, ServerMonitorDataDTO monitorData,
List<ServerAlertRule> allRules,
ServerMonitorNotificationConfig config);
/**
 * Checks the server-status alert (connection-failure scenario).
 *
 * @param serverId     server ID
 * @param failureCount number of consecutive failures
 * @param allRules     all alert rules
 * @param config       notification configuration
 */
void checkServerStatusAlert(Long serverId, int failureCount,
List<ServerAlertRule> allRules,
ServerMonitorNotificationConfig config);
/**
 * Resolves the server-status alert (connection-recovered scenario).
 *
 * @param serverId server ID
 */
void resolveServerStatusAlert(Long serverId);
}

View File

@ -29,4 +29,21 @@ public interface IServerMonitorService {
* @return 最新的监控记录如果没有则返回null
*/
ServerMonitor getLastMonitorData(Long serverId);
/**
 * Saves a single monitor record (failure records included).
 *
 * @param monitor the monitor record to save
 */
void saveMonitorRecord(ServerMonitor monitor);
/**
 * Counts the consecutive failures of the given server.
 * Looks at the most recent monitor records and, starting from the newest one,
 * counts how many consecutive records have FAILURE status.
 *
 * @param serverId   server ID
 * @param checkLimit how many of the most recent records to inspect; the signature has no
 *                   default value (the original note mentions 10, which is what the
 *                   scheduler passes)
 * @return number of consecutive failures
 */
int countConsecutiveFailures(Long serverId, int checkLimit);
}

View File

@ -491,4 +491,218 @@ public class ServerAlertServiceImpl implements IServerAlertService {
throw e;
}
}
/**
 * Evaluates the consecutive-failure count of a server against its effective
 * SERVER_STATUS rule and creates or escalates the corresponding alert.
 * <p>
 * The rule's warning/critical thresholds are read as whole failure counts. When the count
 * reaches a threshold and no active/pending alert exists, a new alert is created; when a
 * non-critical alert exists and the count reaches the critical threshold, the alert is
 * escalated. Nothing happens when no enabled rule applies or no threshold is reached.
 *
 * @param serverId     ID of the server whose connection failed
 * @param failureCount number of consecutive connection failures
 * @param allRules     all alert rules; the effective rule is selected from this list
 * @param config       notification settings forwarded to the create/escalate helpers
 */
@Override
public void checkServerStatusAlert(Long serverId, int failureCount,
                                   List<ServerAlertRule> allRules,
                                   ServerMonitorNotificationConfig config) {
    // Pick the enabled SERVER_STATUS rule (server-specific rule wins over the global one).
    ServerAlertRule statusRule = getEffectiveRule(serverId, MonitorMetricEnum.SERVER_STATUS, allRules);
    if (statusRule == null) {
        log.debug("未找到启用的服务器状态告警规则: serverId={}", serverId);
        return;
    }

    // A rule without thresholds cannot be evaluated; skip instead of failing with an NPE.
    if (statusRule.getWarningThreshold() == null || statusRule.getCriticalThreshold() == null) {
        log.warn("服务器状态告警规则阈值未配置,跳过检查: serverId={}, ruleId={}",
                serverId, statusRule.getId());
        return;
    }

    int warningCount = statusRule.getWarningThreshold().intValue();
    int criticalCount = statusRule.getCriticalThreshold().intValue();
    log.debug("检查服务器状态告警: serverId={}, failureCount={}, warningCount={}, criticalCount={}",
            serverId, failureCount, warningCount, criticalCount);

    // Determine which level (if any) the current failure count reaches.
    MonitorAlertLevelEnum currentLevel;
    int threshold;
    if (failureCount >= criticalCount) {
        currentLevel = MonitorAlertLevelEnum.CRITICAL;
        threshold = criticalCount;
    } else if (failureCount >= warningCount) {
        currentLevel = MonitorAlertLevelEnum.WARNING;
        threshold = warningCount;
    } else {
        // Below every threshold: nothing to create or escalate, so no lookup is needed.
        return;
    }

    // Look up an alert that is already open for this server and rule.
    Optional<ServerAlertLog> existingAlertOpt = alertLogRepository.findActiveOrPendingAlert(
            serverId, statusRule.getId(), MonitorMetricEnum.SERVER_STATUS);
    if (!existingAlertOpt.isPresent()) {
        createServerStatusAlert(serverId, statusRule, currentLevel, failureCount, threshold, config);
        return;
    }

    ServerAlertLog existingAlert = existingAlertOpt.get();
    // Escalation is decided by explicit level comparison rather than enum ordinals, so the
    // result does not depend on the declaration order of MonitorAlertLevelEnum.
    boolean escalated = currentLevel == MonitorAlertLevelEnum.CRITICAL
            && existingAlert.getAlertLevel() != MonitorAlertLevelEnum.CRITICAL;
    if (escalated) {
        updateServerStatusAlertLevel(existingAlert, statusRule, currentLevel,
                failureCount, threshold, config);
    } else {
        log.debug("服务器状态告警已存在且级别未变化: serverId={}, level={}",
                serverId, currentLevel);
    }
}
/**
 * Resolves every open server-status alert of the given server.
 * <p>
 * All rules are loaded and those of type SERVER_STATUS are visited (no enabled check is
 * applied here); for each one, an active or pending alert of this server, if present, is
 * resolved.
 *
 * @param serverId ID of the server whose connection has recovered
 */
@Override
public void resolveServerStatusAlert(Long serverId) {
    for (ServerAlertRule candidate : alertRuleRepository.findAll()) {
        // Only server-status rules are relevant here.
        if (candidate.getAlertType() != MonitorMetricEnum.SERVER_STATUS) {
            continue;
        }

        Optional<ServerAlertLog> openAlert = alertLogRepository.findActiveOrPendingAlert(
                serverId, candidate.getId(), MonitorMetricEnum.SERVER_STATUS);
        if (!openAlert.isPresent()) {
            continue;
        }

        resolveAlert(openAlert.get());
        log.info("✅ 服务器状态告警已解除: serverId={}, ruleId={}", serverId, candidate.getId());
    }
}
/**
 * Selects the rule that applies to a server for the given alert type.
 * <p>
 * Only enabled rules of the requested type are considered. The first rule bound to
 * {@code serverId} wins; otherwise the last global rule (one without a server ID) seen in
 * the list is used.
 *
 * @param serverId  ID of the server being evaluated
 * @param alertType alert type the rule must have
 * @param allRules  candidate rules
 * @return the server-specific rule, else a global rule, else {@code null}
 */
private ServerAlertRule getEffectiveRule(Long serverId, MonitorMetricEnum alertType,
                                         List<ServerAlertRule> allRules) {
    ServerAlertRule fallback = null;
    for (ServerAlertRule candidate : allRules) {
        boolean applicable = candidate.getAlertType() == alertType
                && Boolean.TRUE.equals(candidate.getEnabled());
        if (!applicable) {
            continue;
        }

        Long scope = candidate.getServerId();
        if (serverId.equals(scope)) {
            // A dedicated rule always takes precedence; no need to look further.
            return candidate;
        }
        if (scope == null) {
            fallback = candidate;
        }
    }
    return fallback;
}
/**
 * Creates a new server-status alert in PENDING state and triggers its notification.
 * <p>
 * Does nothing (apart from a warning log) when the server cannot be found. Failures while
 * saving or notifying are logged and not propagated.
 *
 * @param serverId     ID of the affected server
 * @param rule         rule that triggered the alert
 * @param level        alert level reached
 * @param failureCount number of consecutive connection failures, stored as the alert value
 * @param threshold    threshold that was reached, stored as the threshold value
 * @param config       notification settings passed on to the notification step
 */
private void createServerStatusAlert(Long serverId, ServerAlertRule rule,
                                     MonitorAlertLevelEnum level, int failureCount,
                                     int threshold, ServerMonitorNotificationConfig config) {
    Optional<Server> serverOpt = serverRepository.findById(serverId);
    if (!serverOpt.isPresent()) {
        log.warn("服务器不存在: serverId={}", serverId);
        return;
    }
    Server target = serverOpt.get();

    String message = String.format("服务器连续%d次连接失败达到%s级别阈值:%d次",
            failureCount, level.getDescription(), threshold);

    ServerAlertLog newAlert = new ServerAlertLog();
    newAlert.setServerId(serverId);
    newAlert.setRuleId(rule.getId());
    newAlert.setAlertType(MonitorMetricEnum.SERVER_STATUS);
    newAlert.setAlertLevel(level);
    newAlert.setAlertValue(BigDecimal.valueOf(failureCount));
    newAlert.setThresholdValue(BigDecimal.valueOf(threshold));
    newAlert.setAlertTime(LocalDateTime.now());
    newAlert.setStatus(ServerAlertStatusEnum.PENDING);
    newAlert.setAlertMessage(message);

    try {
        alertLogRepository.save(newAlert);
        log.warn("🔔 服务器状态告警已创建: serverId={}, serverName={}, level={}, failureCount={}",
                serverId, target.getServerName(), level, failureCount);

        // Notify after the alert has been stored.
        sendServerStatusNotification(newAlert, target, config);
    } catch (Exception e) {
        log.error("创建服务器状态告警失败: serverId={}", serverId, e);
    }
}
/**
 * Escalates an existing server-status alert to a new level and re-sends the notification.
 * <p>
 * The alert's level, value, threshold and message are overwritten with the new figures and
 * saved. When the server referenced by the alert no longer exists, the escalation is
 * skipped and a warning is logged. Failures while saving or notifying are logged and not
 * propagated.
 *
 * @param alertLog     the existing alert to escalate
 * @param rule         rule the alert belongs to (not read by this method)
 * @param newLevel     level to escalate to
 * @param failureCount current number of consecutive connection failures
 * @param threshold    threshold of the new level
 * @param config       notification settings passed on to the notification step
 */
private void updateServerStatusAlertLevel(ServerAlertLog alertLog, ServerAlertRule rule,
                                          MonitorAlertLevelEnum newLevel, int failureCount,
                                          int threshold, ServerMonitorNotificationConfig config) {
    Server server = serverRepository.findById(alertLog.getServerId()).orElse(null);
    if (server == null) {
        // Same message as the create path, so a skipped escalation is visible in the logs.
        log.warn("服务器不存在: serverId={}", alertLog.getServerId());
        return;
    }

    // Remember the previous level for the escalation log line.
    MonitorAlertLevelEnum oldLevel = alertLog.getAlertLevel();

    alertLog.setAlertLevel(newLevel);
    alertLog.setAlertValue(new BigDecimal(failureCount));
    alertLog.setThresholdValue(new BigDecimal(threshold));
    alertLog.setAlertMessage(String.format("服务器连续%d次连接失败达到%s级别阈值:%d次",
            failureCount, newLevel.getDescription(), threshold));

    try {
        alertLogRepository.save(alertLog);
        log.warn("⬆️ 服务器状态告警级别升级: id={}, serverId={}, {} → {}, failureCount={}",
                alertLog.getId(), alertLog.getServerId(), oldLevel, newLevel, failureCount);

        // Send the notification again so recipients learn about the higher level.
        sendServerStatusNotification(alertLog, server, config);
    } catch (Exception e) {
        log.error("更新服务器状态告警级别失败", e);
    }
}
/**
 * Sends the notification for a server-status alert through the resource alert template.
 * <p>
 * Skipped when the configuration is missing, has no channel ID or has no resource alert
 * template ID, and when the alert's rule cannot be found. After a successful send the alert
 * is marked as notified, stamped with the notify time, set to ACTIVE and saved. Any
 * exception is logged and not propagated.
 *
 * @param alertLog alert to notify about
 * @param server   server the alert belongs to
 * @param config   notification settings (channel and template IDs); may be {@code null}
 */
private void sendServerStatusNotification(ServerAlertLog alertLog, Server server,
                                          ServerMonitorNotificationConfig config) {
    boolean configured = config != null
            && config.getNotificationChannelId() != null
            && config.getResourceAlertTemplateId() != null;
    if (!configured) {
        log.debug("通知配置不完整,跳过发送服务器状态通知");
        return;
    }

    try {
        // The rule supplies the name and alert type shown in the message.
        Optional<ServerAlertRule> ruleOpt = alertRuleRepository.findById(alertLog.getRuleId());
        if (!ruleOpt.isPresent()) {
            log.warn("告警规则不存在: ruleId={}", alertLog.getRuleId());
            return;
        }
        ServerAlertRule sourceRule = ruleOpt.get();

        // Template variables populated for the shared resource alert template.
        java.time.format.DateTimeFormatter timestampFormat =
                java.time.format.DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
        Map<String, Object> params = new HashMap<>();
        params.put("serverName", server.getServerName());
        params.put("ruleName", sourceRule.getRuleName());
        params.put("alertType", sourceRule.getAlertType().getDescription());
        params.put("resourceInfo", "服务器状态");
        params.put("alertLevel", alertLog.getAlertLevel().getDescription());
        params.put("currentValue", alertLog.getAlertValue().intValue());
        params.put("threshold", alertLog.getThresholdValue().intValue());
        params.put("alertTime", timestampFormat.format(alertLog.getAlertTime()));

        SendNotificationRequest statusNotice = new SendNotificationRequest();
        statusNotice.setChannelId(config.getNotificationChannelId());
        statusNotice.setNotificationTemplateId(config.getResourceAlertTemplateId());
        statusNotice.setTemplateParams(params);
        notificationService.send(statusNotice);

        // Record that the alert has been delivered and is now active.
        alertLog.setNotified(true);
        alertLog.setNotifyTime(LocalDateTime.now());
        alertLog.setStatus(ServerAlertStatusEnum.ACTIVE);
        alertLogRepository.save(alertLog);

        log.info("✅ 服务器状态告警通知已发送: serverId={}, level={}",
                alertLog.getServerId(), alertLog.getAlertLevel());
    } catch (Exception e) {
        log.error("发送服务器状态通知异常: serverId={}", alertLog.getServerId(), e);
    }
}
}

View File

@ -6,8 +6,11 @@ import com.qqchen.deploy.backend.deploy.dto.ServerMonitorDataDTO;
import com.qqchen.deploy.backend.deploy.entity.ServerMonitor;
import com.qqchen.deploy.backend.deploy.repository.IServerMonitorRepository;
import com.qqchen.deploy.backend.deploy.service.IServerMonitorService;
import com.qqchen.deploy.backend.framework.enums.StatusEnum;
import jakarta.annotation.Resource;
import lombok.extern.slf4j.Slf4j;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.domain.Pageable;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
@ -80,6 +83,40 @@ public class ServerMonitorServiceImpl implements IServerMonitorService {
.networkRx(dto.getNetworkRx())
.networkTx(dto.getNetworkTx())
.collectTime(dto.getCollectTime())
.status(StatusEnum.SUCCESS) // 成功采集的记录
.build();
}
/**
 * Persists one monitor record and writes a debug log with its server ID and status.
 *
 * @param monitor the record to store
 */
@Override
@Transactional
public void saveMonitorRecord(ServerMonitor monitor) {
    monitorRepository.save(monitor);

    Long recordedServerId = monitor.getServerId();
    StatusEnum recordedStatus = monitor.getStatus();
    log.debug("保存监控记录: serverId={}, status={}", recordedServerId, recordedStatus);
}
/**
 * Counts how many of the newest monitor records of a server are consecutive failures.
 * <p>
 * Fetches the first page of at most {@code checkLimit} records, newest first, and counts
 * leading records whose status is FAILURE, stopping at the first record with any other
 * status. Returns 0 when no records are found.
 *
 * @param serverId   ID of the server to inspect
 * @param checkLimit page size, i.e. the maximum number of recent records examined
 * @return length of the failure streak at the head of the record list
 */
@Override
public int countConsecutiveFailures(Long serverId, int checkLimit) {
    List<ServerMonitor> newestFirst =
            monitorRepository.findRecentMonitorRecords(serverId, PageRequest.of(0, checkLimit));
    if (newestFirst == null || newestFirst.isEmpty()) {
        log.debug("没有找到监控记录: serverId={}", serverId);
        return 0;
    }

    // Walk from the newest record while the streak of failures is unbroken.
    int streak = 0;
    while (streak < newestFirst.size()
            && StatusEnum.FAILURE.equals(newestFirst.get(streak).getStatus())) {
        streak++;
    }

    log.debug("统计连续失败次数: serverId={}, count={}", serverId, streak);
    return streak;
}
}

View File

@ -8,6 +8,9 @@ import com.qqchen.deploy.backend.deploy.enums.ServerStatusEnum;
import com.qqchen.deploy.backend.framework.enums.AuthTypeEnum;
import com.qqchen.deploy.backend.deploy.query.ServerQuery;
import com.qqchen.deploy.backend.deploy.repository.IServerRepository;
import com.qqchen.deploy.backend.deploy.repository.IServerMonitorRepository;
import com.qqchen.deploy.backend.deploy.repository.IServerAlertRuleRepository;
import com.qqchen.deploy.backend.deploy.repository.IServerAlertLogRepository;
import com.qqchen.deploy.backend.deploy.repository.ISSHAuditLogRepository;
import com.qqchen.deploy.backend.deploy.service.IServerService;
import com.qqchen.deploy.backend.framework.annotation.ServiceType;
@ -42,6 +45,15 @@ public class ServerServiceImpl
@Resource
private ISSHAuditLogRepository sshAuditLogRepository;
@Resource
private IServerMonitorRepository serverMonitorRepository;
@Resource
private IServerAlertRuleRepository serverAlertRuleRepository;
@Resource
private IServerAlertLogRepository serverAlertLogRepository;
public ServerServiceImpl(IServerRepository serverRepository) {
this.serverRepository = serverRepository;
@ -247,6 +259,7 @@ public class ServerServiceImpl
* @return 完整的硬件信息
*/
@Override
@Transactional
public ServerInfoDTO collectHardwareInfo(Long serverId) {
long startTime = System.currentTimeMillis();
ServerInfoDTO info = new ServerInfoDTO();
@ -320,22 +333,46 @@ public class ServerServiceImpl
* 1. 服务器物理删除没有@LogicDelete注解
* 2. 审计日志逻辑删除@LogicDelete注解
* 3. 审计日志永久保留仅标记deleted=true确保审计可追溯
* 4. 监控记录告警规则告警记录物理删除无需保留历史
*/
/**
 * Deletes a server together with its dependent data.
 * <p>
 * Alert logs, alert rules and monitor records of the server are physically deleted, SSH
 * audit logs are logically deleted, and finally the server itself is removed. Each cleanup
 * step is best-effort: a failure is logged (with its stack trace) and the remaining steps
 * still run.
 * <p>
 * NOTE(review): the schema declares fk_monitor_server from deploy_server_monitor to
 * deploy_server, so if the monitor-record cleanup fails the final server delete will
 * presumably fail as well; likewise an exception raised inside this transaction may leave
 * it marked rollback-only even though it is caught here — confirm whether best-effort is
 * really intended for the physical-delete steps.
 *
 * @param id ID of the server to delete
 */
@Override
@Transactional
public void delete(Long id) {
    log.info("删除服务器: serverId={}", id);

    // 1. Physically delete alert logs first, because they depend on the alert rules.
    try {
        int alertLogCount = serverAlertLogRepository.deleteByServerId(id);
        log.info("已删除服务器关联的告警记录: serverId={}, count={}", id, alertLogCount);
    } catch (Exception e) {
        // The trailing throwable keeps the stack trace in the log.
        log.warn("删除服务器关联的告警记录失败: serverId={}, error={}", id, e.getMessage(), e);
    }

    // 2. Physically delete the alert rules bound to this server.
    try {
        int alertRuleCount = serverAlertRuleRepository.deleteByServerId(id);
        log.info("已删除服务器关联的告警规则: serverId={}, count={}", id, alertRuleCount);
    } catch (Exception e) {
        log.warn("删除服务器关联的告警规则失败: serverId={}, error={}", id, e.getMessage(), e);
    }

    // 3. Physically delete the monitor records of this server.
    try {
        int monitorCount = serverMonitorRepository.deleteByServerId(id);
        log.info("已删除服务器关联的监控记录: serverId={}, count={}", id, monitorCount);
    } catch (Exception e) {
        log.warn("删除服务器关联的监控记录失败: serverId={}, error={}", id, e.getMessage(), e);
    }

    // 4. Logically delete the SSH audit logs so the history is retained.
    try {
        sshAuditLogRepository.deleteByServerId(id);
        log.info("已逻辑删除服务器关联的SSH审计日志: serverId={}", id);
    } catch (Exception e) {
        // Keep going: the server is still deleted even if the audit-log cleanup failed.
        log.warn("逻辑删除服务器关联的SSH审计日志失败: serverId={}, error={}", id, e.getMessage(), e);
    }

    // 5. Physically delete the server itself.
    super.delete(id);
    log.info("服务器删除成功: serverId={}", id);
}

View File

@ -31,7 +31,14 @@ public enum MonitorMetricEnum {
/**
* 网络告警MB/s
*/
NETWORK("NETWORK", "网络使用率", "MB/s");
NETWORK("NETWORK", "网络使用率", "MB/s"),
/**
* 服务器状态告警次数
* warningThreshold: 警告级别的连续失败次数
* criticalThreshold: 严重级别的连续失败次数标记离线
*/
SERVER_STATUS("SERVER_STATUS", "服务器状态", "");
private final String code;
private final String description;

View File

@ -0,0 +1,28 @@
package com.qqchen.deploy.backend.framework.enums;
import lombok.AllArgsConstructor;
import lombok.Getter;
/**
* 状态枚举
* Framework 提供的通用状态类型适用于监控数据采集任务执行等场景
*
* @author qqchen
* @since 2025-12-10
*/
@Getter
@AllArgsConstructor
public enum StatusEnum {
/**
* 成功
*/
SUCCESS("SUCCESS", "成功"),
/**
* 失败
*/
FAILURE("FAILURE", "失败");
private final String code;
private final String description;
}

File diff suppressed because one or more lines are too long

View File

@ -1178,6 +1178,9 @@ CREATE TABLE deploy_server_monitor
-- 采集时间
collect_time DATETIME NOT NULL COMMENT '采集时间',
-- 采集状态
status VARCHAR(20) NOT NULL DEFAULT 'SUCCESS' COMMENT '采集状态SUCCESS-成功, FAILURE-失败',
INDEX idx_server_time (server_id, collect_time),
INDEX idx_collect_time (collect_time),
CONSTRAINT fk_monitor_server FOREIGN KEY (server_id) REFERENCES deploy_server (id)

View File

@ -12,22 +12,36 @@ INSERT INTO system_release (
)
VALUES (
'system', NOW(), 'system', NOW(), 1, 0,
1.13, 'ALL', NOW(),
1.15, 'ALL', NOW(),
'【后端】
- //DEBUG
- HikariCP泄漏检测阈值调整为35分钟Jenkins构建轮询时长
- SSH日志优化SSHJ底层日志降级为WARNTransport/SecureRandom等无业务价值日志
- INFO便
- Liquibase优化XML/SQL分离changes/XMLsql/SQL
- Security优化SecurityConfig/JwtAuthenticationFilter统一使用配置文件管理免认证路径
- "系统指标"JVMCPU线
- Actuator集成health/metrics/threaddump/heapdump/env/loggers
- SecurityWhitelistProperties配置类Security白名单
- JVM内存CPU线GC30
- 线线
- GB
- Actuator接口代理和调用方式/actuator代理配置
线
- deploy_server_monitor status SUCCESS/FAILURE
- StatusEnum MonitorMetricEnum SERVER_STATUS
- ServerMonitor status
- RepositoryIServerMonitorRepository findRecentMonitorRecords N条监控记录
- ServiceIServerMonitorService saveMonitorRecord countConsecutiveFailures
- IServerAlertService checkServerStatusAlert resolveServerStatusAlert
- ServerAlertServiceImpl WARNINGCRITICAL
- ServerMonitorScheduler SUCCESS记录并解除告警FAILURE记录并触发告警检测
- 35线
- ServerMonitorNotificationConfig serverOfflineTemplateId CPU////使 resourceAlertTemplateId
- ServerMonitorScheduler.collectServerMetrics serverOfflineTemplateId
- ServerAlertServiceImpl.sendServerStatusNotification
使
- CollectionStatusEnum.java StatusEnum
- ServerMonitorScheduler.sendServerOfflineNotification ServerAlertService
- 使 FreeMarker alertType "连续失败X次""使用率X%"
- "请立即处理""请注意观察"
Popover组件/IP搜索
',
0, NULL, NULL, 0
);