diff --git a/backend/src/main/java/com/qqchen/deploy/backend/deploy/api/ServerCategoryApiController.java b/backend/src/main/java/com/qqchen/deploy/backend/deploy/api/ServerCategoryApiController.java index 6ef49bd2..a083b27d 100644 --- a/backend/src/main/java/com/qqchen/deploy/backend/deploy/api/ServerCategoryApiController.java +++ b/backend/src/main/java/com/qqchen/deploy/backend/deploy/api/ServerCategoryApiController.java @@ -12,6 +12,7 @@ import jakarta.servlet.http.HttpServletResponse; import lombok.extern.slf4j.Slf4j; import org.springframework.data.domain.Page; import org.springframework.validation.annotation.Validated; +import org.springframework.web.bind.annotation.PathVariable; import org.springframework.web.bind.annotation.RequestBody; import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RestController; @@ -38,18 +39,18 @@ public class ServerCategoryApiController } @Override - public Response update(Long aLong, ServerCategoryDTO dto) { - return super.update(aLong, dto); + public Response update(@PathVariable Long id, @Validated @RequestBody ServerCategoryDTO dto) { + return super.update(id, dto); } @Override - public Response delete(Long aLong) { - return super.delete(aLong); + public Response delete(@PathVariable Long id) { + return super.delete(id); } @Override - public Response findById(Long aLong) { - return super.findById(aLong); + public Response findById(@PathVariable Long id) { + return super.findById(id); } @Override diff --git a/backend/src/main/java/com/qqchen/deploy/backend/deploy/dto/ServerAlertRuleDTO.java b/backend/src/main/java/com/qqchen/deploy/backend/deploy/dto/ServerAlertRuleDTO.java index b306542b..ae1e0750 100644 --- a/backend/src/main/java/com/qqchen/deploy/backend/deploy/dto/ServerAlertRuleDTO.java +++ b/backend/src/main/java/com/qqchen/deploy/backend/deploy/dto/ServerAlertRuleDTO.java @@ -29,16 +29,16 @@ public class ServerAlertRuleDTO extends BaseDTO { @NotNull(message = "监控指标类型不能为空") private MonitorMetricEnum alertType; - @Schema(description = "警告阈值(%)", required = true, example = "80.00") + @Schema(description = "警告阈值(CPU/MEMORY/DISK为%,NETWORK为MB/s)", required = true, example = "80.00") @NotNull(message = "警告阈值不能为空") @DecimalMin(value = "0.00", message = "警告阈值必须大于等于0") - @DecimalMax(value = "100.00", message = "警告阈值不能超过100") + @DecimalMax(value = "10000.00", message = "警告阈值不能超过10000") private BigDecimal warningThreshold; - @Schema(description = "严重阈值(%)", required = true, example = "90.00") + @Schema(description = "严重阈值(CPU/MEMORY/DISK为%,NETWORK为MB/s)", required = true, example = "90.00") @NotNull(message = "严重阈值不能为空") @DecimalMin(value = "0.00", message = "严重阈值必须大于等于0") - @DecimalMax(value = "100.00", message = "严重阈值不能超过100") + @DecimalMax(value = "10000.00", message = "严重阈值不能超过10000") private BigDecimal criticalThreshold; @Schema(description = "持续时长(分钟)", example = "5") diff --git a/backend/src/main/java/com/qqchen/deploy/backend/deploy/scheduler/ServerMonitorScheduler.java b/backend/src/main/java/com/qqchen/deploy/backend/deploy/scheduler/ServerMonitorScheduler.java index 7f3c7b42..af974c5b 100644 --- a/backend/src/main/java/com/qqchen/deploy/backend/deploy/scheduler/ServerMonitorScheduler.java +++ b/backend/src/main/java/com/qqchen/deploy/backend/deploy/scheduler/ServerMonitorScheduler.java @@ -1,5 +1,6 @@ package com.qqchen.deploy.backend.deploy.scheduler; +import com.qqchen.deploy.backend.deploy.dto.ServerInfoDTO; import com.qqchen.deploy.backend.deploy.dto.ServerMonitorDataDTO; import com.qqchen.deploy.backend.deploy.dto.ServerMonitorNotificationConfig; import com.qqchen.deploy.backend.deploy.entity.Server; @@ -9,6 +10,7 @@ import com.qqchen.deploy.backend.deploy.repository.IServerAlertRuleRepository; import com.qqchen.deploy.backend.deploy.repository.IServerRepository; import com.qqchen.deploy.backend.deploy.service.IServerAlertService; import com.qqchen.deploy.backend.deploy.service.IServerMonitorService; +import com.qqchen.deploy.backend.deploy.service.IServerService; import com.qqchen.deploy.backend.framework.dto.DiskUsageInfo; import com.qqchen.deploy.backend.framework.ssh.ISSHCommandService; import com.qqchen.deploy.backend.framework.ssh.SSHCommandServiceFactory; @@ -34,35 +36,38 @@ import java.util.stream.Collectors; @Slf4j @Component public class ServerMonitorScheduler { - + @Resource private IServerRepository serverRepository; - + @Resource private SSHCommandServiceFactory sshCommandServiceFactory; - + @Resource private IServerMonitorService monitorService; - + @Resource private IServerAlertService alertService; - + @Resource private IServerAlertRuleRepository alertRuleRepository; - + @Resource private INotificationService notificationService; - + + @Resource + private IServerService serverService; + /** * 采集所有在线服务器的监控数据 * 此方法由定时任务管理系统调用 - * - * @param notificationChannelId 通知渠道ID(可选,为null则不发送通知) + * + * @param notificationChannelId 通知渠道ID(可选,为null则不发送通知) * @param serverOfflineTemplateId 服务器离线通知模板ID(可选) * @param resourceAlertTemplateId 资源告警通知模板ID(可选) */ - public void collectServerMetrics(Long notificationChannelId, - Long serverOfflineTemplateId, + public void collectServerMetrics(Long notificationChannelId, + Long serverOfflineTemplateId, Long resourceAlertTemplateId) { // 构建通知配置对象 ServerMonitorNotificationConfig config = null; @@ -71,63 +76,63 @@ public class ServerMonitorScheduler { config.setNotificationChannelId(notificationChannelId); config.setServerOfflineTemplateId(serverOfflineTemplateId); config.setResourceAlertTemplateId(resourceAlertTemplateId); - - log.info("========== 开始采集服务器监控数据 ========== channelId={}, offlineTemplateId={}, alertTemplateId={}", + + log.info("========== 开始采集服务器监控数据 ========== channelId={}, offlineTemplateId={}, alertTemplateId={}", notificationChannelId, serverOfflineTemplateId, resourceAlertTemplateId); } else { log.info("========== 开始采集服务器监控数据(不发送通知) =========="); } long startTime = System.currentTimeMillis(); - + try { // 1. 查询所有服务器(不管当前状态),准备检测在线状态 List allServers = serverRepository.findAll(); - + if (allServers.isEmpty()) { log.debug("没有需要监控的服务器,跳过监控采集"); return; } - + log.info("发现 {} 台服务器,开始检测在线状态并采集监控数据", allServers.size()); - + // 2. 并发检测所有服务器的连接状态并采集监控数据 // - 连接失败 → 发送离线通知 // - 连接成功 → 采集数据,检查阈值告警 final ServerMonitorNotificationConfig finalConfig = config; List> futures = allServers.stream() - .map(server -> CompletableFuture.supplyAsync(() -> + .map(server -> CompletableFuture.supplyAsync(() -> collectSingleServerWithStatusCheck(server, finalConfig))) .collect(Collectors.toList()); - + // 3. 等待所有任务完成 CompletableFuture allFutures = CompletableFuture.allOf( futures.toArray(new CompletableFuture[0]) ); allFutures.join(); - + // 4. 收集结果 List monitorDataList = futures.stream() .map(CompletableFuture::join) .filter(data -> data != null) .collect(Collectors.toList()); - + long duration = System.currentTimeMillis() - startTime; - log.info("========== 监控数据采集完成: 在线={}/{}, 耗时={}ms ==========", + log.info("========== 监控数据采集完成: 在线={}/{}, 耗时={}ms ==========", monitorDataList.size(), allServers.size(), duration); - + // 5. 批量保存监控数据到数据库 if (!monitorDataList.isEmpty()) { monitorService.batchSaveMonitorData(monitorDataList); log.info("监控数据已保存到数据库: count={}", monitorDataList.size()); } - + // 6. 检查告警规则(优化:只查询一次规则) if (!monitorDataList.isEmpty()) { // 一次性查询所有规则,避免 N 次数据库查询 List allRules = alertRuleRepository.findAll(); - log.debug("开始检查告警规则: 服务器数={}, 规则数={}", + log.debug("开始检查告警规则: 服务器数={}, 规则数={}", monitorDataList.size(), allRules.size()); - + for (ServerMonitorDataDTO data : monitorDataList) { try { alertService.checkAlertRules(data.getServerId(), data, allRules, config); @@ -136,7 +141,7 @@ public class ServerMonitorScheduler { } } } - + } catch (Exception e) { log.error("服务器监控数据采集失败", e); } finally { @@ -148,20 +153,41 @@ public class ServerMonitorScheduler { } } } - + /** * 检测服务器连接状态并采集监控数据 + * 统一使用 ServerService.testConnection() 方法进行连接测试和状态更新 */ private ServerMonitorDataDTO collectSingleServerWithStatusCheck(Server server, ServerMonitorNotificationConfig config) { try { - // 尝试采集监控数据 - return collectSingleServer(server); + // 1. 调用统一的连接测试方法(会自动更新服务器状态、硬件信息等) + ServerInfoDTO info = serverService.testConnection(server.getId()); + + // 2. 检查连接状态 + if (!info.getConnected()) { + // 连接失败(离线),发送离线通知 + log.error("服务器连接失败(离线): serverId={}, name={}, ip={}, error={}", + server.getId(), server.getServerName(), server.getHostIp(), info.getErrorMessage()); + + if (config != null && config.getNotificationChannelId() != null && config.getServerOfflineTemplateId() != null) { + try { + sendServerOfflineNotification(server, config); + } catch (Exception notifyError) { + log.error("发送服务器离线通知失败: serverId={}", server.getId(), notifyError); + } + } + + return null; + } + + // 3. 连接成功,采集监控数据 + return collectServerMonitorData(server); + } catch (Exception e) { - // 采集失败,说明服务器无法连接(离线) - log.error("服务器连接失败(离线): serverId={}, name={}, ip={}, error={}", + // 异常情况,发送离线通知 + log.error("服务器连接测试异常: serverId={}, name={}, ip={}, error={}", server.getId(), server.getServerName(), server.getHostIp(), e.getMessage()); - - // 发送离线通知 + if (config != null && config.getNotificationChannelId() != null && config.getServerOfflineTemplateId() != null) { try { sendServerOfflineNotification(server, config); @@ -169,11 +195,11 @@ public class ServerMonitorScheduler { log.error("发送服务器离线通知失败: serverId={}", server.getId(), notifyError); } } - + return null; } } - + /** * 发送服务器离线通知 */ @@ -185,40 +211,41 @@ public class ServerMonitorScheduler { templateParams.put("serverIp", server.getHostIp()); templateParams.put("offlineTime", LocalDateTime.now().format( java.time.format.DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))); - + // 2. 构建SendNotificationRequest SendNotificationRequest request = new SendNotificationRequest(); request.setChannelId(config.getNotificationChannelId()); request.setNotificationTemplateId(config.getServerOfflineTemplateId()); request.setTemplateParams(templateParams); - + // 3. 发送通知(NotificationService会自动根据渠道类型创建请求对象) notificationService.send(request); - - log.info("✅ 服务器离线通知已发送: serverId={}, serverName={}, ip={}", + + log.info("✅ 服务器离线通知已发送: serverId={}, serverName={}, ip={}", server.getId(), server.getServerName(), server.getHostIp()); } catch (Exception e) { log.error("发送服务器离线通知异常: serverId={}", server.getId(), e); throw e; } } - + /** - * 采集单台服务器的监控数据 + * 采集服务器监控数据(CPU、内存、磁盘使用率) + * 注意:此方法仅负责采集监控数据,不负责连接测试和状态更新 */ - private ServerMonitorDataDTO collectSingleServer(Server server) { + private ServerMonitorDataDTO collectServerMonitorData(Server server) throws Exception { SSHClient sshClient = null; ISSHCommandService sshService = null; - + try { // 1. 获取对应OS的SSH服务 sshService = sshCommandServiceFactory.getService(server.getOsType()); - + // 2. 创建SSH连接 String password = null; String privateKey = null; String passphrase = null; - + switch (server.getAuthType()) { case PASSWORD: password = server.getSshPassword(); @@ -228,7 +255,7 @@ public class ServerMonitorScheduler { passphrase = server.getSshPassphrase(); break; } - + sshClient = sshService.createConnection( server.getHostIp(), server.getSshPort(), @@ -237,12 +264,12 @@ public class ServerMonitorScheduler { privateKey, passphrase ); - + // 3. 采集监控数据 BigDecimal cpuUsage = sshService.getCpuUsage(sshClient); BigDecimal memoryUsage = sshService.getMemoryUsage(sshClient); List diskUsage = sshService.getDiskUsage(sshClient); - + // 4. 计算已用内存(基于内存使用率和总内存) Integer memoryUsed = null; if (memoryUsage != null && server.getMemorySize() != null) { @@ -250,7 +277,7 @@ public class ServerMonitorScheduler { .divide(new BigDecimal(100), 0, BigDecimal.ROUND_HALF_UP) .intValue(); } - + // 5. 构建监控数据 ServerMonitorDataDTO data = ServerMonitorDataDTO.builder() .serverId(server.getId()) @@ -260,17 +287,17 @@ public class ServerMonitorScheduler { .diskUsage(diskUsage) .collectTime(LocalDateTime.now()) .build(); - - log.debug("服务器监控数据采集成功: serverId={}, cpu={}%, mem={}%, diskCount={}", - server.getId(), cpuUsage, memoryUsage, + + log.debug("服务器监控数据采集成功: serverId={}, cpu={}%, mem={}%, diskCount={}", + server.getId(), cpuUsage, memoryUsage, diskUsage != null ? diskUsage.size() : 0); - + return data; - + } catch (Exception e) { - log.error("采集服务器监控数据失败: serverId={}, serverName={}, error={}", + log.error("采集服务器监控数据失败: serverId={}, serverName={}, error={}", server.getId(), server.getServerName(), e.getMessage()); - return null; + throw e; // 抛出异常让上层处理 } finally { // 6. 关闭SSH连接 if (sshService != null && sshClient != null) { @@ -278,19 +305,19 @@ public class ServerMonitorScheduler { } } } - + /** * 清理历史监控数据 * 此方法由定时任务管理系统调用,建议每天凌晨执行 */ public void cleanOldMonitorData() { log.info("========== 开始清理历史监控数据 =========="); - + try { // 删除30天前的数据 LocalDateTime thirtyDaysAgo = LocalDateTime.now().minusDays(30); int deletedCount = monitorService.deleteOldData(thirtyDaysAgo); - + log.info("========== 历史监控数据清理完成: count={} ==========", deletedCount); } catch (Exception e) { log.error("清理历史监控数据失败", e); diff --git a/backend/src/main/resources/db/changelog/changes/v1.0.0-schema.sql b/backend/src/main/resources/db/changelog/changes/v1.0.0-schema.sql index d0838c88..23d048d3 100644 --- a/backend/src/main/resources/db/changelog/changes/v1.0.0-schema.sql +++ b/backend/src/main/resources/db/changelog/changes/v1.0.0-schema.sql @@ -1200,9 +1200,9 @@ CREATE TABLE deploy_server_alert_rule -- 告警类型 alert_type VARCHAR(20) NOT NULL COMMENT '告警类型: CPU/MEMORY/DISK', - -- 阈值 - warning_threshold DECIMAL(5,2) NOT NULL COMMENT '警告阈值(%)', - critical_threshold DECIMAL(5,2) NOT NULL COMMENT '严重阈值(%)', + -- 阈值(支持百分比和绝对值:CPU/MEMORY/DISK为%,NETWORK为MB/s) + warning_threshold DECIMAL(10,2) NOT NULL COMMENT '警告阈值', + critical_threshold DECIMAL(10,2) NOT NULL COMMENT '严重阈值', -- 持续时间(避免误报) duration_minutes INT DEFAULT 5 COMMENT '持续时长(分钟)',