整理下初始化数据表

This commit is contained in:
dengqichen 2025-12-08 17:09:43 +08:00
parent c36ee0808c
commit 279c19ad7a
4 changed files with 101 additions and 73 deletions

View File

@ -12,6 +12,7 @@ import jakarta.servlet.http.HttpServletResponse;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.data.domain.Page; import org.springframework.data.domain.Page;
import org.springframework.validation.annotation.Validated; import org.springframework.validation.annotation.Validated;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestBody; import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController; import org.springframework.web.bind.annotation.RestController;
@ -38,18 +39,18 @@ public class ServerCategoryApiController
} }
@Override @Override
public Response<ServerCategoryDTO> update(Long aLong, ServerCategoryDTO dto) { public Response<ServerCategoryDTO> update(@PathVariable Long id, @Validated @RequestBody ServerCategoryDTO dto) {
return super.update(aLong, dto); return super.update(id, dto);
} }
@Override @Override
public Response<Void> delete(Long aLong) { public Response<Void> delete(@PathVariable Long id) {
return super.delete(aLong); return super.delete(id);
} }
@Override @Override
public Response<ServerCategoryDTO> findById(Long aLong) { public Response<ServerCategoryDTO> findById(@PathVariable Long id) {
return super.findById(aLong); return super.findById(id);
} }
@Override @Override

View File

@ -29,16 +29,16 @@ public class ServerAlertRuleDTO extends BaseDTO {
@NotNull(message = "监控指标类型不能为空") @NotNull(message = "监控指标类型不能为空")
private MonitorMetricEnum alertType; private MonitorMetricEnum alertType;
@Schema(description = "警告阈值(%)", required = true, example = "80.00") @Schema(description = "警告阈值(CPU/MEMORY/DISK为%，NETWORK为MB/s)", required = true, example = "80.00")
@NotNull(message = "警告阈值不能为空") @NotNull(message = "警告阈值不能为空")
@DecimalMin(value = "0.00", message = "警告阈值必须大于等于0") @DecimalMin(value = "0.00", message = "警告阈值必须大于等于0")
@DecimalMax(value = "100.00", message = "警告阈值不能超过100") @DecimalMax(value = "10000.00", message = "警告阈值不能超过10000")
private BigDecimal warningThreshold; private BigDecimal warningThreshold;
@Schema(description = "严重阈值(%)", required = true, example = "90.00") @Schema(description = "严重阈值(CPU/MEMORY/DISK为%，NETWORK为MB/s)", required = true, example = "90.00")
@NotNull(message = "严重阈值不能为空") @NotNull(message = "严重阈值不能为空")
@DecimalMin(value = "0.00", message = "严重阈值必须大于等于0") @DecimalMin(value = "0.00", message = "严重阈值必须大于等于0")
@DecimalMax(value = "100.00", message = "严重阈值不能超过100") @DecimalMax(value = "10000.00", message = "严重阈值不能超过10000")
private BigDecimal criticalThreshold; private BigDecimal criticalThreshold;
@Schema(description = "持续时长(分钟)", example = "5") @Schema(description = "持续时长(分钟)", example = "5")

View File

@ -1,5 +1,6 @@
package com.qqchen.deploy.backend.deploy.scheduler; package com.qqchen.deploy.backend.deploy.scheduler;
import com.qqchen.deploy.backend.deploy.dto.ServerInfoDTO;
import com.qqchen.deploy.backend.deploy.dto.ServerMonitorDataDTO; import com.qqchen.deploy.backend.deploy.dto.ServerMonitorDataDTO;
import com.qqchen.deploy.backend.deploy.dto.ServerMonitorNotificationConfig; import com.qqchen.deploy.backend.deploy.dto.ServerMonitorNotificationConfig;
import com.qqchen.deploy.backend.deploy.entity.Server; import com.qqchen.deploy.backend.deploy.entity.Server;
@ -9,6 +10,7 @@ import com.qqchen.deploy.backend.deploy.repository.IServerAlertRuleRepository;
import com.qqchen.deploy.backend.deploy.repository.IServerRepository; import com.qqchen.deploy.backend.deploy.repository.IServerRepository;
import com.qqchen.deploy.backend.deploy.service.IServerAlertService; import com.qqchen.deploy.backend.deploy.service.IServerAlertService;
import com.qqchen.deploy.backend.deploy.service.IServerMonitorService; import com.qqchen.deploy.backend.deploy.service.IServerMonitorService;
import com.qqchen.deploy.backend.deploy.service.IServerService;
import com.qqchen.deploy.backend.framework.dto.DiskUsageInfo; import com.qqchen.deploy.backend.framework.dto.DiskUsageInfo;
import com.qqchen.deploy.backend.framework.ssh.ISSHCommandService; import com.qqchen.deploy.backend.framework.ssh.ISSHCommandService;
import com.qqchen.deploy.backend.framework.ssh.SSHCommandServiceFactory; import com.qqchen.deploy.backend.framework.ssh.SSHCommandServiceFactory;
@ -34,35 +36,38 @@ import java.util.stream.Collectors;
@Slf4j @Slf4j
@Component @Component
public class ServerMonitorScheduler { public class ServerMonitorScheduler {
@Resource @Resource
private IServerRepository serverRepository; private IServerRepository serverRepository;
@Resource @Resource
private SSHCommandServiceFactory sshCommandServiceFactory; private SSHCommandServiceFactory sshCommandServiceFactory;
@Resource @Resource
private IServerMonitorService monitorService; private IServerMonitorService monitorService;
@Resource @Resource
private IServerAlertService alertService; private IServerAlertService alertService;
@Resource @Resource
private IServerAlertRuleRepository alertRuleRepository; private IServerAlertRuleRepository alertRuleRepository;
@Resource @Resource
private INotificationService notificationService; private INotificationService notificationService;
@Resource
private IServerService serverService;
/** /**
* 采集所有在线服务器的监控数据 * 采集所有在线服务器的监控数据
* 此方法由定时任务管理系统调用 * 此方法由定时任务管理系统调用
* *
* @param notificationChannelId 通知渠道ID（可选，为null则不发送通知） * @param notificationChannelId 通知渠道ID（可选，为null则不发送通知）
* @param serverOfflineTemplateId 服务器离线通知模板ID（可选） * @param serverOfflineTemplateId 服务器离线通知模板ID（可选）
* @param resourceAlertTemplateId 资源告警通知模板ID（可选） * @param resourceAlertTemplateId 资源告警通知模板ID（可选）
*/ */
public void collectServerMetrics(Long notificationChannelId, public void collectServerMetrics(Long notificationChannelId,
Long serverOfflineTemplateId, Long serverOfflineTemplateId,
Long resourceAlertTemplateId) { Long resourceAlertTemplateId) {
// 构建通知配置对象 // 构建通知配置对象
ServerMonitorNotificationConfig config = null; ServerMonitorNotificationConfig config = null;
@ -71,63 +76,63 @@ public class ServerMonitorScheduler {
config.setNotificationChannelId(notificationChannelId); config.setNotificationChannelId(notificationChannelId);
config.setServerOfflineTemplateId(serverOfflineTemplateId); config.setServerOfflineTemplateId(serverOfflineTemplateId);
config.setResourceAlertTemplateId(resourceAlertTemplateId); config.setResourceAlertTemplateId(resourceAlertTemplateId);
log.info("========== 开始采集服务器监控数据 ========== channelId={}, offlineTemplateId={}, alertTemplateId={}", log.info("========== 开始采集服务器监控数据 ========== channelId={}, offlineTemplateId={}, alertTemplateId={}",
notificationChannelId, serverOfflineTemplateId, resourceAlertTemplateId); notificationChannelId, serverOfflineTemplateId, resourceAlertTemplateId);
} else { } else {
log.info("========== 开始采集服务器监控数据(不发送通知) =========="); log.info("========== 开始采集服务器监控数据(不发送通知) ==========");
} }
long startTime = System.currentTimeMillis(); long startTime = System.currentTimeMillis();
try { try {
// 1. 查询所有服务器（不管当前状态），准备检测在线状态 // 1. 查询所有服务器（不管当前状态），准备检测在线状态
List<Server> allServers = serverRepository.findAll(); List<Server> allServers = serverRepository.findAll();
if (allServers.isEmpty()) { if (allServers.isEmpty()) {
log.debug("没有需要监控的服务器,跳过监控采集"); log.debug("没有需要监控的服务器,跳过监控采集");
return; return;
} }
log.info("发现 {} 台服务器,开始检测在线状态并采集监控数据", allServers.size()); log.info("发现 {} 台服务器,开始检测在线状态并采集监控数据", allServers.size());
// 2. 并发检测所有服务器的连接状态并采集监控数据 // 2. 并发检测所有服务器的连接状态并采集监控数据
// - 连接失败 → 发送离线通知 // - 连接失败 → 发送离线通知
// - 连接成功 → 采集数据，检查阈值告警 // - 连接成功 → 采集数据，检查阈值告警
final ServerMonitorNotificationConfig finalConfig = config; final ServerMonitorNotificationConfig finalConfig = config;
List<CompletableFuture<ServerMonitorDataDTO>> futures = allServers.stream() List<CompletableFuture<ServerMonitorDataDTO>> futures = allServers.stream()
.map(server -> CompletableFuture.supplyAsync(() -> .map(server -> CompletableFuture.supplyAsync(() ->
collectSingleServerWithStatusCheck(server, finalConfig))) collectSingleServerWithStatusCheck(server, finalConfig)))
.collect(Collectors.toList()); .collect(Collectors.toList());
// 3. 等待所有任务完成 // 3. 等待所有任务完成
CompletableFuture<Void> allFutures = CompletableFuture.allOf( CompletableFuture<Void> allFutures = CompletableFuture.allOf(
futures.toArray(new CompletableFuture[0]) futures.toArray(new CompletableFuture[0])
); );
allFutures.join(); allFutures.join();
// 4. 收集结果 // 4. 收集结果
List<ServerMonitorDataDTO> monitorDataList = futures.stream() List<ServerMonitorDataDTO> monitorDataList = futures.stream()
.map(CompletableFuture::join) .map(CompletableFuture::join)
.filter(data -> data != null) .filter(data -> data != null)
.collect(Collectors.toList()); .collect(Collectors.toList());
long duration = System.currentTimeMillis() - startTime; long duration = System.currentTimeMillis() - startTime;
log.info("========== 监控数据采集完成: 在线={}/{}, 耗时={}ms ==========", log.info("========== 监控数据采集完成: 在线={}/{}, 耗时={}ms ==========",
monitorDataList.size(), allServers.size(), duration); monitorDataList.size(), allServers.size(), duration);
// 5. 批量保存监控数据到数据库 // 5. 批量保存监控数据到数据库
if (!monitorDataList.isEmpty()) { if (!monitorDataList.isEmpty()) {
monitorService.batchSaveMonitorData(monitorDataList); monitorService.batchSaveMonitorData(monitorDataList);
log.info("监控数据已保存到数据库: count={}", monitorDataList.size()); log.info("监控数据已保存到数据库: count={}", monitorDataList.size());
} }
// 6. 检查告警规则（优化：只查询一次规则） // 6. 检查告警规则（优化：只查询一次规则）
if (!monitorDataList.isEmpty()) { if (!monitorDataList.isEmpty()) {
// 一次性查询所有规则，避免 N 次数据库查询 // 一次性查询所有规则，避免 N 次数据库查询
List<ServerAlertRule> allRules = alertRuleRepository.findAll(); List<ServerAlertRule> allRules = alertRuleRepository.findAll();
log.debug("开始检查告警规则: 服务器数={}, 规则数={}", log.debug("开始检查告警规则: 服务器数={}, 规则数={}",
monitorDataList.size(), allRules.size()); monitorDataList.size(), allRules.size());
for (ServerMonitorDataDTO data : monitorDataList) { for (ServerMonitorDataDTO data : monitorDataList) {
try { try {
alertService.checkAlertRules(data.getServerId(), data, allRules, config); alertService.checkAlertRules(data.getServerId(), data, allRules, config);
@ -136,7 +141,7 @@ public class ServerMonitorScheduler {
} }
} }
} }
} catch (Exception e) { } catch (Exception e) {
log.error("服务器监控数据采集失败", e); log.error("服务器监控数据采集失败", e);
} finally { } finally {
@ -148,20 +153,41 @@ public class ServerMonitorScheduler {
} }
} }
} }
/** /**
* 检测服务器连接状态并采集监控数据 * 检测服务器连接状态并采集监控数据
* 统一使用 ServerService.testConnection() 方法进行连接测试和状态更新
*/ */
private ServerMonitorDataDTO collectSingleServerWithStatusCheck(Server server, ServerMonitorNotificationConfig config) { private ServerMonitorDataDTO collectSingleServerWithStatusCheck(Server server, ServerMonitorNotificationConfig config) {
try { try {
// 尝试采集监控数据 // 1. 调用统一的连接测试方法（会自动更新服务器状态、硬件信息等）
return collectSingleServer(server); ServerInfoDTO info = serverService.testConnection(server.getId());
// 2. 检查连接状态
if (!info.getConnected()) {
// 连接失败（离线），发送离线通知
log.error("服务器连接失败(离线): serverId={}, name={}, ip={}, error={}",
server.getId(), server.getServerName(), server.getHostIp(), info.getErrorMessage());
if (config != null && config.getNotificationChannelId() != null && config.getServerOfflineTemplateId() != null) {
try {
sendServerOfflineNotification(server, config);
} catch (Exception notifyError) {
log.error("发送服务器离线通知失败: serverId={}", server.getId(), notifyError);
}
}
return null;
}
// 3. 连接成功，采集监控数据
return collectServerMonitorData(server);
} catch (Exception e) { } catch (Exception e) {
// 采集失败，说明服务器无法连接（离线） // 异常情况，发送离线通知
log.error("服务器连接失败(离线): serverId={}, name={}, ip={}, error={}", log.error("服务器连接测试异常: serverId={}, name={}, ip={}, error={}",
server.getId(), server.getServerName(), server.getHostIp(), e.getMessage()); server.getId(), server.getServerName(), server.getHostIp(), e.getMessage());
// 发送离线通知
if (config != null && config.getNotificationChannelId() != null && config.getServerOfflineTemplateId() != null) { if (config != null && config.getNotificationChannelId() != null && config.getServerOfflineTemplateId() != null) {
try { try {
sendServerOfflineNotification(server, config); sendServerOfflineNotification(server, config);
@ -169,11 +195,11 @@ public class ServerMonitorScheduler {
log.error("发送服务器离线通知失败: serverId={}", server.getId(), notifyError); log.error("发送服务器离线通知失败: serverId={}", server.getId(), notifyError);
} }
} }
return null; return null;
} }
} }
/** /**
* 发送服务器离线通知 * 发送服务器离线通知
*/ */
@ -185,40 +211,41 @@ public class ServerMonitorScheduler {
templateParams.put("serverIp", server.getHostIp()); templateParams.put("serverIp", server.getHostIp());
templateParams.put("offlineTime", LocalDateTime.now().format( templateParams.put("offlineTime", LocalDateTime.now().format(
java.time.format.DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))); java.time.format.DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")));
// 2. 构建SendNotificationRequest // 2. 构建SendNotificationRequest
SendNotificationRequest request = new SendNotificationRequest(); SendNotificationRequest request = new SendNotificationRequest();
request.setChannelId(config.getNotificationChannelId()); request.setChannelId(config.getNotificationChannelId());
request.setNotificationTemplateId(config.getServerOfflineTemplateId()); request.setNotificationTemplateId(config.getServerOfflineTemplateId());
request.setTemplateParams(templateParams); request.setTemplateParams(templateParams);
// 3. 发送通知（NotificationService会自动根据渠道类型创建请求对象） // 3. 发送通知（NotificationService会自动根据渠道类型创建请求对象）
notificationService.send(request); notificationService.send(request);
log.info("✅ 服务器离线通知已发送: serverId={}, serverName={}, ip={}", log.info("✅ 服务器离线通知已发送: serverId={}, serverName={}, ip={}",
server.getId(), server.getServerName(), server.getHostIp()); server.getId(), server.getServerName(), server.getHostIp());
} catch (Exception e) { } catch (Exception e) {
log.error("发送服务器离线通知异常: serverId={}", server.getId(), e); log.error("发送服务器离线通知异常: serverId={}", server.getId(), e);
throw e; throw e;
} }
} }
/** /**
* 采集单台服务器的监控数据 * 采集服务器监控数据（CPU、内存、磁盘使用率）
* 注意：此方法仅负责采集监控数据，不负责连接测试和状态更新
*/ */
private ServerMonitorDataDTO collectSingleServer(Server server) { private ServerMonitorDataDTO collectServerMonitorData(Server server) throws Exception {
SSHClient sshClient = null; SSHClient sshClient = null;
ISSHCommandService sshService = null; ISSHCommandService sshService = null;
try { try {
// 1. 获取对应OS的SSH服务 // 1. 获取对应OS的SSH服务
sshService = sshCommandServiceFactory.getService(server.getOsType()); sshService = sshCommandServiceFactory.getService(server.getOsType());
// 2. 创建SSH连接 // 2. 创建SSH连接
String password = null; String password = null;
String privateKey = null; String privateKey = null;
String passphrase = null; String passphrase = null;
switch (server.getAuthType()) { switch (server.getAuthType()) {
case PASSWORD: case PASSWORD:
password = server.getSshPassword(); password = server.getSshPassword();
@ -228,7 +255,7 @@ public class ServerMonitorScheduler {
passphrase = server.getSshPassphrase(); passphrase = server.getSshPassphrase();
break; break;
} }
sshClient = sshService.createConnection( sshClient = sshService.createConnection(
server.getHostIp(), server.getHostIp(),
server.getSshPort(), server.getSshPort(),
@ -237,12 +264,12 @@ public class ServerMonitorScheduler {
privateKey, privateKey,
passphrase passphrase
); );
// 3. 采集监控数据 // 3. 采集监控数据
BigDecimal cpuUsage = sshService.getCpuUsage(sshClient); BigDecimal cpuUsage = sshService.getCpuUsage(sshClient);
BigDecimal memoryUsage = sshService.getMemoryUsage(sshClient); BigDecimal memoryUsage = sshService.getMemoryUsage(sshClient);
List<DiskUsageInfo> diskUsage = sshService.getDiskUsage(sshClient); List<DiskUsageInfo> diskUsage = sshService.getDiskUsage(sshClient);
// 4. 计算已用内存（基于内存使用率和总内存） // 4. 计算已用内存（基于内存使用率和总内存）
Integer memoryUsed = null; Integer memoryUsed = null;
if (memoryUsage != null && server.getMemorySize() != null) { if (memoryUsage != null && server.getMemorySize() != null) {
@ -250,7 +277,7 @@ public class ServerMonitorScheduler {
.divide(new BigDecimal(100), 0, BigDecimal.ROUND_HALF_UP) .divide(new BigDecimal(100), 0, BigDecimal.ROUND_HALF_UP)
.intValue(); .intValue();
} }
// 5. 构建监控数据 // 5. 构建监控数据
ServerMonitorDataDTO data = ServerMonitorDataDTO.builder() ServerMonitorDataDTO data = ServerMonitorDataDTO.builder()
.serverId(server.getId()) .serverId(server.getId())
@ -260,17 +287,17 @@ public class ServerMonitorScheduler {
.diskUsage(diskUsage) .diskUsage(diskUsage)
.collectTime(LocalDateTime.now()) .collectTime(LocalDateTime.now())
.build(); .build();
log.debug("服务器监控数据采集成功: serverId={}, cpu={}%, mem={}%, diskCount={}", log.debug("服务器监控数据采集成功: serverId={}, cpu={}%, mem={}%, diskCount={}",
server.getId(), cpuUsage, memoryUsage, server.getId(), cpuUsage, memoryUsage,
diskUsage != null ? diskUsage.size() : 0); diskUsage != null ? diskUsage.size() : 0);
return data; return data;
} catch (Exception e) { } catch (Exception e) {
log.error("采集服务器监控数据失败: serverId={}, serverName={}, error={}", log.error("采集服务器监控数据失败: serverId={}, serverName={}, error={}",
server.getId(), server.getServerName(), e.getMessage()); server.getId(), server.getServerName(), e.getMessage());
return null; throw e; // 抛出异常让上层处理
} finally { } finally {
// 6. 关闭SSH连接 // 6. 关闭SSH连接
if (sshService != null && sshClient != null) { if (sshService != null && sshClient != null) {
@ -278,19 +305,19 @@ public class ServerMonitorScheduler {
} }
} }
} }
/** /**
* 清理历史监控数据 * 清理历史监控数据
* 此方法由定时任务管理系统调用，建议每天凌晨执行 * 此方法由定时任务管理系统调用，建议每天凌晨执行
*/ */
public void cleanOldMonitorData() { public void cleanOldMonitorData() {
log.info("========== 开始清理历史监控数据 =========="); log.info("========== 开始清理历史监控数据 ==========");
try { try {
// 删除30天前的数据 // 删除30天前的数据
LocalDateTime thirtyDaysAgo = LocalDateTime.now().minusDays(30); LocalDateTime thirtyDaysAgo = LocalDateTime.now().minusDays(30);
int deletedCount = monitorService.deleteOldData(thirtyDaysAgo); int deletedCount = monitorService.deleteOldData(thirtyDaysAgo);
log.info("========== 历史监控数据清理完成: count={} ==========", deletedCount); log.info("========== 历史监控数据清理完成: count={} ==========", deletedCount);
} catch (Exception e) { } catch (Exception e) {
log.error("清理历史监控数据失败", e); log.error("清理历史监控数据失败", e);

View File

@ -1200,9 +1200,9 @@ CREATE TABLE deploy_server_alert_rule
-- 告警类型 -- 告警类型
alert_type VARCHAR(20) NOT NULL COMMENT '告警类型: CPU/MEMORY/DISK', alert_type VARCHAR(20) NOT NULL COMMENT '告警类型: CPU/MEMORY/DISK',
-- 阈值 -- 阈值（支持百分比和绝对值：CPU/MEMORY/DISK为%，NETWORK为MB/s）
warning_threshold DECIMAL(5,2) NOT NULL COMMENT '警告阈值(%)', warning_threshold DECIMAL(10,2) NOT NULL COMMENT '警告阈值',
critical_threshold DECIMAL(5,2) NOT NULL COMMENT '严重阈值(%)', critical_threshold DECIMAL(10,2) NOT NULL COMMENT '严重阈值',
-- 持续时间(避免误报) -- 持续时间(避免误报)
duration_minutes INT DEFAULT 5 COMMENT '持续时长(分钟)', duration_minutes INT DEFAULT 5 COMMENT '持续时长(分钟)',