关于otter监控告警使用

一、背景

近期在使用otter完成单机房单向同步时，常常遇到channel假死的情况，导致Pipeline同步停止，系统表数据同步停止，影响生产环境中用户数据查询相关的功能。虽然事后能够通过先停用channel再重新启用channel来恢复同步任务，补齐需要同步的数据，但该问题时常出现且不能及时发现，让人头疼不已。通过查询相关资料，发现早有同学遇到过类似的问题，初步认为是otter的调度算法出现死锁导致任务停止，但目前仍然没有实锤，且otter开源团队尚未对该问题进行官方解答。具体问题描述可参考：https://github.com/alibaba/otter/issues/911

为了解决该问题，曾尝试在线下复现，但以失败告终，于是换了一个思路：是否能及时发现同步停止的问题呢？按照这个思路，我看到官方目前其实支持五种同步告警：延迟、Pipeline延迟、Process延迟、Position延迟、异常监控告警，同时还提供了告警自我恢复机制。通过测试其中的Pipeline延迟告警并开启自我恢复机制，发现确实能够及时地完成告警，而且在触发自我恢复阈值的情况下，系统能够自动完成channel的stop，然后自动再start，使channel的同步任务恢复正常。现就otter的监控告警机制进行说明。

二、otter监控机制解析：

首先看一下otter监控机制的流程：1.首先我们通过otter的控制台为channel下的Pipeline配置监控。2.otter-manager在启动时会启用一个单线程的定时任务线程池，定时任务每120秒执行一次。2.1 该定时任务在执行时会查询当前所有启用的监控记录；2.2 然后对监控规则按照Pipeline进行分组；2.3 遍历分组后的具体规则；2.4 查询Pipeline信息，然后根据Pipeline中的channelId查询zookeeper上对应channel的状态，若channel的状态节点为null或者处于停止状态，则不执行后续监控逻辑；2.5 根据不同的监控类型执行具体的判断逻辑：若当前统计的数据满足监控告警的条件则执行告警逻辑，若开启了自我恢复机制则尝试恢复channel的同步任务。源码分析如下：

#一、SelfMonitor中的start()开启监控
private synchronized void start() {if (executor == null) {// 创建定时任务线程池，单线程executor = new ScheduledThreadPoolExecutor(DEFAULT_POOL, new NamedThreadFactory("Self-Monitor"),new ThreadPoolExecutor.CallerRunsPolicy());}if (future == null) {// 每120秒执行一次future = executor.scheduleWithFixedDelay(new Runnable() {public void run() {try {// 调用GlobalMonitor#explore()monitor.explore();} catch (Exception e) {log.error("self-monitor failed.", e);}}}, interval, interval, TimeUnit.SECONDS);}}#二、GlobalMonitor#explore() 获取监控列表，默认是并行执行监控任务
public void explore() {// 查询了所有启用的监控列表通知根据Pipeline进行分组Map<Long, List<AlarmRule>> rules = alarmRuleService.getAlarmRules(AlarmRuleStatus.ENABLE);if (!CollectionUtils.isEmpty(rules)) {if (needConcurrent) {concurrentProcess(rules);} else {// 串行serialProcess(rules);}} else {log.warn("no enabled alarm rule at all. Check the rule setting please!");}// 自动恢复机制if (recoveryPaused) {List<Long> channelIds = channelService.listAllChannelId();if (needConcurrent) {concurrentProcess(channelIds);} else {// 串行serialProcess(channelIds);}}}#三、GlobalMonitor#concurrentProcess()将每个Pipeline的监控列表提交给线程池去执行
private void concurrentProcess(Map<Long, List<AlarmRule>> rules) {ExecutorCompletionService completionExecutor = new ExecutorCompletionService(executor);List<Future> futures = new ArrayList<Future>();for (Entry<Long, List<AlarmRule>> entry : rules.entrySet()) {final List<AlarmRule> alarmRules = entry.getValue();futures.add(completionExecutor.submit(new Callable<Object>() {@Overridepublic Object call() throws Exception {pipelineMonitor.explore(alarmRules);return null;}}));}List<Throwable> exceptions = new ArrayList<Throwable>();int index = 0;int size = futures.size();while (index < size) {try {Future<?> future = completionExecutor.take();future.get();} catch (InterruptedException e) {exceptions.add(e);} catch (ExecutionException e) {exceptions.add(e);}index++;}if (!exceptions.isEmpty()) {StringBuilder sb = new StringBuilder(exceptions.size() + " exception happens in global monitor\n");sb.append("exception stack start :\n");for (Throwable t : exceptions) {sb.append(ExceptionUtils.getStackTrace(t));}sb.append("exception stack end \n");throw new IllegalStateException(sb.toString());}}#四、PipelineMonitor#explore(List<AlarmRule> rules)完成不同类型的监控分发执行
public void explore(List<AlarmRule> rules) {Long pipelineId = rules.get(0).getPipelineId();Pipeline pipeline = pipelineService.findById(pipelineId);// 如果处于stop状态，则忽略报警ChannelStatus status = arbitrateManageService.channelEvent().status(pipeline.getChannelId());if (status == null || status.isStop()) {return;}List<AlarmRule> delayTimeRules = new LinkedList<AlarmRule>();List<AlarmRule> exceptonRules = new LinkedList<AlarmRule>();List<AlarmRule> pipelineTimeoutRules = new LinkedList<AlarmRule>();List<AlarmRule> processTimeoutRules = new LinkedList<AlarmRule>();List<AlarmRule> positionTimeoutRules = new LinkedList<AlarmRule>();Date now = new Date();for (AlarmRule rule : rules) {switch (rule.getMonitorName()) {case DELAYTIME:if (checkEnable(rule, now)) {delayTimeRules.add(rule);}break;case EXCEPTION:if (checkEnable(rule, now)) {exceptonRules.add(rule);}break;case PIPELINETIMEOUT:if (checkEnable(rule, now)) {pipelineTimeoutRules.add(rule);}break;case PROCESSTIMEOUT:if (checkEnable(rule, now)) {processTimeoutRules.add(rule);}break;case POSITIONTIMEOUT:if (checkEnable(rule, now)) {positionTimeoutRules.add(rule);}break;default:break;}}if (!delayTimeRules.isEmpty()) {delayStatRuleMonitor.explore(delayTimeRules);}if (!pipelineTimeoutRules.isEmpty()) {pipelineTimeoutRuleMonitor.explore(pipelineTimeoutRules);}if (!processTimeoutRules.isEmpty()) {processTimeoutRuleMonitor.explore(processTimeoutRules);}if (!positionTimeoutRules.isEmpty()) {positionTimeoutRuleMonitor.explore(positionTimeoutRules);}}#五、具体的监控实现类完成监控规则检查实施告警和自我恢复机制处理