Loading...

Type: Bug
Resolution: Fixed
Priority: Low
Fix Version/s: 5.9.11
Affects Version/s: 6.0.0, 5.10.0
Component/s: Data Center - Core
Labels:

The logDetails() method in HazelcastClusterSafetyManager contains a log bug that can prevent node panic.

private void logDetails(int nextValue) {
        if (log.isDebugEnabled()) {
            final Optional<Integer> dbSafetyNumber = getDbSafetyNumber();
            final Optional<Integer> cacheSafetyNumber = getCacheSafetyNumber();
            final Optional<String> lastCacheModifier = getLastCacheModifier();

            final String dbSafetyNumberString = dbSafetyNumber.isPresent()
                    ? String.valueOf(dbSafetyNumber.get())
                    : "not found";
            log.debug("Database number exists [ {} ] [ {} ]", dbSafetyNumber.isPresent(), dbSafetyNumberString);

            final String cacheSafetyNumberString = cacheSafetyNumber.isPresent()
                    ? String.valueOf(cacheSafetyNumber.get())
                    : "not found";
            log.debug("Cached number exists [ {} ] [ {} ], last modifier: [ {} ]", cacheSafetyNumber.isPresent(), cacheSafetyNumberString, lastCacheModifier.get());

            if (dbSafetyNumber.isPresent() && cacheSafetyNumber.isPresent()) {
                log.debug("Database number: {} should equal cached number: {}", dbSafetyNumberString, getCacheSafetyNumber());
            }
            log.debug("Next value: {}", nextValue);
        }
    }

Statement

log.debug("Cached number exists [ {} ] [ {} ], last modifier: [ {} ]", cacheSafetyNumber.isPresent(), cacheSafetyNumberString, lastCacheModifier.get());

will throw java.lang.IllegalStateException if the lastCacheModifier value is absent.

As a result, logDetails() throws an exception and prevents node panic:

@Override
    public void verify() {
        final int nextValue = getNextValue();
        final Optional<String> lastCacheModifier = getLastCacheModifier();
        final Optional<Integer> dbSafetyNumber = getDbSafetyNumber();
        final Optional<Integer> cacheSafetyNumber = getCacheSafetyNumber();

        if (dbSafetyNumber.isPresent() && cacheSafetyNumber.isPresent()) {
            if (!dbSafetyNumber.equals(cacheSafetyNumber)) {
                log.warn("detected different number in database [ {} ] and cache [ {} ]. Cache number last set by [ {} ]. Triggering panic on current node", dbSafetyNumber.get(), cacheSafetyNumber.get(), lastCacheModifier.get());

                logDetails(nextValue);

                panic();  ///THIS NEVER HAPPENS IF logDetails() throws exception
                return;
            }
        } else if (dbSafetyNumber.isPresent())
            log.debug("found cluster safety number in database [ {} ] but not in cache", dbSafetyNumber.get());
        else if (cacheSafetyNumber.isPresent())
            log.debug("found cluster safety number in cache [ {} ] but not in database", getCacheSafetyNumber());

        logDetails(nextValue);
        clusterSafetyDao.setSafetyNumber(nextValue);
        storeCacheNumber(nextValue);

        sanityCheck(getDbSafetyNumber(), getCacheSafetyNumber(), nextValue);
    }

Issue was reproduced locally:

2016-04-27 15:02:20,792 ERROR [Caesium-1-2] [impl.schedule.caesium.JobRunnerWrapper] runJob Scheduled job ClusterSafetyJob#ClusterSafetyJob failed to run
java.lang.IllegalStateException: value is absent
	at com.google.common.base.Optional$Absent.get(Optional.java:263)
	at com.atlassian.confluence.cluster.hazelcast.HazelcastClusterSafetyManager.logDetails(HazelcastClusterSafetyManager.java:112)
	at com.atlassian.confluence.cluster.hazelcast.HazelcastClusterSafetyManager.verify(HazelcastClusterSafetyManager.java:81)
	at sun.reflect.GeneratedMethodAccessor662.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:497)
	at org.springframework.aop.support.AopUtils.invokeJoinpointUsingReflection(AopUtils.java:302)
	at org.springframework.aop.framework.ReflectiveMethodInvocation.invokeJoinpoint(ReflectiveMethodInvocation.java:190)
	at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:157)
	at org.springframework.transaction.interceptor.TransactionInterceptor$1.proceedWithInvocation(TransactionInterceptor.java:99)
	at org.springframework.transaction.interceptor.TransactionAspectSupport.invokeWithinTransaction(TransactionAspectSupport.java:281)
	at org.springframework.transaction.interceptor.TransactionInterceptor.invoke(TransactionInterceptor.java:96)
	at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:179)
	at com.atlassian.spring.interceptors.SpringProfilingInterceptor.invoke(SpringProfilingInterceptor.java:16)
	at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:179)
	at com.atlassian.confluence.util.profiling.ConfluenceMonitoringMethodInterceptor.invoke(ConfluenceMonitoringMethodInterceptor.java:34)
	at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:179)
	at org.springframework.aop.framework.JdkDynamicAopProxy.invoke(JdkDynamicAopProxy.java:208)
	at com.sun.proxy.$Proxy128.verify(Unknown Source)
	at com.atlassian.confluence.cluster.safety.ClusterSafetyJob.runJob(ClusterSafetyJob.java:19)
	at com.atlassian.confluence.impl.schedule.caesium.JobRunnerWrapper.lambda$runJob$829(JobRunnerWrapper.java:71)
	at com.atlassian.confluence.impl.schedule.caesium.JobRunnerWrapper$$Lambda$451/482921092.perform(Unknown Source)
	at com.atlassian.confluence.impl.vcache.VCacheRequestContextManager.doInRequestContext(VCacheRequestContextManager.java:82)
	at com.atlassian.confluence.impl.vcache.VCacheRequestContextManager.doInRequestContext(VCacheRequestContextManager.java:72)
	at com.atlassian.confluence.impl.schedule.caesium.JobRunnerWrapper.runJob(JobRunnerWrapper.java:71)
	at com.atlassian.scheduler.core.JobLauncher.runJob(JobLauncher.java:153)
	at com.atlassian.scheduler.core.JobLauncher.launchAndBuildResponse(JobLauncher.java:118)
	at com.atlassian.scheduler.core.JobLauncher.launch(JobLauncher.java:97)
	at com.atlassian.scheduler.caesium.impl.CaesiumSchedulerService.launchJob(CaesiumSchedulerService.java:401)
	at com.atlassian.scheduler.caesium.impl.CaesiumSchedulerService.executeClusteredJob(CaesiumSchedulerService.java:396)
	at com.atlassian.scheduler.caesium.impl.CaesiumSchedulerService.executeQueuedJob(CaesiumSchedulerService.java:349)
	at com.atlassian.scheduler.caesium.impl.CaesiumSchedulerService$1.consume(CaesiumSchedulerService.java:255)
	at com.atlassian.scheduler.caesium.impl.CaesiumSchedulerService$1.consume(CaesiumSchedulerService.java:252)
	at com.atlassian.scheduler.caesium.impl.SchedulerQueueWorker.executeJob(SchedulerQueueWorker.java:65)
	at com.atlassian.scheduler.caesium.impl.SchedulerQueueWorker.executeNextJob(SchedulerQueueWorker.java:59)
	at com.atlassian.scheduler.caesium.impl.SchedulerQueueWorker.run(SchedulerQueueWorker.java:34)
	at java.lang.Thread.run(Thread.java:745)

Testing notes:

set up CDC locally (2 nodes)
set logging level to debug for com.atlassian.confluence.cluster.hazelcast.HazelcastClusterSafetyManager

emulate network outage between nodes. I used pfctl utility for that:

sudo pfctl -e
(sudo pfctl -sr; echo "block drop quick on lo0 proto tcp from any to any port = 5802") | sudo pfctl -f -
(sudo pfctl -sr; echo "block drop quick on lo0 proto tcp from any to any port = 5801") | sudo pfctl -f -
sudo pfctl -v -s rules

To clean up filtering:

sudo pfctl -f /etc/pf.conf

one node should panic on when safety job runs.

mentioned in: Page Loading...; Page Loading...; Page Loading...

Details

Description

Attachments

Issue Links

Activity

People

Dates