diff --git a/CHANGES.md b/CHANGES.md index c1e21103a..fa74530a9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,9 @@ # Changes +## Version 5.0.1 (Not yet released) + +* Improve hang preventing task - Issue #544 + ## Version 5.0.0 (Not yet released) * Make priority granularity configurable - Issue #599 diff --git a/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/JmxProxy.java b/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/JmxProxy.java index 0cd16371c..19b0a6b24 100644 --- a/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/JmxProxy.java +++ b/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/JmxProxy.java @@ -108,4 +108,13 @@ public interface JmxProxy extends Closeable * @return The repaired ratio or 0 if it cannot be determined. */ double getPercentRepaired(TableReference tableReference); + + /** + * Retrieves the current operational status of the local Cassandra node via JMX. + * Returns a string indicating the node's state (e.g., "NORMAL", "JOINING", "LEAVING", "MOVING") + * or "Unknown" if the status is undeterminable. + * + * @return A string representing the node's status. + */ + String getNodeStatus(); } diff --git a/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/JmxProxyFactoryImpl.java b/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/JmxProxyFactoryImpl.java index b84057f52..3e457eca2 100644 --- a/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/JmxProxyFactoryImpl.java +++ b/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/JmxProxyFactoryImpl.java @@ -308,6 +308,20 @@ public double getPercentRepaired(final TableReference tableReference) } return 0.0; } + + @Override + public String getNodeStatus() + { + try + { + return (String) myMbeanServerConnection.getAttribute(myStorageServiceObject, "OperationMode"); + } + catch (Exception e) + { + LOG.error("Unable to retrieve node status {}", e.getMessage()); + return "Unknown"; + } + } } public static Builder builder() diff --git a/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/repair/RepairTask.java b/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/repair/RepairTask.java index 340b55636..52ac7b114 100644 --- a/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/repair/RepairTask.java +++ b/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/repair/RepairTask.java @@ -46,7 +46,7 @@ public abstract class RepairTask implements NotificationListener { private static final Logger LOG = LoggerFactory.getLogger(RepairTask.class); private static final Pattern RANGE_PATTERN = Pattern.compile("\\((-?[0-9]+),(-?[0-9]+)\\]"); - private static final long HANG_PREVENT_TIME_IN_MINUTES = 30; + private static final int HEALTH_CHECK_INTERVAL = 10; private final ScheduledExecutorService myExecutor = Executors.newSingleThreadScheduledExecutor( new ThreadFactoryBuilder().setNameFormat("HangPreventingTask-%d").build()); private final CountDownLatch myLatch = new CountDownLatch(1); @@ -259,7 +259,8 @@ private void rescheduleHangPrevention() { myHangPreventFuture.cancel(false); } - myHangPreventFuture = myExecutor.schedule(new HangPreventingTask(), HANG_PREVENT_TIME_IN_MINUTES, + // Schedule the first check to happen after 10 minutes + myHangPreventFuture = myExecutor.schedule(new HangPreventingTask(), HEALTH_CHECK_INTERVAL, TimeUnit.MINUTES); } @@ -364,21 +365,46 @@ public enum ProgressEventType private class HangPreventingTask implements Runnable { + private static final int MAX_CHECKS = 3; + private static final String NORMAL_STATUS = "NORMAL"; + private int checkCount = 0; + @Override public void run() { try (JmxProxy proxy = myJmxProxyFactory.connect()) { - proxy.forceTerminateAllRepairSessions(); + if (checkCount < MAX_CHECKS) + { + String nodeStatus = proxy.getNodeStatus(); + if (!NORMAL_STATUS.equals(nodeStatus)) + { + LOG.error("Local Cassandra node is down, aborting repair task."); + myLastError = new ScheduledJobException("Local Cassandra node is down"); + proxy.forceTerminateAllRepairSessions(); + myLatch.countDown(); // Signal to abort the repair task + } + else + { + checkCount++; + myHangPreventFuture = myExecutor.schedule(this, HEALTH_CHECK_INTERVAL, TimeUnit.MINUTES); + } + } + else + { + // After 3 successful checks or 30 minutes if still task is running terminate all repair sessions + proxy.forceTerminateAllRepairSessions(); + myLatch.countDown(); + } } catch (IOException e) { - LOG.error("Unable to prevent hanging repair task: {}", this, e); + LOG.error("Unable to check node status or prevent hanging repair task: {}", this, e); } - myLatch.countDown(); } } + @VisibleForTesting final Set getFailedRanges() { diff --git a/core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/repair/TestRepairGroupTasks.java b/core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/repair/TestRepairGroupTasks.java index 5f91bcb15..e38f572c6 100644 --- a/core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/repair/TestRepairGroupTasks.java +++ b/core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/repair/TestRepairGroupTasks.java @@ -25,6 +25,7 @@ import java.net.InetAddress; import java.net.UnknownHostException; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -254,13 +255,13 @@ public void addStorageServiceListener(NotificationListener listener) @Override public List getLiveNodes() { - throw new UnsupportedOperationException(); + return Collections.emptyList(); } @Override public List getUnreachableNodes() { - throw new UnsupportedOperationException(); + return Collections.emptyList(); } @Override @@ -286,7 +287,7 @@ public void removeStorageServiceListener(NotificationListener listener) @Override public long liveDiskSpaceUsed(TableReference tableReference) { - throw new UnsupportedOperationException(); + return 5; } @Override @@ -301,6 +302,12 @@ public double getPercentRepaired(TableReference tableReference) throw new UnsupportedOperationException(); } + @Override + public String getNodeStatus() + { + return "NORMAL"; + } + @Override public void close() { diff --git a/core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/repair/TestUtils.java b/core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/repair/TestUtils.java index 48216c9ae..1d0301b39 100644 --- a/core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/repair/TestUtils.java +++ b/core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/repair/TestUtils.java @@ -327,6 +327,12 @@ public double getPercentRepaired(TableReference tableReference) throw new UnsupportedOperationException(); } + @Override + public String getNodeStatus() + { + return "NORMAL"; + } + public void notify(Notification notification) { myListener.handleNotification(notification, null);