Commit b3ec3531 by lepdou

refactor env health check logs

parent 56e2d45e
......@@ -28,9 +28,9 @@ import javax.annotation.PostConstruct;
@Component
public class PortalSettings {
private Logger logger = LoggerFactory.getLogger(PortalSettings.class);
private static final Logger logger = LoggerFactory.getLogger(PortalSettings.class);
private static final int HEALTH_CHECK_INTERVAL = 10 * 1000;
private static final String DEFAULT_SUPPORT_ENV_LIST = "FAT,UAT,PRO";
@Autowired
......@@ -44,14 +44,10 @@ public class PortalSettings {
//mark env up or down
private Map<Env, Boolean> envStatusMark = new ConcurrentHashMap<>();
private ScheduledExecutorService healthCheckService;
@PostConstruct
private void postConstruct() {
//初始化portal支持操作的环境集合,线上的portal可能支持所有的环境操作,而线下环境则支持一部分.
// 每个环境的portal支持哪些环境配置在数据库里
String serverConfig = serverConfigService.getValue("apollo.portal.envs", "FAT,UAT,PRO");
String serverConfig = serverConfigService.getValue("apollo.portal.envs", DEFAULT_SUPPORT_ENV_LIST);
String[] configedEnvs = serverConfig.split(",");
List<String> allStrEnvs = Arrays.asList(configedEnvs);
for (String e : allStrEnvs) {
......@@ -62,7 +58,7 @@ public class PortalSettings {
envStatusMark.put(env, true);
}
healthCheckService = Executors.newScheduledThreadPool(1);
ScheduledExecutorService healthCheckService = Executors.newScheduledThreadPool(1);
healthCheckService
.scheduleWithFixedDelay(new HealthCheckTask(applicationContext), 1000, HEALTH_CHECK_INTERVAL,
......@@ -86,16 +82,16 @@ public class PortalSettings {
class HealthCheckTask implements Runnable {
private static final int ENV_DIED_THREADHOLD = 2;
private static final int ENV_DOWN_THRESHOLD = 2;
private Map<Env, Long> healthCheckFailCnt = new HashMap<>();
private Map<Env, Integer> healthCheckFailedCounter = new HashMap<>();
private AdminServiceAPI.HealthAPI healthAPI;
public HealthCheckTask(ApplicationContext context) {
healthAPI = context.getBean(AdminServiceAPI.HealthAPI.class);
for (Env env : allEnvs) {
healthCheckFailCnt.put(env, 0l);
healthCheckFailedCounter.put(env, 0);
}
}
......@@ -107,17 +103,17 @@ public class PortalSettings {
//revive
if (!envStatusMark.get(env)) {
envStatusMark.put(env, true);
healthCheckFailCnt.put(env, 0l);
logger.info("env up again [env:{}]", env);
healthCheckFailedCounter.put(env, 0);
logger.info("Env revived because env health check success. env: {}", env);
}
} else {
//maybe meta server up but admin server down
logger.warn("Env health check failed, maybe because of admin server down. env: {}", env);
handleEnvDown(env);
}
} catch (Exception e) {
//maybe meta server down
logger.warn("health check fail. [env:{}]", env, e.getMessage());
logger.warn("Env health check failed, maybe because of meta server down "
+ "or config error meta server address. env: {}", env);
handleEnvDown(env);
}
}
......@@ -130,17 +126,19 @@ public class PortalSettings {
}
private void handleEnvDown(Env env) {
long failCnt = healthCheckFailCnt.get(env);
healthCheckFailCnt.put(env, ++failCnt);
int failedTimes = healthCheckFailedCounter.get(env);
healthCheckFailedCounter.put(env, ++failedTimes);
if (!envStatusMark.get(env)) {
logger.warn("[env:{}] down yet.", env);
logger.error("Env is down. env: {}, failed times: {}", env, failedTimes);
} else {
if (failCnt >= ENV_DIED_THREADHOLD) {
if (failedTimes >= ENV_DOWN_THRESHOLD) {
envStatusMark.put(env, false);
logger.error("env turn to down [env:{}]", env);
logger.error("Env is down because health check failed for {} times, "
+ "which equals to down threshold. env: {}", ENV_DOWN_THRESHOLD, env);
} else {
logger.warn("env health check fail first time. [env:{}]", env);
logger.warn("Env health check failed for {} times which less than down threshold. down threshold:{}, env: {}",
failedTimes, ENV_DOWN_THRESHOLD, env);
}
}
......
......@@ -8,6 +8,8 @@ import com.ctrip.framework.apollo.core.enums.Env;
import com.ctrip.framework.apollo.portal.PortalSettings;
import com.dianping.cat.Cat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.web.HttpMessageConverters;
import org.springframework.http.client.HttpComponentsClientHttpRequestFactory;
......@@ -35,6 +37,7 @@ public class AdminServiceAddressLocator {
private static final long OFFLINE_REFRESH_INTERVAL = 10 * 1000;
private static final int RETRY_TIMES = 3;
private static final String ADMIN_SERVICE_URL_PATH = "/services/admin";
private static final Logger logger = LoggerFactory.getLogger(AdminServiceAddressLocator.class);
private ScheduledExecutorService refreshServiceAddressService;
private RestTemplate restTemplate;
......@@ -91,10 +94,12 @@ public class AdminServiceAddressLocator {
refreshSuccess = refreshSuccess && currentEnvRefreshResult;
}
if (refreshSuccess){
refreshServiceAddressService.schedule(new RefreshAdminServerAddressTask(), NORMAL_REFRESH_INTERVAL, TimeUnit.MILLISECONDS);
if (refreshSuccess) {
refreshServiceAddressService
.schedule(new RefreshAdminServerAddressTask(), NORMAL_REFRESH_INTERVAL, TimeUnit.MILLISECONDS);
} else {
refreshServiceAddressService.schedule(new RefreshAdminServerAddressTask(), OFFLINE_REFRESH_INTERVAL, TimeUnit.MILLISECONDS);
refreshServiceAddressService
.schedule(new RefreshAdminServerAddressTask(), OFFLINE_REFRESH_INTERVAL, TimeUnit.MILLISECONDS);
}
}
}
......@@ -110,9 +115,11 @@ public class AdminServiceAddressLocator {
}
cache.put(env, Arrays.asList(services));
return true;
} catch (Throwable e) {//meta server error
Cat.logError("get admin server address fail", e);
continue;
} catch (Throwable e) {
logger.error(String.format("Get admin server address from meta server failed. env: %s, meta server address:%s",
env, MetaDomainConsts.getDomain(env)), e);
Cat.logError(String.format("Get admin server address from meta server failed. env: %s, meta server address:%s",
env, MetaDomainConsts.getDomain(env)), e);
}
}
return false;
......
......@@ -10,6 +10,8 @@ import com.dianping.cat.message.Transaction;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.conn.HttpHostConnectException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.ParameterizedTypeReference;
import org.springframework.http.HttpMethod;
......@@ -32,6 +34,8 @@ import javax.annotation.PostConstruct;
@Component
public class RetryableRestTemplate {
private Logger logger = LoggerFactory.getLogger(RetryableRestTemplate.class);
private UriTemplateHandler uriTemplateHandler = new DefaultUriTemplateHandler();
private RestTemplate restTemplate;
......@@ -56,7 +60,7 @@ public class RetryableRestTemplate {
Object... uriVariables)
throws RestClientException {
return execute(env, path, reference, uriVariables);
return exchangeGet(env, path, reference, uriVariables);
}
public <T> T post(Env env, String path, Object request, Class<T> responseType, Object... uriVariables)
......@@ -93,10 +97,10 @@ public class RetryableRestTemplate {
ct.complete();
return result;
} catch (Throwable t) {
logger.error("Http request failed, uri: {}, method: {}", uri, method, t);
Cat.logError(t);
if (canRetry(t, method)) {
Cat.logEvent(CatEventType.API_RETRY, uri);
continue;
} else {//biz exception rethrow
ct.setStatus(t);
ct.complete();
......@@ -112,8 +116,8 @@ public class RetryableRestTemplate {
throw e;
}
private <T> ResponseEntity<T> execute(Env env, String path, ParameterizedTypeReference<T> reference,
Object... uriVariables){
private <T> ResponseEntity<T> exchangeGet(Env env, String path, ParameterizedTypeReference<T> reference,
Object... uriVariables) {
if (path.startsWith("/")) {
path = path.substring(1, path.length());
}
......@@ -133,9 +137,9 @@ public class RetryableRestTemplate {
ct.complete();
return result;
} catch (Throwable t) {
logger.error("Http request failed, uri: {}, method: {}", uri, HttpMethod.GET, t);
Cat.logError(t);
Cat.logEvent(CatEventType.API_RETRY, uri);
continue;
}
}
......@@ -147,7 +151,7 @@ public class RetryableRestTemplate {
}
private List<ServiceDTO> getAdminServices( Env env, Transaction ct){
private List<ServiceDTO> getAdminServices(Env env, Transaction ct) {
List<ServiceDTO> services = adminServiceAddressLocator.getServiceList(env);
......@@ -180,7 +184,7 @@ public class RetryableRestTemplate {
restTemplate.delete(parseHost(service) + path, uriVariables);
break;
default:
throw new UnsupportedOperationException(String.format("not supported http method(method=%s)", method));
throw new UnsupportedOperationException(String.format("unsupported http method(method=%s)", method));
}
return result;
}
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment