Commit b3ec3531 by lepdou

refactor env health check logs

parent 56e2d45e
...@@ -28,9 +28,9 @@ import javax.annotation.PostConstruct; ...@@ -28,9 +28,9 @@ import javax.annotation.PostConstruct;
@Component @Component
public class PortalSettings { public class PortalSettings {
private Logger logger = LoggerFactory.getLogger(PortalSettings.class); private static final Logger logger = LoggerFactory.getLogger(PortalSettings.class);
private static final int HEALTH_CHECK_INTERVAL = 10 * 1000; private static final int HEALTH_CHECK_INTERVAL = 10 * 1000;
private static final String DEFAULT_SUPPORT_ENV_LIST = "FAT,UAT,PRO";
@Autowired @Autowired
...@@ -44,14 +44,10 @@ public class PortalSettings { ...@@ -44,14 +44,10 @@ public class PortalSettings {
//mark env up or down //mark env up or down
private Map<Env, Boolean> envStatusMark = new ConcurrentHashMap<>(); private Map<Env, Boolean> envStatusMark = new ConcurrentHashMap<>();
private ScheduledExecutorService healthCheckService;
@PostConstruct @PostConstruct
private void postConstruct() { private void postConstruct() {
//初始化portal支持操作的环境集合,线上的portal可能支持所有的环境操作,而线下环境则支持一部分. String serverConfig = serverConfigService.getValue("apollo.portal.envs", DEFAULT_SUPPORT_ENV_LIST);
// 每个环境的portal支持哪些环境配置在数据库里
String serverConfig = serverConfigService.getValue("apollo.portal.envs", "FAT,UAT,PRO");
String[] configedEnvs = serverConfig.split(","); String[] configedEnvs = serverConfig.split(",");
List<String> allStrEnvs = Arrays.asList(configedEnvs); List<String> allStrEnvs = Arrays.asList(configedEnvs);
for (String e : allStrEnvs) { for (String e : allStrEnvs) {
...@@ -62,7 +58,7 @@ public class PortalSettings { ...@@ -62,7 +58,7 @@ public class PortalSettings {
envStatusMark.put(env, true); envStatusMark.put(env, true);
} }
healthCheckService = Executors.newScheduledThreadPool(1); ScheduledExecutorService healthCheckService = Executors.newScheduledThreadPool(1);
healthCheckService healthCheckService
.scheduleWithFixedDelay(new HealthCheckTask(applicationContext), 1000, HEALTH_CHECK_INTERVAL, .scheduleWithFixedDelay(new HealthCheckTask(applicationContext), 1000, HEALTH_CHECK_INTERVAL,
...@@ -86,16 +82,16 @@ public class PortalSettings { ...@@ -86,16 +82,16 @@ public class PortalSettings {
class HealthCheckTask implements Runnable { class HealthCheckTask implements Runnable {
private static final int ENV_DIED_THREADHOLD = 2; private static final int ENV_DOWN_THRESHOLD = 2;
private Map<Env, Long> healthCheckFailCnt = new HashMap<>(); private Map<Env, Integer> healthCheckFailedCounter = new HashMap<>();
private AdminServiceAPI.HealthAPI healthAPI; private AdminServiceAPI.HealthAPI healthAPI;
public HealthCheckTask(ApplicationContext context) { public HealthCheckTask(ApplicationContext context) {
healthAPI = context.getBean(AdminServiceAPI.HealthAPI.class); healthAPI = context.getBean(AdminServiceAPI.HealthAPI.class);
for (Env env : allEnvs) { for (Env env : allEnvs) {
healthCheckFailCnt.put(env, 0l); healthCheckFailedCounter.put(env, 0);
} }
} }
...@@ -107,17 +103,17 @@ public class PortalSettings { ...@@ -107,17 +103,17 @@ public class PortalSettings {
//revive //revive
if (!envStatusMark.get(env)) { if (!envStatusMark.get(env)) {
envStatusMark.put(env, true); envStatusMark.put(env, true);
healthCheckFailCnt.put(env, 0l); healthCheckFailedCounter.put(env, 0);
logger.info("env up again [env:{}]", env); logger.info("Env revived because env health check success. env: {}", env);
} }
} else { } else {
//maybe meta server up but admin server down logger.warn("Env health check failed, maybe because of admin server down. env: {}", env);
handleEnvDown(env); handleEnvDown(env);
} }
} catch (Exception e) { } catch (Exception e) {
//maybe meta server down logger.warn("Env health check failed, maybe because of meta server down "
logger.warn("health check fail. [env:{}]", env, e.getMessage()); + "or config error meta server address. env: {}", env);
handleEnvDown(env); handleEnvDown(env);
} }
} }
...@@ -130,17 +126,19 @@ public class PortalSettings { ...@@ -130,17 +126,19 @@ public class PortalSettings {
} }
private void handleEnvDown(Env env) { private void handleEnvDown(Env env) {
long failCnt = healthCheckFailCnt.get(env); int failedTimes = healthCheckFailedCounter.get(env);
healthCheckFailCnt.put(env, ++failCnt); healthCheckFailedCounter.put(env, ++failedTimes);
if (!envStatusMark.get(env)) { if (!envStatusMark.get(env)) {
logger.warn("[env:{}] down yet.", env); logger.error("Env is down. env: {}, failed times: {}", env, failedTimes);
} else { } else {
if (failCnt >= ENV_DIED_THREADHOLD) { if (failedTimes >= ENV_DOWN_THRESHOLD) {
envStatusMark.put(env, false); envStatusMark.put(env, false);
logger.error("env turn to down [env:{}]", env); logger.error("Env is down because health check failed for {} times, "
+ "which equals to down threshold. env: {}", ENV_DOWN_THRESHOLD, env);
} else { } else {
logger.warn("env health check fail first time. [env:{}]", env); logger.warn("Env health check failed for {} times which less than down threshold. down threshold:{}, env: {}",
failedTimes, ENV_DOWN_THRESHOLD, env);
} }
} }
......
...@@ -8,6 +8,8 @@ import com.ctrip.framework.apollo.core.enums.Env; ...@@ -8,6 +8,8 @@ import com.ctrip.framework.apollo.core.enums.Env;
import com.ctrip.framework.apollo.portal.PortalSettings; import com.ctrip.framework.apollo.portal.PortalSettings;
import com.dianping.cat.Cat; import com.dianping.cat.Cat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.web.HttpMessageConverters; import org.springframework.boot.autoconfigure.web.HttpMessageConverters;
import org.springframework.http.client.HttpComponentsClientHttpRequestFactory; import org.springframework.http.client.HttpComponentsClientHttpRequestFactory;
...@@ -35,6 +37,7 @@ public class AdminServiceAddressLocator { ...@@ -35,6 +37,7 @@ public class AdminServiceAddressLocator {
private static final long OFFLINE_REFRESH_INTERVAL = 10 * 1000; private static final long OFFLINE_REFRESH_INTERVAL = 10 * 1000;
private static final int RETRY_TIMES = 3; private static final int RETRY_TIMES = 3;
private static final String ADMIN_SERVICE_URL_PATH = "/services/admin"; private static final String ADMIN_SERVICE_URL_PATH = "/services/admin";
private static final Logger logger = LoggerFactory.getLogger(AdminServiceAddressLocator.class);
private ScheduledExecutorService refreshServiceAddressService; private ScheduledExecutorService refreshServiceAddressService;
private RestTemplate restTemplate; private RestTemplate restTemplate;
...@@ -91,10 +94,12 @@ public class AdminServiceAddressLocator { ...@@ -91,10 +94,12 @@ public class AdminServiceAddressLocator {
refreshSuccess = refreshSuccess && currentEnvRefreshResult; refreshSuccess = refreshSuccess && currentEnvRefreshResult;
} }
if (refreshSuccess){ if (refreshSuccess) {
refreshServiceAddressService.schedule(new RefreshAdminServerAddressTask(), NORMAL_REFRESH_INTERVAL, TimeUnit.MILLISECONDS); refreshServiceAddressService
.schedule(new RefreshAdminServerAddressTask(), NORMAL_REFRESH_INTERVAL, TimeUnit.MILLISECONDS);
} else { } else {
refreshServiceAddressService.schedule(new RefreshAdminServerAddressTask(), OFFLINE_REFRESH_INTERVAL, TimeUnit.MILLISECONDS); refreshServiceAddressService
.schedule(new RefreshAdminServerAddressTask(), OFFLINE_REFRESH_INTERVAL, TimeUnit.MILLISECONDS);
} }
} }
} }
...@@ -110,9 +115,11 @@ public class AdminServiceAddressLocator { ...@@ -110,9 +115,11 @@ public class AdminServiceAddressLocator {
} }
cache.put(env, Arrays.asList(services)); cache.put(env, Arrays.asList(services));
return true; return true;
} catch (Throwable e) {//meta server error } catch (Throwable e) {
Cat.logError("get admin server address fail", e); logger.error(String.format("Get admin server address from meta server failed. env: %s, meta server address:%s",
continue; env, MetaDomainConsts.getDomain(env)), e);
Cat.logError(String.format("Get admin server address from meta server failed. env: %s, meta server address:%s",
env, MetaDomainConsts.getDomain(env)), e);
} }
} }
return false; return false;
......
...@@ -10,6 +10,8 @@ import com.dianping.cat.message.Transaction; ...@@ -10,6 +10,8 @@ import com.dianping.cat.message.Transaction;
import org.apache.http.conn.ConnectTimeoutException; import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.conn.HttpHostConnectException; import org.apache.http.conn.HttpHostConnectException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.ParameterizedTypeReference; import org.springframework.core.ParameterizedTypeReference;
import org.springframework.http.HttpMethod; import org.springframework.http.HttpMethod;
...@@ -32,6 +34,8 @@ import javax.annotation.PostConstruct; ...@@ -32,6 +34,8 @@ import javax.annotation.PostConstruct;
@Component @Component
public class RetryableRestTemplate { public class RetryableRestTemplate {
private Logger logger = LoggerFactory.getLogger(RetryableRestTemplate.class);
private UriTemplateHandler uriTemplateHandler = new DefaultUriTemplateHandler(); private UriTemplateHandler uriTemplateHandler = new DefaultUriTemplateHandler();
private RestTemplate restTemplate; private RestTemplate restTemplate;
...@@ -56,7 +60,7 @@ public class RetryableRestTemplate { ...@@ -56,7 +60,7 @@ public class RetryableRestTemplate {
Object... uriVariables) Object... uriVariables)
throws RestClientException { throws RestClientException {
return execute(env, path, reference, uriVariables); return exchangeGet(env, path, reference, uriVariables);
} }
public <T> T post(Env env, String path, Object request, Class<T> responseType, Object... uriVariables) public <T> T post(Env env, String path, Object request, Class<T> responseType, Object... uriVariables)
...@@ -93,10 +97,10 @@ public class RetryableRestTemplate { ...@@ -93,10 +97,10 @@ public class RetryableRestTemplate {
ct.complete(); ct.complete();
return result; return result;
} catch (Throwable t) { } catch (Throwable t) {
logger.error("Http request failed, uri: {}, method: {}", uri, method, t);
Cat.logError(t); Cat.logError(t);
if (canRetry(t, method)) { if (canRetry(t, method)) {
Cat.logEvent(CatEventType.API_RETRY, uri); Cat.logEvent(CatEventType.API_RETRY, uri);
continue;
} else {//biz exception rethrow } else {//biz exception rethrow
ct.setStatus(t); ct.setStatus(t);
ct.complete(); ct.complete();
...@@ -112,8 +116,8 @@ public class RetryableRestTemplate { ...@@ -112,8 +116,8 @@ public class RetryableRestTemplate {
throw e; throw e;
} }
private <T> ResponseEntity<T> execute(Env env, String path, ParameterizedTypeReference<T> reference, private <T> ResponseEntity<T> exchangeGet(Env env, String path, ParameterizedTypeReference<T> reference,
Object... uriVariables){ Object... uriVariables) {
if (path.startsWith("/")) { if (path.startsWith("/")) {
path = path.substring(1, path.length()); path = path.substring(1, path.length());
} }
...@@ -133,9 +137,9 @@ public class RetryableRestTemplate { ...@@ -133,9 +137,9 @@ public class RetryableRestTemplate {
ct.complete(); ct.complete();
return result; return result;
} catch (Throwable t) { } catch (Throwable t) {
logger.error("Http request failed, uri: {}, method: {}", uri, HttpMethod.GET, t);
Cat.logError(t); Cat.logError(t);
Cat.logEvent(CatEventType.API_RETRY, uri); Cat.logEvent(CatEventType.API_RETRY, uri);
continue;
} }
} }
...@@ -147,7 +151,7 @@ public class RetryableRestTemplate { ...@@ -147,7 +151,7 @@ public class RetryableRestTemplate {
} }
private List<ServiceDTO> getAdminServices( Env env, Transaction ct){ private List<ServiceDTO> getAdminServices(Env env, Transaction ct) {
List<ServiceDTO> services = adminServiceAddressLocator.getServiceList(env); List<ServiceDTO> services = adminServiceAddressLocator.getServiceList(env);
...@@ -180,7 +184,7 @@ public class RetryableRestTemplate { ...@@ -180,7 +184,7 @@ public class RetryableRestTemplate {
restTemplate.delete(parseHost(service) + path, uriVariables); restTemplate.delete(parseHost(service) + path, uriVariables);
break; break;
default: default:
throw new UnsupportedOperationException(String.format("not supported http method(method=%s)", method)); throw new UnsupportedOperationException(String.format("unsupported http method(method=%s)", method));
} }
return result; return result;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment