fatal error: concurrent map iteration and map write #21288

Open
david-sanusi opened this issue Jun 10, 2024 · 0 comments
Overview of the Issue

A Consul agent running in server mode in a 3-node cluster panics and then crashes. This only happens on the node that is the current leader. When it happens, cluster leadership is transferred to another node. After some time (anywhere from a couple of hours to several days) the new leader also crashes, leadership transfers to another node, and the cycle continues.
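For context, the fatal error in the title is the Go runtime's reaction to a map being iterated while another goroutine writes to it; this check aborts the whole process and cannot be recovered from, which is why the agent crashes outright. The sketch below is not Consul code, only a minimal, self-contained illustration of the same failure class:

// raceymap.go — minimal illustration of the Go runtime fatal error
// "fatal error: concurrent map iteration and map write".
// This is NOT Consul's code; it only demonstrates the failure class.
package main

import "time"

func main() {
	m := map[int]int{}
	for i := 0; i < 1000; i++ {
		m[i] = i
	}

	// Writer goroutine mutates the map without any synchronization.
	go func() {
		for i := 0; ; i++ {
			m[i%1000] = i
		}
	}()

	// Meanwhile the main goroutine iterates the same map.
	// Sooner or later the iteration and a write overlap and the runtime
	// aborts with "concurrent map iteration and map write".
	for {
		total := 0
		for _, v := range m {
			total += v
		}
		_ = total
		time.Sleep(time.Millisecond)
	}
}

Guarding the shared map with a sync.RWMutex (or replacing it with sync.Map) makes the sketch crash-free; whichever internal Consul map is involved here would presumably need equivalent protection, but the crash log should identify the exact call site.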


Reproduction Steps

It is not known how to reproduce this or what exactly triggers it. It happens randomly in an otherwise normally functioning cluster.
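This randomness is expected for a data race: the panic only fires when the conflicting accesses happen to overlap. The Go race detector reports the conflicting accesses deterministically even when the runtime check never trips. A hypothetical test sketch (not part of Consul's test suite) that would surface the race when run with `go test -race`:

// raceymap_test.go — hypothetical sketch: `go test -race` reports the
// conflicting map accesses as a DATA RACE even if no panic occurs.
package main

import (
	"sync"
	"testing"
)

func TestConcurrentMapAccess(t *testing.T) {
	m := map[string]int{"a": 1}
	var wg sync.WaitGroup
	wg.Add(2)

	go func() { // unsynchronized writer
		defer wg.Done()
		for i := 0; i < 100; i++ {
			m["a"] = i
		}
	}()

	go func() { // unsynchronized reader/iterator
		defer wg.Done()
		for i := 0; i < 100; i++ {
			for range m {
			}
		}
	}()

	wg.Wait()
}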

Consul info for both Client and Server

Client info
agent:
	check_monitors = 0
	check_ttls = 0
	checks = 3
	services = 2
build:
	prerelease =
	revision = 98cb473+
	version = 1.18.1
	version_metadata =
consul:
	acl = disabled
	known_servers = 5
	server = false
runtime:
	arch = amd64
	cpu_count = 2
	goroutines = 103
	max_procs = 2
	os = linux
	version = go1.22.3
serf_lan:
	coordinate_resets = 0
	encrypted = true
	event_queue = 0
	event_time = 806
	failed = 0
	health_score = 0
	intent_queue = 0
	left = 5
	member_time = 2369641
	members = 146
	query_queue = 0
	query_time = 1646
{
  "datacenter": "dc1",
  "client_addr": "127.0.0.1",
  "bind_addr": "{{ GetPrivateIP }}",
  "ports": {
    "grpc": 8502
  },
  "data_dir": "/consul/data",
  "pid_file": "/var/run/consul/consul.pid",
  "retry_join": ["provider=aws service=ecs tag_key=role tag_value=consul"],
  "limits": {
    "http_max_conns_per_client": 500
  },
  "encrypt": "",
  "log_level": "INFO",
  "leave_on_terminate": true,
  "advertise_reconnect_timeout": "1h",
  "cache": {
    "entry_fetch_rate": 1
  },
  "disable_update_check": true,
  "telemetry": {
    "dogstatsd_addr": "127.0.0.1:8125",
    "dogstatsd_tags": [
      "service:servicename-service"
    ]
  }
}

{
  "service": {
    "name": "servicename-service-proxy",
    "id": "servicename-service-proxy",
    "kind": "connect-proxy",
    "proxy": {
      "upstreams":[{"destination_name": "egress-proxy","local_bind_address": "127.0.0.1","local_bind_port": 13128,"config":{"connect_timeout_ms":5000}}],
      "destination_service_name": "servicename-service",
      "destination_service_id": "servicename-service",
      "local_service_port": 7843,
      "config": {
        "local_request_timeout_ms": 300000,
        "local_connect_timeout_ms": 5000,
        "local_idle_timeout_ms": 60000,
        "envoy_tracing_json": "{\"http\":{\"name\":\"envoy.tracers.datadog\",\"typedConfig\":{\"@type\":\"type.googleapis.com/envoy.config.trace.v3.DatadogConfig\",\"collector_cluster\":\"datadog_agent\",\"service_name\":\"servicename-service\"}}}"
      }
    },
    "port": 41641
  }
}

{
  "check": {
    "id": "servicename-service-proxy-healthcheck",
    "name": "servicename-service-proxy-healthcheck",
    "service_id": "servicename-service-proxy",
    "TCP": "127.0.0.1:41641",
    "interval": "10s",
    "timeout": "5s",
    "success_before_passing": 1,
    "failures_before_critical": 0
  }
}

{
  "service": {
    "name": "servicename-service",
    "id": "servicename-service",
    "port": 41141,
    "tags": [
      "fe_haproxy_http_app","haproxy_http_app","haproxy_httpchk=GET /health/"
    ]
  }
}

{
  "check": {
    "id": "servicename-service-alias-healthcheck",
    "name": "servicename-service-alias-healthcheck",
    "service_id": "servicename-service-proxy",
    "alias_service": "servicename-service"
  }
}

{
  "check": {
    "id": "servicename-service-healthcheck",
    "name": "servicename-service-healthcheck",
    "service_id": "servicename-service",
    "http": "https://servicename-service.service.consul:41141/health/",
    "interval": "5s",
    "timeout": "5s",
    "success_before_passing": 1,
    "failures_before_critical": 0,
    "tls_skip_verify": true
  }
}
Server info
agent:
	check_monitors = 0
	check_ttls = 0
	checks = 0
	services = 0
build:
	prerelease =
	revision = 98cb473+
	version = 1.18.1
	version_metadata =
consul:
	acl = disabled
	bootstrap = false
	known_datacenters = 1
	leader = false
	leader_addr = 10.140.32.151:8300
	server = true
raft:
	applied_index = 54165746
	commit_index = 54165746
	fsm_pending = 0
	last_contact = 46.544497ms
	last_log_index = 54165747
	last_log_term = 19019
	last_snapshot_index = 54163954
	last_snapshot_term = 19019
	latest_configuration = [removed]
	latest_configuration_index = 0
	num_peers = 2
	protocol_version = 3
	protocol_version_max = 3
	protocol_version_min = 0
	snapshot_version_max = 1
	snapshot_version_min = 0
	state = Follower
	term = 19019
runtime:
	arch = amd64
	cpu_count = 4
	goroutines = 1823
	max_procs = 4
	os = linux
	version = go1.22.3
serf_lan:
	coordinate_resets = 0
	encrypted = true
	event_queue = 0
	event_time = 806
	failed = 0
	health_score = 0
	intent_queue = 0
	left = 7
	member_time = 2370742
	members = 149
	query_queue = 0
	query_time = 1646
serf_wan:
	coordinate_resets = 0
	encrypted = true
	event_queue = 0
	event_time = 1
	failed = 0
	health_score = 0
	intent_queue = 0
	left = 0
	member_time = 40498
	members = 6
	query_queue = 0
	query_time = 1
{
  "server": true,
  "datacenter": "dc1",
  "node_name": "consul-server-0",
  "bootstrap_expect": 3,
  "ui_config": {
    "enabled" : true
  },
  "data_dir": "/consul/data",
  "pid_file": "/var/run/consul/consul.pid",
  "retry_join": ["provider=aws service=ec2 tag_key=role tag_value=consul",],
  "addresses": {
    "http": "0.0.0.0",
    "https": "0.0.0.0"
  },
  "bind_addr": "{{ GetPrivateIP }}",
  "client_addr": "0.0.0.0",
  "ports": {
    "http": 8500,
    "https": 8501
  },
  "tls": {
    "defaults": {
      "key_file": "/consul/certs/consul-server-0.key",
      "cert_file": "/consul/certs/consul-server-0.pem",
      "ca_file": "/consul/certs/ca.pem"
    }
  },
  "connect": {
    "enabled": true
  },
  "encrypt": "",
  "config_entries": {
    "bootstrap": [
      {
        "Kind": "proxy-defaults",
        "Name": "global",
        "Config": {
          "protocol": "http"
        }
      }
    ]
  },
  "domain": "consul",
  "leave_on_terminate": true,
  "disable_update_check": true,
  "log_level": "debug",
  "telemetry": {
    "enable_host_metrics": true,
    "dogstatsd_addr": "127.0.0.1:8125",
    "dogstatsd_tags": [
      "service:consul"
    ]
  }
}

Operating system and Environment details

The environment has Consul servers running on EC2 instances and ECS tasks running on ECS Fargate. There are about 140 Consul clients running.

Log Fragments

Crash log is attached: crash.log.txt
