Cilium crash after update to 2.13.x #1267

Closed
CroutonDigital opened this issue Mar 8, 2024 · 2 comments

Description

After updating to 2.13.2, I can't add new cluster nodes; they stay in status NotReady.

The cilium container fails to start with the following errors:

level=info msg=Invoked duration="546.384µs" function="github.com/cilium/cilium/cilium-dbg/cmd.glob..func39 (cmd/build-config.go:32)" subsys=hive
level=info msg=Starting subsys=hive
level=info msg="Establishing connection to apiserver" host="https://10.43.0.1:443" subsys=k8s-client
level=info msg="Establishing connection to apiserver" host="https://10.43.0.1:443" subsys=k8s-client
level=error msg="Unable to contact k8s api-server" error="Get \"https://10.43.0.1:443/api/v1/namespaces/kube-system\": dial tcp 10.43.0.1:443: i/o timeout" ipAddr="https://10.43.0.1:443" subsys=k8s-client
level=error msg="Start hook failed" error="Get \"https://10.43.0.1:443/api/v1/namespaces/kube-system\": dial tcp 10.43.0.1:443: i/o timeout" function="client.(*compositeClientset).onStart"
level=info msg=Stopping subsys=hive
Error: Build config failed: failed to start: Get "https://10.43.0.1:443/api/v1/namespaces/kube-system": dial tcp 10.43.0.1:443: i/o timeout
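
The timeout above targets the in-cluster kubernetes Service ClusterIP (10.43.0.1:443). Below is a minimal sketch of the two settings in the kube.tf that interact here, assuming (as the fix further down suggests) that disabling kube-proxy leaves nothing to serve that ClusterIP on a fresh node before Cilium itself is up; the comments are illustrative and not part of the original file:

  # Excerpt of the configuration below, comments added for illustration only
  disable_kube_proxy = true     # no kube-proxy to program 10.43.0.1:443 on a new node

  cilium_values = <<EOT
kubeProxyReplacement: true      # Cilium itself is expected to implement Service VIPs,
                                # but it cannot reach the API server through one yet
EOT
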

Kube.tf file

module "kube-hetzner" {
  providers = {
    hcloud = hcloud
  }
  hcloud_token = var.hcloud_token != "" ? var.hcloud_token : local.hcloud_token
  source = "kube-hetzner/kube-hetzner/hcloud"
  version = "2.13.2"

  ssh_port = 2222
  ssh_public_key = file("${path.module}/ssh/k8s-hetzner.pub")
  ssh_private_key = file("${path.module}/ssh/k8s-hetzner")

  network_ipv4_cidr = "10.0.0.0/8"
  cluster_ipv4_cidr = "10.42.0.0/16"

  control_plane_nodepools = [
    {
      name        = "control-plane-fsn1",
      server_type = "cpx21",
      location    = "fsn1",
      labels      = [],
      taints      = [],
      count       = 1
    },
    {
      name        = "control-plane-nbg1",
      server_type = "cpx21",
      location    = "nbg1",
      labels      = [],
      taints      = [],
      count       = 1
    },
  ]

  agent_nodepools = [
    {
      name        = "agent-small",
      server_type = "cpx11",
      location    = "fsn1",
      labels      = [],
      taints      = [],
      count       = 0
    },
    {
      name        = "agent-large",
      server_type = "ccx23",
      location    = "fsn1",
      labels      = [
        "nodetype=core-ccx23"
      ],
      taints      = [],
      count       = 3
    },
    {
      name        = "bots-large",
      server_type = "ccx23",
      location    = "fsn1",
      labels      = [
        "nodetype=bots-node"
      ],
      taints      = [],
      count       = 4
    },
    {
      name        = "agent-xsize",
      server_type = "ccx43",
      location    = "fsn1",
      labels      = [
        "nodetype=core-ccx43"
      ],
      taints      = [],
      count       = 0
    },
    {
      name        = "storage",
      server_type = "ccx23",
      location    = "fsn1",
      labels      = [
        "node.kubernetes.io/server-usage=storage"
      ],
      taints      = [],
      count       = 1
    },
    {
      name        = "egress",
      server_type = "cpx11",
      location    = "fsn1",
      labels = [
        "node.kubernetes.io/role=egress"
      ],
      taints = [
        "node.kubernetes.io/role=egress:NoSchedule"
      ],
      floating_ip = true
      count = 0
    },
    {
      name        = "agent-arm-small",
      server_type = "cax11",
      location    = "fsn1",
      labels      = [],
      taints      = [],
      count       = 0
    }
  ]


  load_balancer_type     = "lb11"
  load_balancer_location = "fsn1"

   autoscaler_nodepools = [
     {
       name        = "autoscaled-small"
       server_type = "ccx23"
       location    = "fsn1"
       min_nodes   = 0
       max_nodes   = 0
     },
     {
       name        = "autoscaled-large"
       server_type = "ccx23"
       location    = "fsn1"
       min_nodes   = 0
       max_nodes   = 6
       labels      = {
         nodetype: "bots-node"
         "node.kubernetes.io/role": "peak-workloads"
       }
       taints      = [{
          key: "node.kubernetes.io/role"
          value: "peak-workloads"
          effect: "NoExecute"
       }]
     }
   ]

   ingress_controller = "traefik"
   traefik_additional_options = ["--log.level=DEBUG"]
  initial_k3s_channel = "stable"
  cluster_name = "h-k3s-test"
  k3s_registries = <<-EOT
    mirrors:
      eu.gcr.io:
        endpoint:
          - "https://eu.gcr.io"
    configs:
      eu.gcr.io:
        auth:
          username: _json_key
          password: '{
  "type": "service_account",
  "project_id": "asset-management-ci-cd",
  "private_key_id": "********",
  "private_key": "*********",
  "client_email": "[email protected]",
  "client_id": "********",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/image-puller%40asset-management-ci-cd.iam.gserviceaccount.com",
  "universe_domain": "googleapis.com"
}'
  EOT
  restrict_outbound_traffic = true
   extra_firewall_rules = [
     {
       description = "Allow out tcp"
       direction       = "out"
       protocol        = "tcp"
       port            = "any"
       destination_ips = ["0.0.0.0/0", "::/0"]
     },
     {
       description = "Allow out udp"
       direction       = "out"
       protocol        = "udp"
       port            = "any"
       destination_ips = ["0.0.0.0/0", "::/0"]
     }
   ]

   cni_plugin = "cilium"
   cilium_routing_mode = "native"
   disable_kube_proxy = true

   placement_group_disable = true
   enable_cert_manager = false
   dns_servers = [
     "1.1.1.1",
     "8.8.8.8",
     "9.9.9.9",
   ]

  cilium_values = <<EOT
ipam:
  mode: kubernetes
k8s:
  requireIPv4PodCIDR: true
kubeProxyReplacement: true
routingMode: native
ipv4NativeRoutingCIDR: "10.0.0.0/8"
endpointRoutes:
  enabled: true
loadBalancer:
  acceleration: native
bpf:
  masquerade: true
socketLB:
  hostNamespaceOnly: true
egressGateway:
  enabled: true
prometheus:
  enabled: true
operator:
  prometheus:
    enabled: true
envoy:
  enabled: true
hubble:
  prometheus:
    enabled: true
MTU: 1450
  EOT



  traefik_values = <<EOT
deployment:
  replicas: 1
globalArguments: []
service:
  enabled: true
  type: LoadBalancer
  annotations:
    "load-balancer.hetzner.cloud/name": "h-k3s-test"
    "load-balancer.hetzner.cloud/use-private-ip": "true"
    "load-balancer.hetzner.cloud/disable-private-ingress": "true"
    "load-balancer.hetzner.cloud/location": "nbg1"
    "load-balancer.hetzner.cloud/type": "lb11"
    "load-balancer.hetzner.cloud/uses-proxyprotocol": "true"
logs:
  general:
    level: DEBUG
ports:
  web:
    redirectTo: websecure
    proxyProtocol:
      trustedIPs:
        - 127.0.0.1/32
        - 10.0.0.0/8
    forwardedHeaders:
      trustedIPs:
        - 127.0.0.1/32
        - 10.0.0.0/8
  websecure:
    proxyProtocol:
      trustedIPs:
        - 127.0.0.1/32
        - 10.0.0.0/8
    forwardedHeaders:
      trustedIPs:
        - 127.0.0.1/32
        - 10.0.0.0/8
tlsOptions: {}
tlsStore: {}
tls:
  secretName: *********
certResolvers:
  letsencrypt:
    email: maxim.*******@*******
    tlsChallenge: true
    httpChallenge:
      entryPoint: "web"
    storage: /data/acme.json
  EOT

}

provider "hcloud" {
  token = var.hcloud_token != "" ? var.hcloud_token : local.hcloud_token
}
terraform {
  required_version = ">= 1.5.0"
  required_providers {
    hcloud = {
      source  = "hetznercloud/hcloud"
      version = ">= 1.43.0"
    }
  }
}

Screenshots

[Two screenshots attached: "Screenshot 2024-03-08 at 15 37 27" and "Screenshot 2024-03-08 at 15 37 19"]

Platform

Linux

@CroutonDigital added the "bug" (Something isn't working) label on Mar 8, 2024
CroutonDigital (Author) commented Mar 8, 2024

I commented out the line:

  # disable_kube_proxy = true

and recreated the node; the issue is fixed.
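
If keeping kube-proxy disabled is preferred, a possible alternative (a sketch only, not verified in this issue) is to point the Cilium agent straight at the API server through the Helm values k8sServiceHost / k8sServicePort, so its startup no longer depends on the Service ClusterIP. The endpoint below is a hypothetical placeholder:

  disable_kube_proxy = true

  cilium_values = <<EOT
kubeProxyReplacement: true
# Hypothetical placeholder endpoint: substitute the cluster's real control-plane
# address (for k3s, typically a control-plane node's private IP on port 6443).
k8sServiceHost: "10.255.0.101"
k8sServicePort: 6443
# ...keep the rest of the existing cilium_values unchanged...
EOT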

@mysticaltech removed the "bug" (Something isn't working) label on Mar 10, 2024
@mysticaltech changed the title from "[Bug]: after update to 2.13.2 can't add new cluster node or restart node status NotReady" to "Cilium crash after update to 2.13.x" on Mar 10, 2024
mysticaltech (Collaborator) commented:
Good find, important for backward compatibility during upgrades from previous versions.

@mysticaltech pinned this issue on Mar 10, 2024
2 participants