0

I have a deployment of Portainer 2.14.2 on Docker Engine 20.10.7 that has been working for quite a few months. Today I had some problems because the Portainer container (the one in charge of the UI, not the agent) kept restarting. During one of those restarts, for an unknown reason, the database became corrupted. Logs:

 time="2022-10-19T10:59:15Z" level=info msg="Encryption key file `portainer` not present"
 time="2022-10-19T10:59:15Z" level=info msg="Proceeding without encryption key"
 time="2022-10-19T10:59:15Z" level=info msg="Loading PortainerDB: portainer.db"
 panic: page 8 already freed

 goroutine 35 [running]:
 go.etcd.io/bbolt.(*freelist).free(0xc000728600, 0xb175, 0x7f104c311000)
       /tmp/go/pkg/mod/go.etcd.io/bbolt@v1.3.6/freelist.go:175 +0x2c8
 go.etcd.io/bbolt.(*node).spill(0xc000152070)
       /tmp/go/pkg/mod/go.etcd.io/bbolt@v1.3.6/node.go:359 +0x216
 go.etcd.io/bbolt.(*node).spill(0xc000152000)
       /tmp/go/pkg/mod/go.etcd.io/bbolt@v1.3.6/node.go:346 +0xaa
 go.etcd.io/bbolt.(*Bucket).spill(0xc00013e018)
       /tmp/go/pkg/mod/go.etcd.io/bbolt@v1.3.6/bucket.go:570 +0x33f
 go.etcd.io/bbolt.(*Tx).Commit(0xc00013e000)
       /tmp/go/pkg/mod/go.etcd.io/bbolt@v1.3.6/tx.go:160 +0xe7
 go.etcd.io/bbolt.(*DB).Update(0xc0001f1000?, 0xc000134ef8)
       /tmp/go/pkg/mod/go.etcd.io/bbolt@v1.3.6/db.go:748 +0xe5
 go.etcd.io/bbolt.(*batch).run(0xc00031c000)
       /tmp/go/pkg/mod/go.etcd.io/bbolt@v1.3.6/db.go:856 +0x126
 sync.(*Once).doSlow(0x0?, 0x1?)
       /opt/hostedtoolcache/go/1.18.3/x64/src/sync/once.go:68 +0xc2
 sync.(*Once).Do(...)
       /opt/hostedtoolcache/go/1.18.3/x64/src/sync/once.go:59
 go.etcd.io/bbolt.(*batch).trigger(0xc000321a00?)
       /tmp/go/pkg/mod/go.etcd.io/bbolt@v1.3.6/db.go:838 +0x45
 created by time.goFunc
       /opt/hostedtoolcache/go/1.18.3/x64/src/time/sleep.go:176 +0x32

My hypothesis is that during one of those restarts the container was stopped in the middle of a write operation (although I am not 100% sure). This is the first time this has happened to me, so I don't know how to recover from this state without deploying a new Portainer stack or erasing the whole database, which would be a really drastic solution.

If it helps, this is the docker-compose file:

# Stack definition for Portainer CE 2.14.2: an agent service deployed in
# global mode plus a single replica of the Portainer server/UI, both attached
# to the externally managed `net` network.
version: "3.8"

networks:
    # Not created by this stack; it must already exist.
    net:
        external: true

services:
    agent:
        image: portainer/agent:2.14.2-alpine
        environment:
            AGENT_CLUSTER_ADDR: tasks.agent
            # Quoted so the value is passed as a string, not a YAML integer.
            AGENT_PORT: "9001"
        volumes:
            - /var/run/docker.sock:/var/run/docker.sock:ro
            - /var/lib/docker/volumes:/var/lib/docker/volumes
        networks:
            - net
        deploy:
            mode: global
            restart_policy:
                condition: on-failure

    portainer:
        image: portainer/portainer-ce:2.14.2-alpine
        # Connects to the agents through the `tasks.agent` name on port 9001,
        # the same port configured via AGENT_PORT on the agent service.
        command: -H tcp://tasks.agent:9001 --tlsskipverify --admin-password-file=/run/secrets/portainer_secret
        ports:
            - "9000:9000"
            - "8000:8000"
        volumes:
            # Host bind mount for /data; the placement constraint below keeps
            # the task on nodes labelled `stateful`.
            - "/var/volumes/portainer/data:/data"
        networks:
            - net
        secrets:
            - portainer_secret
            - source: ca_cert_secret
              target: /etc/ssl/certs/localCA.pem
        deploy:
            mode: replicated
            replicas: 1
            restart_policy:
                condition: on-failure
            placement:
                constraints:
                    - node.labels.stateful == true
            labels:
                - "traefik.enable=true"
                - "traefik.passHostHeader=true"
                - "traefik.http.routers.portainer.rule=Host(`portainer`)"
                - "traefik.http.services.portainer.loadbalancer.server.port=9000"
                - "traefik.http.routers.portainer.service=portainer"
                - "traefik.http.routers.portainer.tls=true"
                # A label key can hold only one value, so entry points must be
                # declared in a single label (comma-separated for several,
                # e.g. `web,web-secure`). Only the TLS entry point is listed,
                # matching `tls=true` above.
                # NOTE(review): confirm whether `web` should also be exposed.
                - "traefik.http.routers.portainer.entrypoints=web-secure"

secrets:
    # Both secrets are managed outside this stack.
    portainer_secret:
        external: true
    ca_cert_secret:
        external: true
Pablo Ochoa
  • 77
  • 1
  • 12
  • 1
    You have been taking regular backups, right? Otherwise, the error indicates the DB is a BoltDB, so extracting the DB and trying an external recovery tool (if such things even exist) is your only non-burn-everything-down way forward. – Chris Becke Oct 21 '22 at 07:09
  • No, I haven't taken regular backups, but I guess it is a recommended practice, as it would be the only way to restore a corrupted DB. – Pablo Ochoa Oct 24 '22 at 12:14

0 Answers0