Forums

Home / Forums

You need to log in to create posts and topics. Login · Register

Multiple HDD OSD down after node/service restart

Pages: 1 2

Hi, I'm attempting to deploy a 3 node cluster on baremetal, each of which include 12 SSD and 12 HDD.  I set the first two SSD up as journal for each node.  My issue is that when I restart a node or its services, almost all of the HDD OSD do not come back online and I cannot restart them without error.  The only method that has worked is to use the dashboard UI and delete, then re-add them as OSD, but this is very tedious and if I restart the node or ceph services they just go down again.  Any help you can provide is greatly appreciated!

root@vlab-ext-jfesx77-pvsa:~# cat /opt/petasan/config/cluster_info.json
{
"backend_1_base_ip": "10.0.1.0",
"backend_1_eth_name": "eth2",
"backend_1_mask": "255.255.255.0",
"backend_1_vlan_id": "",
"backend_2_base_ip": "",
"backend_2_eth_name": "",
"backend_2_mask": "",
"backend_2_vlan_id": "",
"bonds": [],
"cifs_eth_name": "eth1",
"default_pool": "both",
"default_pool_pgs": "4096",
"default_pool_replicas": "3",
"eth_count": 3,
"iscsi_1_eth_name": "eth1",
"iscsi_2_eth_name": "eth1",
"jf_mtu_size": "9000",
"jumbo_frames": [
"eth1",
"eth2"
],
"management_eth_name": "eth0",
"management_nodes": [
{
"backend_1_ip": "10.0.1.167",
"backend_2_ip": "",
"is_backup": false,
"is_cifs": false,
"is_iscsi": true,
"is_management": true,
"is_storage": true,
"management_ip": "10.212.86.167",
"name": "vlab-ext-jfesx77-pvsa"
},
{
"backend_1_ip": "10.0.1.168",
"backend_2_ip": "",
"is_backup": false,
"is_cifs": false,
"is_iscsi": true,
"is_management": true,
"is_storage": true,
"management_ip": "10.212.86.168",
"name": "vlab-ext-jfesx78-pvsa"
},
{
"backend_1_ip": "10.0.1.169",
"backend_2_ip": "",
"is_backup": false,
"is_cifs": false,
"is_iscsi": true,
"is_management": true,
"is_storage": true,
"management_ip": "10.212.86.169",
"name": "vlab-ext-jfesx79-pvsa"
}
],
"name": "JF-EXT-BDW",
"storage_engine": "bluestore"
}
 

root@vlab-ext-jfesx77-pvsa:~# tail -f /opt/petasan/log/PetaSAN.log
27/04/2020 08:53:30 ERROR    Error activate osd fallback.
27/04/2020 08:53:30 ERROR    Error force activate osd  sdr
27/04/2020 08:54:00 INFO     starting activate osd.
27/04/2020 08:54:07 INFO     Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 39 c9842b59-e635-434c-8936-e022fd89c6c8
27/04/2020 08:54:07 ERROR    Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 39 c9842b59-e635-434c-8936-e022fd89c6c8
27/04/2020 08:54:07 ERROR    Error activating osd.
27/04/2020 08:54:07 INFO     Try activate all osds fallback ...
27/04/2020 08:54:13 ERROR    Error activate osd fallback.
27/04/2020 08:54:13 ERROR    Error force activate osd  sds
27/04/2020 08:54:43 INFO     starting activate osd.
27/04/2020 08:54:50 INFO     Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 36 87bacd05-2161-4eaf-8d5b-8e04f644b95c
27/04/2020 08:54:50 ERROR    Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 36 87bacd05-2161-4eaf-8d5b-8e04f644b95c
27/04/2020 08:54:50 ERROR    Error activating osd.
27/04/2020 08:54:50 INFO     Try activate all osds fallback ...
27/04/2020 08:54:56 ERROR    Error activate osd fallback.
27/04/2020 08:54:56 ERROR    Error force activate osd  sdt

root@vlab-ext-jfesx77-pvsa:~# ceph osd tree
ID  CLASS WEIGHT    TYPE NAME                          STATUS REWEIGHT PRI-AFF
-1       161.05142 root default
-13       161.05142     rack rack
-7        53.68381         host vlab-ext-jfesx77-pvsa
26   hdd   3.69699             osd.26                     up  1.00000 1.00000
27   hdd   3.69699             osd.27                   down        0 1.00000
34   hdd   3.69699             osd.34                   down        0 1.00000
35   hdd   3.69699             osd.35                   down        0 1.00000
36   hdd   3.69699             osd.36                   down        0 1.00000
37   hdd   3.69699             osd.37                   down        0 1.00000
38   hdd   3.69699             osd.38                     up  1.00000 1.00000
39   hdd   3.69699             osd.39                   down        0 1.00000
40   hdd   3.69699             osd.40                     up  1.00000 1.00000
41   hdd   3.69699             osd.41                   down        0 1.00000
42   hdd   3.69699             osd.42                   down  1.00000 1.00000
43   hdd   3.69699             osd.43                   down        0 1.00000
22   ssd   0.93199             osd.22                     up  1.00000 1.00000
23   ssd   0.93199             osd.23                     up  1.00000 1.00000
24   ssd   0.93199             osd.24                     up  1.00000 1.00000
25   ssd   0.93199             osd.25                     up  1.00000 1.00000
28   ssd   0.93199             osd.28                     up  1.00000 1.00000
29   ssd   0.93199             osd.29                     up  1.00000 1.00000
30   ssd   0.93199             osd.30                     up  1.00000 1.00000
31   ssd   0.93199             osd.31                     up  1.00000 1.00000
32   ssd   0.93199             osd.32                     up  1.00000 1.00000
33   ssd   0.93199             osd.33                     up  1.00000 1.00000
-10        53.68381         host vlab-ext-jfesx78-pvsa
48   hdd   3.69699             osd.48                   down        0 1.00000
49   hdd   3.69699             osd.49                   down        0 1.00000
56   hdd   3.69699             osd.56                   down        0 1.00000
57   hdd   3.69699             osd.57                   down        0 1.00000
58   hdd   3.69699             osd.58                   down        0 1.00000
59   hdd   3.69699             osd.59                   down        0 1.00000
60   hdd   3.69699             osd.60                   down        0 1.00000
61   hdd   3.69699             osd.61                   down        0 1.00000
62   hdd   3.69699             osd.62                   down        0 1.00000
63   hdd   3.69699             osd.63                     up  1.00000 1.00000
64   hdd   3.69699             osd.64                   down        0 1.00000
65   hdd   3.69699             osd.65                     up  1.00000 1.00000
44   ssd   0.93199             osd.44                     up  1.00000 1.00000
45   ssd   0.93199             osd.45                     up  1.00000 1.00000
46   ssd   0.93199             osd.46                     up  1.00000 1.00000
47   ssd   0.93199             osd.47                     up  1.00000 1.00000
50   ssd   0.93199             osd.50                     up  1.00000 1.00000
51   ssd   0.93199             osd.51                     up  1.00000 1.00000
52   ssd   0.93199             osd.52                     up  1.00000 1.00000
53   ssd   0.93199             osd.53                     up  1.00000 1.00000
54   ssd   0.93199             osd.54                     up  1.00000 1.00000
55   ssd   0.93199             osd.55                     up  1.00000 1.00000
-3        53.68381         host vlab-ext-jfesx79-pvsa
4   hdd   3.69699             osd.4                    down        0 1.00000
5   hdd   3.69699             osd.5                    down        0 1.00000
12   hdd   3.69699             osd.12                   down        0 1.00000
13   hdd   3.69699             osd.13                   down        0 1.00000
14   hdd   3.69699             osd.14                   down        0 1.00000
15   hdd   3.69699             osd.15                   down        0 1.00000
16   hdd   3.69699             osd.16                     up  1.00000 1.00000
17   hdd   3.69699             osd.17                   down        0 1.00000
18   hdd   3.69699             osd.18                     up  1.00000 1.00000
19   hdd   3.69699             osd.19                   down        0 1.00000
20   hdd   3.69699             osd.20                   down        0 1.00000
21   hdd   3.69699             osd.21                   down        0 1.00000
0   ssd   0.93199             osd.0                      up  1.00000 1.00000
1   ssd   0.93199             osd.1                      up  1.00000 1.00000
2   ssd   0.93199             osd.2                      up  1.00000 1.00000
3   ssd   0.93199             osd.3                      up  1.00000 1.00000
6   ssd   0.93199             osd.6                      up  1.00000 1.00000
7   ssd   0.93199             osd.7                      up  1.00000 1.00000
8   ssd   0.93199             osd.8                      up  1.00000 1.00000
9   ssd   0.93199             osd.9                      up  1.00000 1.00000
10   ssd   0.93199             osd.10                     up  1.00000 1.00000
11   ssd   0.93199             osd.11                     up  1.00000 1.00000

root@vlab-ext-jfesx77-pvsa:~# systemctl restart ceph-osd@27
root@vlab-ext-jfesx77-pvsa:~# systemctl status ceph-osd@27
ceph-osd@27.service - Ceph object storage daemon osd.27
Loaded: loaded (/lib/systemd/system/ceph-osd@.service; indirect; vendor preset: enabled)
Active: failed (Result: exit-code) since Mon 2020-04-27 08:59:31 PDT; 4s ago
Process: 27804 ExecStart=/usr/bin/ceph-osd -f --cluster ${CLUSTER} --id 27 --setuser ceph --setgroup ceph (code=exited, status=1/FAILURE)
Process: 27800 ExecStartPre=/usr/lib/ceph/ceph-osd-prestart.sh --cluster ${CLUSTER} --id 27 (code=exited, status=0/SUCCESS)
Main PID: 27804 (code=exited, status=1/FAILURE)

Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Main process exited, code=exited, status=1/FAILURE
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Failed with result 'exit-code'.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Service hold-off time over, scheduling restart.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Scheduled restart job, restart counter is at 5.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: Stopped Ceph object storage daemon osd.27.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Start request repeated too quickly.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Failed with result 'exit-code'.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: Failed to start Ceph object storage daemon osd.27.

 

Thanks!

do you have enough ram ?

if you deploy the cluster with say 1 OSD per node, does it work ?

I have 64GB memory per node.  I can try redeploying but I suspect that yes if I choose one OSD per node it would work.

24/22 OSDs per node requires more than 64 GB RAM. Each OSD requires 4 GB, and the other services need RAM as well (check our guide). Also, if you use SSD cache devices, they need RAM too (2% of the disk partition size).

I would test with say 8 OSDs in total per host, if you have no issue, then it is probably pure RAM.

Just for clarity, I realize I misspoke when I said this is bare metal.  It's 3 nodes which have VMware ESXi installed, but all of the disks are raw disk mapped.  What I meant was I'm not using VM-backed storage, but instead passing through the HDD/SSD devices.  When I look at memory metrics for these nodes I don't see them using more than 5GB.  I increased one node (77) to 96GB and see no change.  Thoughts?

root@vlab-ext-jfesx77-pvsa:~# free -h
total        used        free      shared  buff/cache   available
Mem:            94G        1.6G         92G        9.2M        742M         92G
Swap:            0B          0B          0B

root@vlab-ext-jfesx78-pvsa:/proc/sys/kernel# free -h
total        used        free      shared  buff/cache   available
Mem:            62G        4.4G         54G        7.2M        4.1G         58G
Swap:            0B          0B          0B

root@vlab-ext-jfesx79-pvsa:~# free -h
total        used        free      shared  buff/cache   available
Mem:            62G        2.8G         56G        7.8M        4.0G         59G
Swap:            0B          0B          0B

Are you SSD OSDs all fine and it is a problem just with HDDs ? do all HDDs have an issue or some ? is the issue only with  OSDs with journal ?

All SSDs are OK and only two of the twelve HDDs are OK.  I'm redeploying now and going to try to just put OSDs on the HDDs and see if results differ.

what version do you have ?

I first saw the issue on a clean install of 2.5.0.  I also upgraded it to the 2.5.2 and the problem persists.

As I said earlier I just tried redeploying with only the 12 HDD and 1 SSD journal.  Like before they all show up online after the installation, but after a node reboot only 2 of the 12 come back up.

 

root@vlab-ext-jfesx77-pvsa:~# tail -f /opt/petasan/log/PetaSAN.log
27/04/2020 12:03:34 ERROR    Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 13 707f2d2b-0046-4409-875a-1412445ea5e5
27/04/2020 12:03:34 ERROR    Error activating osd.
27/04/2020 12:03:34 INFO     Try activate all osds fallback ...
27/04/2020 12:03:57 ERROR    Error activate osd fallback.
27/04/2020 12:03:57 ERROR    Error force activate osd  sdo
27/04/2020 12:04:27 INFO     starting activate osd.
27/04/2020 12:04:51 INFO     Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 20 353d61a0-d6aa-4ded-81dd-a2f24649362c
27/04/2020 12:04:51 ERROR    Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 20 353d61a0-d6aa-4ded-81dd-a2f24649362c
27/04/2020 12:04:51 ERROR    Error activating osd.
27/04/2020 12:04:51 INFO     Try activate all osds fallback ...
27/04/2020 12:05:14 ERROR    Error activate osd fallback.
27/04/2020 12:05:14 ERROR    Error force activate osd  sdp
27/04/2020 12:05:44 INFO     starting activate osd.
27/04/2020 12:06:08 INFO     Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 21 5d254f5a-b718-47df-80ee-abdcecfb5dfa
27/04/2020 12:06:09 ERROR    Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 21 5d254f5a-b718-47df-80ee-abdcecfb5dfa
27/04/2020 12:06:09 ERROR    Error activating osd.
27/04/2020 12:06:09 INFO     Try activate all osds fallback ...
27/04/2020 12:06:32 ERROR    Error activate osd fallback.
27/04/2020 12:06:32 ERROR    Error force activate osd  sdq
27/04/2020 12:07:02 INFO     starting activate osd.
27/04/2020 12:07:25 INFO     Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 18 5466c85b-4204-4ac7-bf53-429c9e33deeb
27/04/2020 12:07:56 INFO     starting activate osd.
27/04/2020 12:08:19 INFO     Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 19 0fd8dd56-8108-47b6-bab4-7e98e1c19e19
27/04/2020 12:08:20 ERROR    Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 19 0fd8dd56-8108-47b6-bab4-7e98e1c19e19
27/04/2020 12:08:20 ERROR    Error activating osd.
27/04/2020 12:08:20 INFO     Try activate all osds fallback ...
27/04/2020 12:08:41 ERROR    Error activate osd fallback.
27/04/2020 12:08:41 ERROR    Error force activate osd  sds
27/04/2020 12:09:11 INFO     starting activate osd.
27/04/2020 12:09:33 INFO     Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 16 597b3105-e01a-4939-a04a-b4637c54193d
27/04/2020 12:09:34 ERROR    Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 16 597b3105-e01a-4939-a04a-b4637c54193d
27/04/2020 12:09:34 ERROR    Error activating osd.
27/04/2020 12:09:34 INFO     Try activate all osds fallback ...
27/04/2020 12:09:56 ERROR    Error activate osd fallback.
27/04/2020 12:09:56 ERROR    Error force activate osd  sdt
27/04/2020 12:10:26 INFO     starting activate osd.
27/04/2020 12:10:48 INFO     Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 17 b05d6d29-16a0-4d3b-9297-3ad40ca549cf
27/04/2020 12:10:49 ERROR    Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 17 b05d6d29-16a0-4d3b-9297-3ad40ca549cf
27/04/2020 12:10:49 ERROR    Error activating osd.
27/04/2020 12:10:49 INFO     Try activate all osds fallback ...
27/04/2020 12:11:11 ERROR    Error activate osd fallback.
27/04/2020 12:11:11 ERROR    Error force activate osd  sdu
27/04/2020 12:11:41 INFO     starting activate osd.
27/04/2020 12:12:03 INFO     Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 14 346ba8e1-874e-4c41-be40-d0f0e49527cb
27/04/2020 12:12:04 ERROR    Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 14 346ba8e1-874e-4c41-be40-d0f0e49527cb
27/04/2020 12:12:04 ERROR    Error activating osd.
27/04/2020 12:12:04 INFO     Try activate all osds fallback ...
27/04/2020 12:12:25 ERROR    Error activate osd fallback.
27/04/2020 12:12:25 ERROR    Error force activate osd  sdv
27/04/2020 12:12:55 INFO     starting activate osd.
27/04/2020 12:13:18 INFO     Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 15 b7461add-27b6-4557-a3e4-e1dfe8c22eb9
27/04/2020 12:13:19 ERROR    Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 15 b7461add-27b6-4557-a3e4-e1dfe8c22eb9
27/04/2020 12:13:19 ERROR    Error activating osd.
27/04/2020 12:13:19 INFO     Try activate all osds fallback ...
27/04/2020 12:13:40 ERROR    Error activate osd fallback.
27/04/2020 12:13:40 ERROR    Error force activate osd  sdw
27/04/2020 12:14:10 INFO     starting activate osd.
27/04/2020 12:14:33 INFO     Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 22 33b0af45-d492-424e-9837-c2fae49def9a
27/04/2020 12:14:34 ERROR    Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 22 33b0af45-d492-424e-9837-c2fae49def9a
27/04/2020 12:14:34 ERROR    Error activating osd.
27/04/2020 12:14:34 INFO     Try activate all osds fallback ...
27/04/2020 12:14:55 ERROR    Error activate osd fallback.
27/04/2020 12:14:55 ERROR    Error force activate osd  sdx
27/04/2020 12:15:25 INFO     starting activate osd.
27/04/2020 12:15:48 INFO     Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 23 6be3fb10-741a-4431-91c0-f5ae51e249bb
27/04/2020 12:15:49 ERROR    Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 23 6be3fb10-741a-4431-91c0-f5ae51e249bb
27/04/2020 12:15:49 ERROR    Error activating osd.
27/04/2020 12:15:49 INFO     Try activate all osds fallback ...
27/04/2020 12:16:10 ERROR    Error activate osd fallback.
27/04/2020 12:16:10 ERROR    Error force activate osd  sdy
^C
root@vlab-ext-jfesx77-pvsa:~# ceph osd tree
ID CLASS WEIGHT    TYPE NAME                      STATUS REWEIGHT PRI-AFF
-1       133.10266 root default
-5        44.36755     host vlab-ext-jfesx77-pvsa
12   hdd   3.69730         osd.12                     up  1.00000 1.00000
13   hdd   3.69730         osd.13                   down        0 1.00000
14   hdd   3.69730         osd.14                   down        0 1.00000
15   hdd   3.69730         osd.15                   down        0 1.00000
16   hdd   3.69730         osd.16                   down        0 1.00000
17   hdd   3.69730         osd.17                   down        0 1.00000
18   hdd   3.69730         osd.18                     up  1.00000 1.00000
19   hdd   3.69730         osd.19                   down        0 1.00000
20   hdd   3.69730         osd.20                   down        0 1.00000
21   hdd   3.69730         osd.21                   down        0 1.00000
22   hdd   3.69730         osd.22                   down        0 1.00000
23   hdd   3.69730         osd.23                   down        0 1.00000

can you show output of

parted /dev/sdX print

sdX is the journal device

Pages: 1 2