Multiple HDD OSD down after node/service restart
brent
11 Posts
April 27, 2020, 4:11 pm
Hi, I'm attempting to deploy a 3-node cluster on bare metal; each node includes 12 SSDs and 12 HDDs. I set the first two SSDs on each node up as journals. My issue is that when I restart a node or its services, almost all of the HDD OSDs do not come back online, and I cannot restart them without errors. The only method that has worked is to delete and re-add them as OSDs from the dashboard UI, but this is very tedious, and if I restart the node or the Ceph services they just go down again. Any help you can provide is greatly appreciated!
root@vlab-ext-jfesx77-pvsa:~# cat /opt/petasan/config/cluster_info.json
{
"backend_1_base_ip": "10.0.1.0",
"backend_1_eth_name": "eth2",
"backend_1_mask": "255.255.255.0",
"backend_1_vlan_id": "",
"backend_2_base_ip": "",
"backend_2_eth_name": "",
"backend_2_mask": "",
"backend_2_vlan_id": "",
"bonds": [],
"cifs_eth_name": "eth1",
"default_pool": "both",
"default_pool_pgs": "4096",
"default_pool_replicas": "3",
"eth_count": 3,
"iscsi_1_eth_name": "eth1",
"iscsi_2_eth_name": "eth1",
"jf_mtu_size": "9000",
"jumbo_frames": [
"eth1",
"eth2"
],
"management_eth_name": "eth0",
"management_nodes": [
{
"backend_1_ip": "10.0.1.167",
"backend_2_ip": "",
"is_backup": false,
"is_cifs": false,
"is_iscsi": true,
"is_management": true,
"is_storage": true,
"management_ip": "10.212.86.167",
"name": "vlab-ext-jfesx77-pvsa"
},
{
"backend_1_ip": "10.0.1.168",
"backend_2_ip": "",
"is_backup": false,
"is_cifs": false,
"is_iscsi": true,
"is_management": true,
"is_storage": true,
"management_ip": "10.212.86.168",
"name": "vlab-ext-jfesx78-pvsa"
},
{
"backend_1_ip": "10.0.1.169",
"backend_2_ip": "",
"is_backup": false,
"is_cifs": false,
"is_iscsi": true,
"is_management": true,
"is_storage": true,
"management_ip": "10.212.86.169",
"name": "vlab-ext-jfesx79-pvsa"
}
],
"name": "JF-EXT-BDW",
"storage_engine": "bluestore"
root@vlab-ext-jfesx77-pvsa:~# tail -f /opt/petasan/log/PetaSAN.log
27/04/2020 08:53:30 ERROR Error activate osd fallback.
27/04/2020 08:53:30 ERROR Error force activate osd sdr
27/04/2020 08:54:00 INFO starting activate osd.
27/04/2020 08:54:07 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 39 c9842b59-e635-434c-8936-e022fd89c6c8
27/04/2020 08:54:07 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 39 c9842b59-e635-434c-8936-e022fd89c6c8
27/04/2020 08:54:07 ERROR Error activating osd.
27/04/2020 08:54:07 INFO Try activate all osds fallback ...
27/04/2020 08:54:13 ERROR Error activate osd fallback.
27/04/2020 08:54:13 ERROR Error force activate osd sds
27/04/2020 08:54:43 INFO starting activate osd.
27/04/2020 08:54:50 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 36 87bacd05-2161-4eaf-8d5b-8e04f644b95c
27/04/2020 08:54:50 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 36 87bacd05-2161-4eaf-8d5b-8e04f644b95c
27/04/2020 08:54:50 ERROR Error activating osd.
27/04/2020 08:54:50 INFO Try activate all osds fallback ...
27/04/2020 08:54:56 ERROR Error activate osd fallback.
27/04/2020 08:54:56 ERROR Error force activate osd sdt
root@vlab-ext-jfesx77-pvsa:~# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-1 161.05142 root default
-13 161.05142 rack rack
-7 53.68381 host vlab-ext-jfesx77-pvsa
26 hdd 3.69699 osd.26 up 1.00000 1.00000
27 hdd 3.69699 osd.27 down 0 1.00000
34 hdd 3.69699 osd.34 down 0 1.00000
35 hdd 3.69699 osd.35 down 0 1.00000
36 hdd 3.69699 osd.36 down 0 1.00000
37 hdd 3.69699 osd.37 down 0 1.00000
38 hdd 3.69699 osd.38 up 1.00000 1.00000
39 hdd 3.69699 osd.39 down 0 1.00000
40 hdd 3.69699 osd.40 up 1.00000 1.00000
41 hdd 3.69699 osd.41 down 0 1.00000
42 hdd 3.69699 osd.42 down 1.00000 1.00000
43 hdd 3.69699 osd.43 down 0 1.00000
22 ssd 0.93199 osd.22 up 1.00000 1.00000
23 ssd 0.93199 osd.23 up 1.00000 1.00000
24 ssd 0.93199 osd.24 up 1.00000 1.00000
25 ssd 0.93199 osd.25 up 1.00000 1.00000
28 ssd 0.93199 osd.28 up 1.00000 1.00000
29 ssd 0.93199 osd.29 up 1.00000 1.00000
30 ssd 0.93199 osd.30 up 1.00000 1.00000
31 ssd 0.93199 osd.31 up 1.00000 1.00000
32 ssd 0.93199 osd.32 up 1.00000 1.00000
33 ssd 0.93199 osd.33 up 1.00000 1.00000
-10 53.68381 host vlab-ext-jfesx78-pvsa
48 hdd 3.69699 osd.48 down 0 1.00000
49 hdd 3.69699 osd.49 down 0 1.00000
56 hdd 3.69699 osd.56 down 0 1.00000
57 hdd 3.69699 osd.57 down 0 1.00000
58 hdd 3.69699 osd.58 down 0 1.00000
59 hdd 3.69699 osd.59 down 0 1.00000
60 hdd 3.69699 osd.60 down 0 1.00000
61 hdd 3.69699 osd.61 down 0 1.00000
62 hdd 3.69699 osd.62 down 0 1.00000
63 hdd 3.69699 osd.63 up 1.00000 1.00000
64 hdd 3.69699 osd.64 down 0 1.00000
65 hdd 3.69699 osd.65 up 1.00000 1.00000
44 ssd 0.93199 osd.44 up 1.00000 1.00000
45 ssd 0.93199 osd.45 up 1.00000 1.00000
46 ssd 0.93199 osd.46 up 1.00000 1.00000
47 ssd 0.93199 osd.47 up 1.00000 1.00000
50 ssd 0.93199 osd.50 up 1.00000 1.00000
51 ssd 0.93199 osd.51 up 1.00000 1.00000
52 ssd 0.93199 osd.52 up 1.00000 1.00000
53 ssd 0.93199 osd.53 up 1.00000 1.00000
54 ssd 0.93199 osd.54 up 1.00000 1.00000
55 ssd 0.93199 osd.55 up 1.00000 1.00000
-3 53.68381 host vlab-ext-jfesx79-pvsa
4 hdd 3.69699 osd.4 down 0 1.00000
5 hdd 3.69699 osd.5 down 0 1.00000
12 hdd 3.69699 osd.12 down 0 1.00000
13 hdd 3.69699 osd.13 down 0 1.00000
14 hdd 3.69699 osd.14 down 0 1.00000
15 hdd 3.69699 osd.15 down 0 1.00000
16 hdd 3.69699 osd.16 up 1.00000 1.00000
17 hdd 3.69699 osd.17 down 0 1.00000
18 hdd 3.69699 osd.18 up 1.00000 1.00000
19 hdd 3.69699 osd.19 down 0 1.00000
20 hdd 3.69699 osd.20 down 0 1.00000
21 hdd 3.69699 osd.21 down 0 1.00000
0 ssd 0.93199 osd.0 up 1.00000 1.00000
1 ssd 0.93199 osd.1 up 1.00000 1.00000
2 ssd 0.93199 osd.2 up 1.00000 1.00000
3 ssd 0.93199 osd.3 up 1.00000 1.00000
6 ssd 0.93199 osd.6 up 1.00000 1.00000
7 ssd 0.93199 osd.7 up 1.00000 1.00000
8 ssd 0.93199 osd.8 up 1.00000 1.00000
9 ssd 0.93199 osd.9 up 1.00000 1.00000
10 ssd 0.93199 osd.10 up 1.00000 1.00000
11 ssd 0.93199 osd.11 up 1.00000 1.00000
root@vlab-ext-jfesx77-pvsa:~# systemctl restart ceph-osd@27
root@vlab-ext-jfesx77-pvsa:~# systemctl status ceph-osd@27
● ceph-osd@27.service - Ceph object storage daemon osd.27
Loaded: loaded (/lib/systemd/system/ceph-osd@.service; indirect; vendor preset: enabled)
Active: failed (Result: exit-code) since Mon 2020-04-27 08:59:31 PDT; 4s ago
Process: 27804 ExecStart=/usr/bin/ceph-osd -f --cluster ${CLUSTER} --id 27 --setuser ceph --setgroup ceph (code=exited, status=1/FAILURE)
Process: 27800 ExecStartPre=/usr/lib/ceph/ceph-osd-prestart.sh --cluster ${CLUSTER} --id 27 (code=exited, status=0/SUCCESS)
Main PID: 27804 (code=exited, status=1/FAILURE)
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Main process exited, code=exited, status=1/FAILURE
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Failed with result 'exit-code'.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Service hold-off time over, scheduling restart.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Scheduled restart job, restart counter is at 5.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: Stopped Ceph object storage daemon osd.27.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Start request repeated too quickly.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Failed with result 'exit-code'.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: Failed to start Ceph object storage daemon osd.27.
Thanks!
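For the "Start request repeated too quickly" state shown above, a minimal sketch of how one might clear systemd's start limit and pull the daemon's own error before retrying (generic systemd/ceph-volume commands; the OSD id and the ceph-volume log file name are assumptions based on the output above):

# Clear the start-rate limit on the failed unit, then retry it once.
systemctl reset-failed ceph-osd@27
systemctl start ceph-osd@27

# Read the daemon's own error output rather than just the unit status.
journalctl -u ceph-osd@27 -n 50 --no-pager

# ceph-volume was invoked with --log-path /opt/petasan/log, so its activation
# errors should be in a ceph-volume.log under that directory (assumed name).
tail -n 100 /opt/petasan/log/ceph-volume.log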
admin
2,930 Posts
April 27, 2020, 4:16 pm
Do you have enough RAM?
If you deploy the cluster with, say, 1 OSD per node, does it work?
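As a quick sanity check on the RAM theory, two generic Linux commands that show whether OSDs are being killed for memory (illustrative only, not commands from this thread):

# Look for OOM-killer activity and overall memory headroom on the node.
dmesg -T | grep -i -E 'out of memory|oom'
free -h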
brent
11 Posts
April 27, 2020, 4:20 pm
I have 64 GB of memory per node. I can try redeploying, but I suspect that, yes, with one OSD per node it would work.
admin
2,930 Posts
April 27, 2020, 4:32 pm
24/22 OSDs per node requires more than 64 GB of RAM. Each OSD requires 4 GB, and the other services need RAM as well (check our guide); if you use SSD cache devices, they also need RAM (2% of the disk partition size).
I would test with, say, 8 OSDs in total per host; if you have no issues, then it is probably purely a RAM problem.
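As a rough worked example of that sizing, using the disk counts from this cluster (12 HDDs plus 12 SSDs per node, two of the SSDs used as journals, so 22 OSDs per node):

22 OSDs x 4 GB ≈ 88 GB per node for the OSDs alone, before the monitor, iSCSI gateway and other services are counted, which is already well above 64 GB.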
brent
11 Posts
April 27, 2020, 5:02 pm
Just for clarity, I realize I misspoke when I said this is bare metal. It's 3 nodes with VMware ESXi installed, but all of the disks are raw device mapped. What I meant is that I'm not using VM-backed storage; the HDDs/SSDs are passed through. When I look at the memory metrics for these nodes, I don't see them using more than 5 GB. I increased node 77 to 96 GB and see no change. Thoughts?
root@vlab-ext-jfesx77-pvsa:~# free -h
total used free shared buff/cache available
Mem: 94G 1.6G 92G 9.2M 742M 92G
Swap: 0B 0B 0B
root@vlab-ext-jfesx78-pvsa:/proc/sys/kernel# free -h
total used free shared buff/cache available
Mem: 62G 4.4G 54G 7.2M 4.1G 58G
Swap: 0B 0B 0B
root@vlab-ext-jfesx79-pvsa:~# free -h
total used free shared buff/cache available
Mem: 62G 2.8G 56G 7.8M 4.0G 59G
Swap: 0B 0B 0B
admin
2,930 Posts
April 27, 2020, 5:25 pm
Are your SSD OSDs all fine, with the problem only on the HDDs? Do all HDDs have the issue, or only some? Is the issue only with OSDs that use a journal?
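One way to answer the journal question is to map each OSD on a node to its underlying devices; ceph-volume can list this (a generic command, shown here as an illustration):

# Lists every LVM-based OSD on this node, including its data device and,
# where one was configured, the block.db (journal) device.
ceph-volume lvm list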
brent
11 Posts
April 27, 2020, 5:37 pm
All SSDs are OK, and only two of the twelve HDDs are OK. I'm redeploying now and will try putting OSDs only on the HDDs to see if the results differ.
admin
2,930 Posts
April 27, 2020, 7:28 pm
What version do you have?
brent
11 Posts
April 27, 2020, 8:11 pm
I first saw the issue on a clean install of 2.5.0. I also upgraded to 2.5.2 and the problem persists.
As I said earlier, I just tried redeploying with only the 12 HDDs and 1 SSD journal. Like before, they all show up online after the installation, but after a node reboot only 2 of the 12 come back up.
root@vlab-ext-jfesx77-pvsa:~# tail -f /opt/petasan/log/PetaSAN.log
27/04/2020 12:03:34 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 13 707f2d2b-0046-4409-875a-1412445ea5e5
27/04/2020 12:03:34 ERROR Error activating osd.
27/04/2020 12:03:34 INFO Try activate all osds fallback ...
27/04/2020 12:03:57 ERROR Error activate osd fallback.
27/04/2020 12:03:57 ERROR Error force activate osd sdo
27/04/2020 12:04:27 INFO starting activate osd.
27/04/2020 12:04:51 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 20 353d61a0-d6aa-4ded-81dd-a2f24649362c
27/04/2020 12:04:51 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 20 353d61a0-d6aa-4ded-81dd-a2f24649362c
27/04/2020 12:04:51 ERROR Error activating osd.
27/04/2020 12:04:51 INFO Try activate all osds fallback ...
27/04/2020 12:05:14 ERROR Error activate osd fallback.
27/04/2020 12:05:14 ERROR Error force activate osd sdp
27/04/2020 12:05:44 INFO starting activate osd.
27/04/2020 12:06:08 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 21 5d254f5a-b718-47df-80ee-abdcecfb5dfa
27/04/2020 12:06:09 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 21 5d254f5a-b718-47df-80ee-abdcecfb5dfa
27/04/2020 12:06:09 ERROR Error activating osd.
27/04/2020 12:06:09 INFO Try activate all osds fallback ...
27/04/2020 12:06:32 ERROR Error activate osd fallback.
27/04/2020 12:06:32 ERROR Error force activate osd sdq
27/04/2020 12:07:02 INFO starting activate osd.
27/04/2020 12:07:25 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 18 5466c85b-4204-4ac7-bf53-429c9e33deeb
27/04/2020 12:07:56 INFO starting activate osd.
27/04/2020 12:08:19 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 19 0fd8dd56-8108-47b6-bab4-7e98e1c19e19
27/04/2020 12:08:20 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 19 0fd8dd56-8108-47b6-bab4-7e98e1c19e19
27/04/2020 12:08:20 ERROR Error activating osd.
27/04/2020 12:08:20 INFO Try activate all osds fallback ...
27/04/2020 12:08:41 ERROR Error activate osd fallback.
27/04/2020 12:08:41 ERROR Error force activate osd sds
27/04/2020 12:09:11 INFO starting activate osd.
27/04/2020 12:09:33 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 16 597b3105-e01a-4939-a04a-b4637c54193d
27/04/2020 12:09:34 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 16 597b3105-e01a-4939-a04a-b4637c54193d
27/04/2020 12:09:34 ERROR Error activating osd.
27/04/2020 12:09:34 INFO Try activate all osds fallback ...
27/04/2020 12:09:56 ERROR Error activate osd fallback.
27/04/2020 12:09:56 ERROR Error force activate osd sdt
27/04/2020 12:10:26 INFO starting activate osd.
27/04/2020 12:10:48 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 17 b05d6d29-16a0-4d3b-9297-3ad40ca549cf
27/04/2020 12:10:49 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 17 b05d6d29-16a0-4d3b-9297-3ad40ca549cf
27/04/2020 12:10:49 ERROR Error activating osd.
27/04/2020 12:10:49 INFO Try activate all osds fallback ...
27/04/2020 12:11:11 ERROR Error activate osd fallback.
27/04/2020 12:11:11 ERROR Error force activate osd sdu
27/04/2020 12:11:41 INFO starting activate osd.
27/04/2020 12:12:03 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 14 346ba8e1-874e-4c41-be40-d0f0e49527cb
27/04/2020 12:12:04 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 14 346ba8e1-874e-4c41-be40-d0f0e49527cb
27/04/2020 12:12:04 ERROR Error activating osd.
27/04/2020 12:12:04 INFO Try activate all osds fallback ...
27/04/2020 12:12:25 ERROR Error activate osd fallback.
27/04/2020 12:12:25 ERROR Error force activate osd sdv
27/04/2020 12:12:55 INFO starting activate osd.
27/04/2020 12:13:18 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 15 b7461add-27b6-4557-a3e4-e1dfe8c22eb9
27/04/2020 12:13:19 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 15 b7461add-27b6-4557-a3e4-e1dfe8c22eb9
27/04/2020 12:13:19 ERROR Error activating osd.
27/04/2020 12:13:19 INFO Try activate all osds fallback ...
27/04/2020 12:13:40 ERROR Error activate osd fallback.
27/04/2020 12:13:40 ERROR Error force activate osd sdw
27/04/2020 12:14:10 INFO starting activate osd.
27/04/2020 12:14:33 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 22 33b0af45-d492-424e-9837-c2fae49def9a
27/04/2020 12:14:34 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 22 33b0af45-d492-424e-9837-c2fae49def9a
27/04/2020 12:14:34 ERROR Error activating osd.
27/04/2020 12:14:34 INFO Try activate all osds fallback ...
27/04/2020 12:14:55 ERROR Error activate osd fallback.
27/04/2020 12:14:55 ERROR Error force activate osd sdx
27/04/2020 12:15:25 INFO starting activate osd.
27/04/2020 12:15:48 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 23 6be3fb10-741a-4431-91c0-f5ae51e249bb
27/04/2020 12:15:49 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 23 6be3fb10-741a-4431-91c0-f5ae51e249bb
27/04/2020 12:15:49 ERROR Error activating osd.
27/04/2020 12:15:49 INFO Try activate all osds fallback ...
27/04/2020 12:16:10 ERROR Error activate osd fallback.
27/04/2020 12:16:10 ERROR Error force activate osd sdy
^C
root@vlab-ext-jfesx77-pvsa:~# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-1 133.10266 root default
-5 44.36755 host vlab-ext-jfesx77-pvsa
12 hdd 3.69730 osd.12 up 1.00000 1.00000
13 hdd 3.69730 osd.13 down 0 1.00000
14 hdd 3.69730 osd.14 down 0 1.00000
15 hdd 3.69730 osd.15 down 0 1.00000
16 hdd 3.69730 osd.16 down 0 1.00000
17 hdd 3.69730 osd.17 down 0 1.00000
18 hdd 3.69730 osd.18 up 1.00000 1.00000
19 hdd 3.69730 osd.19 down 0 1.00000
20 hdd 3.69730 osd.20 down 0 1.00000
21 hdd 3.69730 osd.21 down 0 1.00000
22 hdd 3.69730 osd.22 down 0 1.00000
23 hdd 3.69730 osd.23 down 0 1.00000
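When a single activation fails like this, re-running the same ceph-volume call by hand usually surfaces the underlying error text (the OSD id and fsid below are copied from the log above; lvs is a standard LVM command):

# Re-run one failing activation interactively to see the real error.
ceph-volume lvm activate --bluestore 13 707f2d2b-0046-4409-875a-1412445ea5e5

# Confirm the OSD logical volumes are visible to LVM after the reboot.
lvs -o lv_name,vg_name,lv_tags,devices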
admin
2,930 Posts
April 27, 2020, 9:15 pm
Can you show the output of
parted /dev/sdX print
where sdX is the journal device?
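In the same spirit, a couple of generic commands that show the state of the journal SSD's partition table after a reboot (sdX is a placeholder for the actual journal device, as above):

# Partition table of the journal SSD, as requested.
parted /dev/sdX print

# Cross-check that the kernel still sees the partitions and what they carry.
lsblk -o NAME,SIZE,TYPE,FSTYPE,MOUNTPOINT /dev/sdX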
can you show output of
parted /dev/sdX print
sdX is the journal device
Pages: 1 2
Multiple HDD OSD down after node/service restart
brent
11 Posts
Quote from brent on April 27, 2020, 4:11 pmHi, I'm attempting to deploy a 3 node cluster on baremetal, each of which include 12 SSD and 12 HDD. I set the first two SSD up as journal for each node. My issue is that when I restart a node or its services, almost all of the HDD OSD do not come back online and I cannot restart them without error. The only method that has worked is to use the dashboard UI and delete, then re-add them as OSD, but this is very tedious and if I restart the node or ceph services they just go down again. Any help you can provide is greatly appreciated!
root@vlab-ext-jfesx77-pvsa:~# cat /opt/petasan/config/cluster_info.json
{
"backend_1_base_ip": "10.0.1.0",
"backend_1_eth_name": "eth2",
"backend_1_mask": "255.255.255.0",
"backend_1_vlan_id": "",
"backend_2_base_ip": "",
"backend_2_eth_name": "",
"backend_2_mask": "",
"backend_2_vlan_id": "",
"bonds": [],
"cifs_eth_name": "eth1",
"default_pool": "both",
"default_pool_pgs": "4096",
"default_pool_replicas": "3",
"eth_count": 3,
"iscsi_1_eth_name": "eth1",
"iscsi_2_eth_name": "eth1",
"jf_mtu_size": "9000",
"jumbo_frames": [
"eth1",
"eth2"
],
"management_eth_name": "eth0",
"management_nodes": [
{
"backend_1_ip": "10.0.1.167",
"backend_2_ip": "",
"is_backup": false,
"is_cifs": false,
"is_iscsi": true,
"is_management": true,
"is_storage": true,
"management_ip": "10.212.86.167",
"name": "vlab-ext-jfesx77-pvsa"
},
{
"backend_1_ip": "10.0.1.168",
"backend_2_ip": "",
"is_backup": false,
"is_cifs": false,
"is_iscsi": true,
"is_management": true,
"is_storage": true,
"management_ip": "10.212.86.168",
"name": "vlab-ext-jfesx78-pvsa"
},
{
"backend_1_ip": "10.0.1.169",
"backend_2_ip": "",
"is_backup": false,
"is_cifs": false,
"is_iscsi": true,
"is_management": true,
"is_storage": true,
"management_ip": "10.212.86.169",
"name": "vlab-ext-jfesx79-pvsa"
}
],
"name": "JF-EXT-BDW",
"storage_engine": "bluestore"
root@vlab-ext-jfesx77-pvsa:~# tail -f /opt/petasan/log/PetaSAN.log
27/04/2020 08:53:30 ERROR Error activate osd fallback.
27/04/2020 08:53:30 ERROR Error force activate osd sdr
27/04/2020 08:54:00 INFO starting activate osd.
27/04/2020 08:54:07 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 39 c9842b59-e635-434c-8936-e022fd89c6c8
27/04/2020 08:54:07 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 39 c9842b59-e635-434c-8936-e022fd89c6c8
27/04/2020 08:54:07 ERROR Error activating osd.
27/04/2020 08:54:07 INFO Try activate all osds fallback ...
27/04/2020 08:54:13 ERROR Error activate osd fallback.
27/04/2020 08:54:13 ERROR Error force activate osd sds
27/04/2020 08:54:43 INFO starting activate osd.
27/04/2020 08:54:50 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 36 87bacd05-2161-4eaf-8d5b-8e04f644b95c
27/04/2020 08:54:50 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 36 87bacd05-2161-4eaf-8d5b-8e04f644b95c
27/04/2020 08:54:50 ERROR Error activating osd.
27/04/2020 08:54:50 INFO Try activate all osds fallback ...
27/04/2020 08:54:56 ERROR Error activate osd fallback.
27/04/2020 08:54:56 ERROR Error force activate osd sdtroot@vlab-ext-jfesx77-pvsa:~# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-1 161.05142 root default
-13 161.05142 rack rack
-7 53.68381 host vlab-ext-jfesx77-pvsa
26 hdd 3.69699 osd.26 up 1.00000 1.00000
27 hdd 3.69699 osd.27 down 0 1.00000
34 hdd 3.69699 osd.34 down 0 1.00000
35 hdd 3.69699 osd.35 down 0 1.00000
36 hdd 3.69699 osd.36 down 0 1.00000
37 hdd 3.69699 osd.37 down 0 1.00000
38 hdd 3.69699 osd.38 up 1.00000 1.00000
39 hdd 3.69699 osd.39 down 0 1.00000
40 hdd 3.69699 osd.40 up 1.00000 1.00000
41 hdd 3.69699 osd.41 down 0 1.00000
42 hdd 3.69699 osd.42 down 1.00000 1.00000
43 hdd 3.69699 osd.43 down 0 1.00000
22 ssd 0.93199 osd.22 up 1.00000 1.00000
23 ssd 0.93199 osd.23 up 1.00000 1.00000
24 ssd 0.93199 osd.24 up 1.00000 1.00000
25 ssd 0.93199 osd.25 up 1.00000 1.00000
28 ssd 0.93199 osd.28 up 1.00000 1.00000
29 ssd 0.93199 osd.29 up 1.00000 1.00000
30 ssd 0.93199 osd.30 up 1.00000 1.00000
31 ssd 0.93199 osd.31 up 1.00000 1.00000
32 ssd 0.93199 osd.32 up 1.00000 1.00000
33 ssd 0.93199 osd.33 up 1.00000 1.00000
-10 53.68381 host vlab-ext-jfesx78-pvsa
48 hdd 3.69699 osd.48 down 0 1.00000
49 hdd 3.69699 osd.49 down 0 1.00000
56 hdd 3.69699 osd.56 down 0 1.00000
57 hdd 3.69699 osd.57 down 0 1.00000
58 hdd 3.69699 osd.58 down 0 1.00000
59 hdd 3.69699 osd.59 down 0 1.00000
60 hdd 3.69699 osd.60 down 0 1.00000
61 hdd 3.69699 osd.61 down 0 1.00000
62 hdd 3.69699 osd.62 down 0 1.00000
63 hdd 3.69699 osd.63 up 1.00000 1.00000
64 hdd 3.69699 osd.64 down 0 1.00000
65 hdd 3.69699 osd.65 up 1.00000 1.00000
44 ssd 0.93199 osd.44 up 1.00000 1.00000
45 ssd 0.93199 osd.45 up 1.00000 1.00000
46 ssd 0.93199 osd.46 up 1.00000 1.00000
47 ssd 0.93199 osd.47 up 1.00000 1.00000
50 ssd 0.93199 osd.50 up 1.00000 1.00000
51 ssd 0.93199 osd.51 up 1.00000 1.00000
52 ssd 0.93199 osd.52 up 1.00000 1.00000
53 ssd 0.93199 osd.53 up 1.00000 1.00000
54 ssd 0.93199 osd.54 up 1.00000 1.00000
55 ssd 0.93199 osd.55 up 1.00000 1.00000
-3 53.68381 host vlab-ext-jfesx79-pvsa
4 hdd 3.69699 osd.4 down 0 1.00000
5 hdd 3.69699 osd.5 down 0 1.00000
12 hdd 3.69699 osd.12 down 0 1.00000
13 hdd 3.69699 osd.13 down 0 1.00000
14 hdd 3.69699 osd.14 down 0 1.00000
15 hdd 3.69699 osd.15 down 0 1.00000
16 hdd 3.69699 osd.16 up 1.00000 1.00000
17 hdd 3.69699 osd.17 down 0 1.00000
18 hdd 3.69699 osd.18 up 1.00000 1.00000
19 hdd 3.69699 osd.19 down 0 1.00000
20 hdd 3.69699 osd.20 down 0 1.00000
21 hdd 3.69699 osd.21 down 0 1.00000
0 ssd 0.93199 osd.0 up 1.00000 1.00000
1 ssd 0.93199 osd.1 up 1.00000 1.00000
2 ssd 0.93199 osd.2 up 1.00000 1.00000
3 ssd 0.93199 osd.3 up 1.00000 1.00000
6 ssd 0.93199 osd.6 up 1.00000 1.00000
7 ssd 0.93199 osd.7 up 1.00000 1.00000
8 ssd 0.93199 osd.8 up 1.00000 1.00000
9 ssd 0.93199 osd.9 up 1.00000 1.00000
10 ssd 0.93199 osd.10 up 1.00000 1.00000
11 ssd 0.93199 osd.11 up 1.00000 1.00000root@vlab-ext-jfesx77-pvsa:~# systemctl restart ceph-osd@27
root@vlab-ext-jfesx77-pvsa:~# systemctl status ceph-osd@27
● ceph-osd@27.service - Ceph object storage daemon osd.27
Loaded: loaded (/lib/systemd/system/ceph-osd@.service; indirect; vendor preset: enabled)
Active: failed (Result: exit-code) since Mon 2020-04-27 08:59:31 PDT; 4s ago
Process: 27804 ExecStart=/usr/bin/ceph-osd -f --cluster ${CLUSTER} --id 27 --setuser ceph --setgroup ceph (code=exited, status=1/FAILURE)
Process: 27800 ExecStartPre=/usr/lib/ceph/ceph-osd-prestart.sh --cluster ${CLUSTER} --id 27 (code=exited, status=0/SUCCESS)
Main PID: 27804 (code=exited, status=1/FAILURE)Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Main process exited, code=exited, status=1/FAILURE
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Failed with result 'exit-code'.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Service hold-off time over, scheduling restart.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Scheduled restart job, restart counter is at 5.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: Stopped Ceph object storage daemon osd.27.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Start request repeated too quickly.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Failed with result 'exit-code'.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: Failed to start Ceph object storage daemon osd.27.
Thanks!
Hi, I'm attempting to deploy a 3 node cluster on baremetal, each of which include 12 SSD and 12 HDD. I set the first two SSD up as journal for each node. My issue is that when I restart a node or its services, almost all of the HDD OSD do not come back online and I cannot restart them without error. The only method that has worked is to use the dashboard UI and delete, then re-add them as OSD, but this is very tedious and if I restart the node or ceph services they just go down again. Any help you can provide is greatly appreciated!
root@vlab-ext-jfesx77-pvsa:~# cat /opt/petasan/config/cluster_info.json
{
"backend_1_base_ip": "10.0.1.0",
"backend_1_eth_name": "eth2",
"backend_1_mask": "255.255.255.0",
"backend_1_vlan_id": "",
"backend_2_base_ip": "",
"backend_2_eth_name": "",
"backend_2_mask": "",
"backend_2_vlan_id": "",
"bonds": [],
"cifs_eth_name": "eth1",
"default_pool": "both",
"default_pool_pgs": "4096",
"default_pool_replicas": "3",
"eth_count": 3,
"iscsi_1_eth_name": "eth1",
"iscsi_2_eth_name": "eth1",
"jf_mtu_size": "9000",
"jumbo_frames": [
"eth1",
"eth2"
],
"management_eth_name": "eth0",
"management_nodes": [
{
"backend_1_ip": "10.0.1.167",
"backend_2_ip": "",
"is_backup": false,
"is_cifs": false,
"is_iscsi": true,
"is_management": true,
"is_storage": true,
"management_ip": "10.212.86.167",
"name": "vlab-ext-jfesx77-pvsa"
},
{
"backend_1_ip": "10.0.1.168",
"backend_2_ip": "",
"is_backup": false,
"is_cifs": false,
"is_iscsi": true,
"is_management": true,
"is_storage": true,
"management_ip": "10.212.86.168",
"name": "vlab-ext-jfesx78-pvsa"
},
{
"backend_1_ip": "10.0.1.169",
"backend_2_ip": "",
"is_backup": false,
"is_cifs": false,
"is_iscsi": true,
"is_management": true,
"is_storage": true,
"management_ip": "10.212.86.169",
"name": "vlab-ext-jfesx79-pvsa"
}
],
"name": "JF-EXT-BDW",
"storage_engine": "bluestore"
root@vlab-ext-jfesx77-pvsa:~# tail -f /opt/petasan/log/PetaSAN.log
27/04/2020 08:53:30 ERROR Error activate osd fallback.
27/04/2020 08:53:30 ERROR Error force activate osd sdr
27/04/2020 08:54:00 INFO starting activate osd.
27/04/2020 08:54:07 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 39 c9842b59-e635-434c-8936-e022fd89c6c8
27/04/2020 08:54:07 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 39 c9842b59-e635-434c-8936-e022fd89c6c8
27/04/2020 08:54:07 ERROR Error activating osd.
27/04/2020 08:54:07 INFO Try activate all osds fallback ...
27/04/2020 08:54:13 ERROR Error activate osd fallback.
27/04/2020 08:54:13 ERROR Error force activate osd sds
27/04/2020 08:54:43 INFO starting activate osd.
27/04/2020 08:54:50 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 36 87bacd05-2161-4eaf-8d5b-8e04f644b95c
27/04/2020 08:54:50 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 36 87bacd05-2161-4eaf-8d5b-8e04f644b95c
27/04/2020 08:54:50 ERROR Error activating osd.
27/04/2020 08:54:50 INFO Try activate all osds fallback ...
27/04/2020 08:54:56 ERROR Error activate osd fallback.
27/04/2020 08:54:56 ERROR Error force activate osd sdt
root@vlab-ext-jfesx77-pvsa:~# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-1 161.05142 root default
-13 161.05142 rack rack
-7 53.68381 host vlab-ext-jfesx77-pvsa
26 hdd 3.69699 osd.26 up 1.00000 1.00000
27 hdd 3.69699 osd.27 down 0 1.00000
34 hdd 3.69699 osd.34 down 0 1.00000
35 hdd 3.69699 osd.35 down 0 1.00000
36 hdd 3.69699 osd.36 down 0 1.00000
37 hdd 3.69699 osd.37 down 0 1.00000
38 hdd 3.69699 osd.38 up 1.00000 1.00000
39 hdd 3.69699 osd.39 down 0 1.00000
40 hdd 3.69699 osd.40 up 1.00000 1.00000
41 hdd 3.69699 osd.41 down 0 1.00000
42 hdd 3.69699 osd.42 down 1.00000 1.00000
43 hdd 3.69699 osd.43 down 0 1.00000
22 ssd 0.93199 osd.22 up 1.00000 1.00000
23 ssd 0.93199 osd.23 up 1.00000 1.00000
24 ssd 0.93199 osd.24 up 1.00000 1.00000
25 ssd 0.93199 osd.25 up 1.00000 1.00000
28 ssd 0.93199 osd.28 up 1.00000 1.00000
29 ssd 0.93199 osd.29 up 1.00000 1.00000
30 ssd 0.93199 osd.30 up 1.00000 1.00000
31 ssd 0.93199 osd.31 up 1.00000 1.00000
32 ssd 0.93199 osd.32 up 1.00000 1.00000
33 ssd 0.93199 osd.33 up 1.00000 1.00000
-10 53.68381 host vlab-ext-jfesx78-pvsa
48 hdd 3.69699 osd.48 down 0 1.00000
49 hdd 3.69699 osd.49 down 0 1.00000
56 hdd 3.69699 osd.56 down 0 1.00000
57 hdd 3.69699 osd.57 down 0 1.00000
58 hdd 3.69699 osd.58 down 0 1.00000
59 hdd 3.69699 osd.59 down 0 1.00000
60 hdd 3.69699 osd.60 down 0 1.00000
61 hdd 3.69699 osd.61 down 0 1.00000
62 hdd 3.69699 osd.62 down 0 1.00000
63 hdd 3.69699 osd.63 up 1.00000 1.00000
64 hdd 3.69699 osd.64 down 0 1.00000
65 hdd 3.69699 osd.65 up 1.00000 1.00000
44 ssd 0.93199 osd.44 up 1.00000 1.00000
45 ssd 0.93199 osd.45 up 1.00000 1.00000
46 ssd 0.93199 osd.46 up 1.00000 1.00000
47 ssd 0.93199 osd.47 up 1.00000 1.00000
50 ssd 0.93199 osd.50 up 1.00000 1.00000
51 ssd 0.93199 osd.51 up 1.00000 1.00000
52 ssd 0.93199 osd.52 up 1.00000 1.00000
53 ssd 0.93199 osd.53 up 1.00000 1.00000
54 ssd 0.93199 osd.54 up 1.00000 1.00000
55 ssd 0.93199 osd.55 up 1.00000 1.00000
-3 53.68381 host vlab-ext-jfesx79-pvsa
4 hdd 3.69699 osd.4 down 0 1.00000
5 hdd 3.69699 osd.5 down 0 1.00000
12 hdd 3.69699 osd.12 down 0 1.00000
13 hdd 3.69699 osd.13 down 0 1.00000
14 hdd 3.69699 osd.14 down 0 1.00000
15 hdd 3.69699 osd.15 down 0 1.00000
16 hdd 3.69699 osd.16 up 1.00000 1.00000
17 hdd 3.69699 osd.17 down 0 1.00000
18 hdd 3.69699 osd.18 up 1.00000 1.00000
19 hdd 3.69699 osd.19 down 0 1.00000
20 hdd 3.69699 osd.20 down 0 1.00000
21 hdd 3.69699 osd.21 down 0 1.00000
0 ssd 0.93199 osd.0 up 1.00000 1.00000
1 ssd 0.93199 osd.1 up 1.00000 1.00000
2 ssd 0.93199 osd.2 up 1.00000 1.00000
3 ssd 0.93199 osd.3 up 1.00000 1.00000
6 ssd 0.93199 osd.6 up 1.00000 1.00000
7 ssd 0.93199 osd.7 up 1.00000 1.00000
8 ssd 0.93199 osd.8 up 1.00000 1.00000
9 ssd 0.93199 osd.9 up 1.00000 1.00000
10 ssd 0.93199 osd.10 up 1.00000 1.00000
11 ssd 0.93199 osd.11 up 1.00000 1.00000
root@vlab-ext-jfesx77-pvsa:~# systemctl restart ceph-osd@27
root@vlab-ext-jfesx77-pvsa:~# systemctl status ceph-osd@27
● ceph-osd@27.service - Ceph object storage daemon osd.27
Loaded: loaded (/lib/systemd/system/ceph-osd@.service; indirect; vendor preset: enabled)
Active: failed (Result: exit-code) since Mon 2020-04-27 08:59:31 PDT; 4s ago
Process: 27804 ExecStart=/usr/bin/ceph-osd -f --cluster ${CLUSTER} --id 27 --setuser ceph --setgroup ceph (code=exited, status=1/FAILURE)
Process: 27800 ExecStartPre=/usr/lib/ceph/ceph-osd-prestart.sh --cluster ${CLUSTER} --id 27 (code=exited, status=0/SUCCESS)
Main PID: 27804 (code=exited, status=1/FAILURE)
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Main process exited, code=exited, status=1/FAILURE
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Failed with result 'exit-code'.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Service hold-off time over, scheduling restart.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Scheduled restart job, restart counter is at 5.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: Stopped Ceph object storage daemon osd.27.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Start request repeated too quickly.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: ceph-osd@27.service: Failed with result 'exit-code'.
Apr 27 08:59:31 vlab-ext-jfesx77-pvsa systemd[1]: Failed to start Ceph object storage daemon osd.27.
Thanks!
admin
2,930 Posts
Quote from admin on April 27, 2020, 4:16 pmdo you have enough ram ?
if you deploy the cluster with say 1 OSD per node, does it work ?
do you have enough ram ?
if you deploy the cluster with say 1 OSD per node, does it work ?
brent
11 Posts
Quote from brent on April 27, 2020, 4:20 pmI have 64GB memory per node. I can try redeploying but I suspect that yes if I choose one OSD per node it would work.
I have 64GB memory per node. I can try redeploying but I suspect that yes if I choose one OSD per node it would work.
admin
2,930 Posts
Quote from admin on April 27, 2020, 4:32 pm24/22 OSDs per node requires more that 64 GB RAM. Each OSD requires 4 GB + the other services do need RAM as well (check our guide), also if you do use SSD cache devices, they also need RAM ( 2% of disk partition size)
I would test with say 8 OSDs in total per host, if you have no issue, then it is probably pure RAM.
24/22 OSDs per node requires more that 64 GB RAM. Each OSD requires 4 GB + the other services do need RAM as well (check our guide), also if you do use SSD cache devices, they also need RAM ( 2% of disk partition size)
I would test with say 8 OSDs in total per host, if you have no issue, then it is probably pure RAM.
brent
11 Posts
Quote from brent on April 27, 2020, 5:02 pmJust for clarity I realize I misspoke when I said this is baremetal. It's 3 nodes which have VMware ESXi installed, but all of the disks are raw disk mapped. What I meant was I'm not using VM backed storage, but instead passing through the hdd/ssd. When I look at memory metrics for these I don't see them using more than 5GB. I increased to 96GB one node 77 and see no change. Thoughts?
root@vlab-ext-jfesx77-pvsa:~# free -h
total used free shared buff/cache available
Mem: 94G 1.6G 92G 9.2M 742M 92G
Swap: 0B 0B 0Broot@vlab-ext-jfesx78-pvsa:/proc/sys/kernel# free -h
total used free shared buff/cache available
Mem: 62G 4.4G 54G 7.2M 4.1G 58G
Swap: 0B 0B 0Broot@vlab-ext-jfesx79-pvsa:~# free -h
total used free shared buff/cache available
Mem: 62G 2.8G 56G 7.8M 4.0G 59G
Swap: 0B 0B 0B
Just for clarity I realize I misspoke when I said this is baremetal. It's 3 nodes which have VMware ESXi installed, but all of the disks are raw disk mapped. What I meant was I'm not using VM backed storage, but instead passing through the hdd/ssd. When I look at memory metrics for these I don't see them using more than 5GB. I increased to 96GB one node 77 and see no change. Thoughts?
root@vlab-ext-jfesx77-pvsa:~# free -h
total used free shared buff/cache available
Mem: 94G 1.6G 92G 9.2M 742M 92G
Swap: 0B 0B 0B
root@vlab-ext-jfesx78-pvsa:/proc/sys/kernel# free -h
total used free shared buff/cache available
Mem: 62G 4.4G 54G 7.2M 4.1G 58G
Swap: 0B 0B 0B
root@vlab-ext-jfesx79-pvsa:~# free -h
total used free shared buff/cache available
Mem: 62G 2.8G 56G 7.8M 4.0G 59G
Swap: 0B 0B 0B
admin
2,930 Posts
Quote from admin on April 27, 2020, 5:25 pmAre you SSD OSDs all fine and it is a problem just with HDDs ? do all HDDs have an issue or some ? is the issue only with OSDs with journal ?
Are you SSD OSDs all fine and it is a problem just with HDDs ? do all HDDs have an issue or some ? is the issue only with OSDs with journal ?
brent
11 Posts
Quote from brent on April 27, 2020, 5:37 pmAll SSD are ok and only two of the twelce HDD are ok. I'm redeploying now and going to try to just put OSD on the HDD and see if results differ.
All SSD are ok and only two of the twelce HDD are ok. I'm redeploying now and going to try to just put OSD on the HDD and see if results differ.
admin
2,930 Posts
Quote from admin on April 27, 2020, 7:28 pmwhat version do you have ?
what version do you have ?
brent
11 Posts
Quote from brent on April 27, 2020, 8:11 pmI first saw the issue on a clean install of 2.5.0. I also upgraded it to the 2.5.2 and the problem persists.
As I said earlier I just tried redeploying with only the 12 HDD and 1 SSD journal. Like before they all show up online after the installation, but after a node reboot only 2 of the 12 come back up.
root@vlab-ext-jfesx77-pvsa:~# tail -f /opt/petasan/log/PetaSAN.log
27/04/2020 12:03:34 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 13 707f2d2b-0046-4409-875a-1412445ea5e5
27/04/2020 12:03:34 ERROR Error activating osd.
27/04/2020 12:03:34 INFO Try activate all osds fallback ...
27/04/2020 12:03:57 ERROR Error activate osd fallback.
27/04/2020 12:03:57 ERROR Error force activate osd sdo
27/04/2020 12:04:27 INFO starting activate osd.
27/04/2020 12:04:51 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 20 353d61a0-d6aa-4ded-81dd-a2f24649362c
27/04/2020 12:04:51 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 20 353d61a0-d6aa-4ded-81dd-a2f24649362c
27/04/2020 12:04:51 ERROR Error activating osd.
27/04/2020 12:04:51 INFO Try activate all osds fallback ...
27/04/2020 12:05:14 ERROR Error activate osd fallback.
27/04/2020 12:05:14 ERROR Error force activate osd sdp
27/04/2020 12:05:44 INFO starting activate osd.
27/04/2020 12:06:08 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 21 5d254f5a-b718-47df-80ee-abdcecfb5dfa
27/04/2020 12:06:09 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 21 5d254f5a-b718-47df-80ee-abdcecfb5dfa
27/04/2020 12:06:09 ERROR Error activating osd.
27/04/2020 12:06:09 INFO Try activate all osds fallback ...
27/04/2020 12:06:32 ERROR Error activate osd fallback.
27/04/2020 12:06:32 ERROR Error force activate osd sdq
27/04/2020 12:07:02 INFO starting activate osd.
I first saw the issue on a clean install of 2.5.0. I have since upgraded to 2.5.2 and the problem persists.
As I said earlier, I just tried redeploying with only the 12 HDDs and 1 SSD journal. As before, all of them show up online after the installation, but after a node reboot only 2 of the 12 come back up.
root@vlab-ext-jfesx77-pvsa:~# tail -f /opt/petasan/log/PetaSAN.log
27/04/2020 12:03:34 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 13 707f2d2b-0046-4409-875a-1412445ea5e5
27/04/2020 12:03:34 ERROR Error activating osd.
27/04/2020 12:03:34 INFO Try activate all osds fallback ...
27/04/2020 12:03:57 ERROR Error activate osd fallback.
27/04/2020 12:03:57 ERROR Error force activate osd sdo
27/04/2020 12:04:27 INFO starting activate osd.
27/04/2020 12:04:51 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 20 353d61a0-d6aa-4ded-81dd-a2f24649362c
27/04/2020 12:04:51 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 20 353d61a0-d6aa-4ded-81dd-a2f24649362c
27/04/2020 12:04:51 ERROR Error activating osd.
27/04/2020 12:04:51 INFO Try activate all osds fallback ...
27/04/2020 12:05:14 ERROR Error activate osd fallback.
27/04/2020 12:05:14 ERROR Error force activate osd sdp
27/04/2020 12:05:44 INFO starting activate osd.
27/04/2020 12:06:08 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 21 5d254f5a-b718-47df-80ee-abdcecfb5dfa
27/04/2020 12:06:09 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 21 5d254f5a-b718-47df-80ee-abdcecfb5dfa
27/04/2020 12:06:09 ERROR Error activating osd.
27/04/2020 12:06:09 INFO Try activate all osds fallback ...
27/04/2020 12:06:32 ERROR Error activate osd fallback.
27/04/2020 12:06:32 ERROR Error force activate osd sdq
27/04/2020 12:07:02 INFO starting activate osd.
27/04/2020 12:07:25 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 18 5466c85b-4204-4ac7-bf53-429c9e33deeb
27/04/2020 12:07:56 INFO starting activate osd.
27/04/2020 12:08:19 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 19 0fd8dd56-8108-47b6-bab4-7e98e1c19e19
27/04/2020 12:08:20 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 19 0fd8dd56-8108-47b6-bab4-7e98e1c19e19
27/04/2020 12:08:20 ERROR Error activating osd.
27/04/2020 12:08:20 INFO Try activate all osds fallback ...
27/04/2020 12:08:41 ERROR Error activate osd fallback.
27/04/2020 12:08:41 ERROR Error force activate osd sds
27/04/2020 12:09:11 INFO starting activate osd.
27/04/2020 12:09:33 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 16 597b3105-e01a-4939-a04a-b4637c54193d
27/04/2020 12:09:34 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 16 597b3105-e01a-4939-a04a-b4637c54193d
27/04/2020 12:09:34 ERROR Error activating osd.
27/04/2020 12:09:34 INFO Try activate all osds fallback ...
27/04/2020 12:09:56 ERROR Error activate osd fallback.
27/04/2020 12:09:56 ERROR Error force activate osd sdt
27/04/2020 12:10:26 INFO starting activate osd.
27/04/2020 12:10:48 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 17 b05d6d29-16a0-4d3b-9297-3ad40ca549cf
27/04/2020 12:10:49 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 17 b05d6d29-16a0-4d3b-9297-3ad40ca549cf
27/04/2020 12:10:49 ERROR Error activating osd.
27/04/2020 12:10:49 INFO Try activate all osds fallback ...
27/04/2020 12:11:11 ERROR Error activate osd fallback.
27/04/2020 12:11:11 ERROR Error force activate osd sdu
27/04/2020 12:11:41 INFO starting activate osd.
27/04/2020 12:12:03 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 14 346ba8e1-874e-4c41-be40-d0f0e49527cb
27/04/2020 12:12:04 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 14 346ba8e1-874e-4c41-be40-d0f0e49527cb
27/04/2020 12:12:04 ERROR Error activating osd.
27/04/2020 12:12:04 INFO Try activate all osds fallback ...
27/04/2020 12:12:25 ERROR Error activate osd fallback.
27/04/2020 12:12:25 ERROR Error force activate osd sdv
27/04/2020 12:12:55 INFO starting activate osd.
27/04/2020 12:13:18 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 15 b7461add-27b6-4557-a3e4-e1dfe8c22eb9
27/04/2020 12:13:19 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 15 b7461add-27b6-4557-a3e4-e1dfe8c22eb9
27/04/2020 12:13:19 ERROR Error activating osd.
27/04/2020 12:13:19 INFO Try activate all osds fallback ...
27/04/2020 12:13:40 ERROR Error activate osd fallback.
27/04/2020 12:13:40 ERROR Error force activate osd sdw
27/04/2020 12:14:10 INFO starting activate osd.
27/04/2020 12:14:33 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 22 33b0af45-d492-424e-9837-c2fae49def9a
27/04/2020 12:14:34 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 22 33b0af45-d492-424e-9837-c2fae49def9a
27/04/2020 12:14:34 ERROR Error activating osd.
27/04/2020 12:14:34 INFO Try activate all osds fallback ...
27/04/2020 12:14:55 ERROR Error activate osd fallback.
27/04/2020 12:14:55 ERROR Error force activate osd sdx
27/04/2020 12:15:25 INFO starting activate osd.
27/04/2020 12:15:48 INFO Starting : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 23 6be3fb10-741a-4431-91c0-f5ae51e249bb
27/04/2020 12:15:49 ERROR Error executing : ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 23 6be3fb10-741a-4431-91c0-f5ae51e249bb
27/04/2020 12:15:49 ERROR Error activating osd.
27/04/2020 12:15:49 INFO Try activate all osds fallback ...
27/04/2020 12:16:10 ERROR Error activate osd fallback.
27/04/2020 12:16:10 ERROR Error force activate osd sdy
^C
root@vlab-ext-jfesx77-pvsa:~# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-1 133.10266 root default
-5 44.36755 host vlab-ext-jfesx77-pvsa
12 hdd 3.69730 osd.12 up 1.00000 1.00000
13 hdd 3.69730 osd.13 down 0 1.00000
14 hdd 3.69730 osd.14 down 0 1.00000
15 hdd 3.69730 osd.15 down 0 1.00000
16 hdd 3.69730 osd.16 down 0 1.00000
17 hdd 3.69730 osd.17 down 0 1.00000
18 hdd 3.69730 osd.18 up 1.00000 1.00000
19 hdd 3.69730 osd.19 down 0 1.00000
20 hdd 3.69730 osd.20 down 0 1.00000
21 hdd 3.69730 osd.21 down 0 1.00000
22 hdd 3.69730 osd.22 down 0 1.00000
23 hdd 3.69730 osd.23 down 0 1.00000
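In the meantime, here is roughly what I can run on the node to capture more detail for one of the failing OSDs — a minimal sketch, using osd.13 and its fsid from the log above, and assuming ceph-volume writes its own ceph-volume.log under the --log-path directory:
# re-run the failing activate by hand to see the full error output (osd.13 from the log above)
ceph-volume --log-path /opt/petasan/log lvm activate --bluestore 13 707f2d2b-0046-4409-875a-1412445ea5e5
# ceph-volume's own log, assuming it lands under the --log-path directory
tail -n 50 /opt/petasan/log/ceph-volume.log
# LVM and device state after the reboot
ceph-volume lvm list
lsblk
pvs && vgs && lvs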
admin
2,930 Posts
Quote from admin on April 27, 2020, 9:15 pm
Can you show the output of
parted /dev/sdX print
where sdX is the journal device?
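For example (the device name below is only a placeholder — substitute whichever SSD is actually used as the journal on that node):
# identify the journal/DB SSD referenced by the down OSDs
ceph-volume lvm list
# then print its partition table, e.g. if the journal SSD is /dev/sdb (placeholder)
parted /dev/sdb print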