Forums

Home / Forums

You need to log in to create posts and topics. Login · Register

Ceph Health Problem

Did anything lead to this? Were any OSDs deleted?

Is it stuck in this state, or is it slowly recovering?

If it is stuck, query one of the stuck PGs (ceph pg <pgid> query) and see what it reports under "recovery_state".

No OSDs are bad or have been removed.

root@peta-2:/etc/ceph# ceph health detail
HEALTH_WARN Reduced data availability: 72 pgs inactive, 64 pgs incomplete
PG_AVAILABILITY Reduced data availability: 72 pgs inactive, 64 pgs incomplete
pg 1.16 is incomplete, acting [48,44]
pg 1.6d is incomplete, acting [2,50]
pg 1.7a is incomplete, acting [34,17]
pg 1.7f is incomplete, acting [29,14]
pg 1.155 is stuck inactive for 103011.099283, current state unknown, last acting []
pg 1.1ce is incomplete, acting [53,3]
pg 1.1eb is incomplete, acting [9,19]
pg 1.1fb is incomplete, acting [30,46]
pg 1.22b is incomplete, acting [48,26]
pg 1.2c1 is incomplete, acting [42,15]
pg 1.302 is incomplete, acting [29,18]
pg 1.319 is incomplete, acting [21,33]
pg 1.348 is incomplete, acting [29,5]
pg 1.356 is incomplete, acting [4,39]
pg 1.363 is incomplete, acting [42,17]
pg 1.37d is incomplete, acting [48,22]
pg 1.396 is incomplete, acting [1,41]
pg 1.3b8 is incomplete, acting [42,22]
pg 1.3fe is incomplete, acting [14,25]
pg 1.409 is incomplete, acting [29,17]
pg 1.449 is stuck inactive for 103011.099283, current state unknown, last acting []
pg 1.46f is stuck inactive for 103011.099283, current state unknown, last acting []
pg 1.4d8 is incomplete, acting [45,4]
pg 1.4f7 is incomplete, acting [39,23]
pg 1.520 is stuck inactive for 103011.099283, current state unknown, last acting []
pg 1.558 is incomplete, acting [50,23]
pg 1.651 is incomplete, acting [51,11]
pg 1.692 is incomplete, acting [22,29]
pg 1.6a6 is incomplete, acting [7,18]
pg 1.6cf is stuck inactive for 103011.099283, current state unknown, last acting []
pg 1.6d8 is incomplete, acting [2,47]
pg 1.6e8 is incomplete, acting [9,33]
pg 1.6f1 is incomplete, acting [0,38]
pg 1.76c is incomplete, acting [7,52]
pg 1.7b7 is incomplete, acting [40,21]
pg 1.7b9 is incomplete, acting [14,25]
pg 1.7cf is incomplete, acting [0,50]
pg 1.7d8 is incomplete, acting [29,17]
pg 1.7fa is incomplete, acting [33,41]
pg 1.844 is incomplete, acting [38,2]
pg 1.8b3 is incomplete, acting [47,2]
pg 1.909 is incomplete, acting [7,37]
pg 1.94f is incomplete, acting [29,16]
pg 1.964 is incomplete, acting [37,4]
pg 1.974 is stuck inactive since forever, current state incomplete, last acting [33,53]
pg 1.eeb is incomplete, acting [21,44]
pg 1.f2c is incomplete, acting [4,47]
pg 1.f95 is incomplete, acting [2,29]
pg 1.fb8 is incomplete, acting [2,25]
pg 1.fe7 is incomplete, acting [0,52]
pg 1.ff8 is incomplete, acting [7,50]

 

ceph pg 1.ff8 query

                "created": 0,
"last_epoch_clean": 0,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "0'0",
"last_scrub_stamp": "0.000000",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "0.000000",
"last_clean_scrub_stamp": "0.000000",
"log_size": 0,
"ondisk_log_size": 0,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 0,
"num_objects": 0,
"num_object_clones": 0,
"num_object_copies": 0,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 0,
"num_whiteouts": 0,
"num_read": 0,
"num_read_kb": 0,
"num_write": 0,
"num_write_kb": 0,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0
},
"up": [],
"acting": [],
"blocked_by": [],
"up_primary": -1,
"acting_primary": -1
},
"empty": 1,
"dne": 1,
"incomplete": 0,
"last_epoch_started": 0,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
{
"peer": "50",
"pgid": "1.ff8",
"last_update": "0'0",
"last_complete": "0'0",
"log_tail": "0'0",
"last_user_version": 0,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": [],
"history": {
"epoch_created": 0,
"epoch_pool_created": 0,
"last_epoch_started": 0,
"last_interval_started": 0,
"last_epoch_clean": 0,
"last_interval_clean": 0,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 0,
"same_interval_since": 0,
"same_primary_since": 0,
"last_scrub": "0'0",
"last_scrub_stamp": "0.000000",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "0.000000",
"last_clean_scrub_stamp": "0.000000"
},
"stats": {
"version": "0'0",
"reported_seq": "0",
"reported_epoch": "0",
"state": "unknown",
"last_fresh": "0.000000",
"last_change": "0.000000",
"last_active": "0.000000",
"last_peered": "0.000000",
"last_clean": "0.000000",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "0.000000",
"last_undegraded": "0.000000",
"last_fullsized": "0.000000",
"mapping_epoch": 0,
"log_start": "0'0",
"ondisk_log_start": "0'0",
"created": 0,
"last_epoch_clean": 0,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "0'0",
"last_scrub_stamp": "0.000000",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "0.000000",
"last_clean_scrub_stamp": "0.000000",
"log_size": 0,
"ondisk_log_size": 0,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 0,
"num_objects": 0,
"num_object_clones": 0,
"num_object_copies": 0,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 0,
"num_whiteouts": 0,
"num_read": 0,
"num_read_kb": 0,
"num_write": 0,
"num_write_kb": 0,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0
},
"up": [],
"acting": [],
"blocked_by": [],
"up_primary": -1,
"acting_primary": -1
},
"empty": 1,
"dne": 1,
"incomplete": 0,
"last_epoch_started": 0,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
}
],
"recovery_state": [
{
"name": "Started/Primary/Peering/Incomplete",
"enter_time": "2018-11-07 09:22:42.983356",
"comment": "not enough complete instances of this PG"
},
{
"name": "Started/Primary/Peering",
"enter_time": "2018-11-07 09:22:42.947534",
"past_intervals": [
{
"first": "2098",
"last": "6839",
"all_participants": [
{
"osd": 7
},
{
"osd": 20
},
{
"osd": 48
},
{
"osd": 50
}
],
"intervals": [
{
"first": "4262",
"last": "4271",
"acting": "48"
},
{
"first": "4272",
"last": "4441",
"acting": "20"
},
{
"first": "6837",
"last": "6839",
"acting": "7"
}
]
}
],
"probing_osds": [
"7",
"20",
"48",
"50"
],
"down_osds_we_would_probe": [],
"peering_blocked_by": [],
"peering_blocked_by_detail": [
{
"detail": "peering_blocked_by_history_les_bound"
}
]
},
{
"name": "Started",
"enter_time": "2018-11-07 09:22:42.947468"
}
],
"agent_state": {}
}

 

best regards

 

marcel

 

 

Try restarting the following OSDs: 7, 20, 48 and 50, and see if it helps.
The problem lies in:
"detail": "peering_blocked_by_history_les_bound"
You can search for the meaning of this error and the proposed fixes.
You can also look at the logs of the above OSDs.

// closed

I have reinstalled with the new release.

The error came from a hard reboot after power switching.

 

best regards

Marcel