diff options
Diffstat (limited to 'fs/bcachefs/btree_io.c')
-rw-r--r-- | fs/bcachefs/btree_io.c | 139 |
1 files changed, 58 insertions, 81 deletions
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 7bca15c604f5..839d68802e42 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -46,8 +46,6 @@ void bch2_btree_node_io_unlock(struct btree *b) void bch2_btree_node_io_lock(struct btree *b) { - bch2_assert_btree_nodes_not_locked(); - wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, TASK_UNINTERRUPTIBLE); } @@ -66,16 +64,12 @@ void __bch2_btree_node_wait_on_write(struct btree *b) void bch2_btree_node_wait_on_read(struct btree *b) { - bch2_assert_btree_nodes_not_locked(); - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, TASK_UNINTERRUPTIBLE); } void bch2_btree_node_wait_on_write(struct btree *b) { - bch2_assert_btree_nodes_not_locked(); - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, TASK_UNINTERRUPTIBLE); } @@ -534,7 +528,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, printbuf_indent_add(out, 2); prt_printf(out, "\nnode offset %u/%u", - b->written, btree_ptr_sectors_written(&b->key)); + b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key))); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); if (k) @@ -585,7 +579,7 @@ static int __btree_err(int ret, switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: ret = !silent - ? bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf) + ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf) : -BCH_ERR_fsck_fix; if (ret != -BCH_ERR_fsck_fix && ret != -BCH_ERR_fsck_ignore) @@ -689,6 +683,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, int write, bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); + unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; int ret = 0; @@ -732,15 +727,14 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_node_unsupported_version, "BSET_SEPARATE_WHITEOUTS no longer supported"); - if (btree_err_on(offset + sectors > btree_sectors(c), + if (!write && + btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)), -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, NULL, bset_past_end_of_btree_node, - "bset past end of btree node")) { + "bset past end of btree node (offset %u len %u but written %zu)", + offset, sectors, ptr_written ?: btree_sectors(c))) i->u64s = 0; - ret = 0; - goto out; - } btree_err_on(offset && !i->u64s, -BCH_ERR_btree_node_read_err_fixable, @@ -832,21 +826,19 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BSET_BIG_ENDIAN(i), write, &bn->format); } -out: fsck_err: printbuf_exit(&buf2); printbuf_exit(&buf1); return ret; } -static int bset_key_invalid(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, - bool updated_range, int rw, - struct printbuf *err) +static int bset_key_validate(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, + bool updated_range, int rw) { - return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: - (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?: - (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); + return __bch2_bkey_validate(c, k, btree_node_type(b), 0) ?: + (!updated_range ? bch2_bkey_in_btree_node(c, b, k, 0) : 0) ?: + (rw == WRITE ? bch2_bkey_val_validate(c, k, 0) : 0); } static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, @@ -861,12 +853,9 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, if (!bkeyp_u64s_valid(&b->format, k)) return false; - struct printbuf buf = PRINTBUF; struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf); - printbuf_exit(&buf); - return ret; + return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent); } static int validate_bset_keys(struct bch_fs *c, struct btree *b, @@ -918,19 +907,11 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, u = __bkey_disassemble(b, k, &tmp); - printbuf_reset(&buf); - if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { - printbuf_reset(&buf); - bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - - btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bad_bkey, - "invalid bkey: %s", buf.buf); + ret = bset_key_validate(c, b, u.s_c, updated_range, write); + if (ret == -BCH_ERR_fsck_delete_bkey) goto drop_this_key; - } + if (ret) + goto fsck_err; if (write) bch2_bkey_compat(b->c.level, b->c.btree_id, version, @@ -1002,7 +983,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); unsigned u64s; - unsigned ptr_written = btree_ptr_sectors_written(&b->key); + unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); + u64 max_journal_seq = 0; struct printbuf buf = PRINTBUF; int ret = 0, retry_read = 0, write = READ; u64 start_time = local_clock(); @@ -1178,6 +1160,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sort_iter_add(iter, vstruct_idx(i, 0), vstruct_last(i)); + + max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq)); } if (ptr_written) { @@ -1207,6 +1191,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, set_btree_bset(b, b->set, &b->data->keys); b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); + memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0, + btree_buf_bytes(b) - + sizeof(struct btree_node) - + b->nr.live_u64s * sizeof(u64)); u64s = le16_to_cpu(sorted->keys.u64s); *sorted = *b->data; @@ -1214,6 +1202,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, swap(sorted, b->data); set_btree_bset(b, b->set, &b->data->keys); b->nsets = 1; + b->data->keys.journal_seq = cpu_to_le64(max_journal_seq); BUG_ON(b->nr.live_u64s != u64s); @@ -1227,23 +1216,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - printbuf_reset(&buf); - - if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) || + ret = bch2_bkey_val_validate(c, u.s_c, READ); + if (ret == -BCH_ERR_fsck_delete_bkey || (bch2_inject_invalid_keys && - !bversion_cmp(u.k->version, MAX_VERSION))) { - printbuf_reset(&buf); - - prt_printf(&buf, "invalid bkey: "); - bch2_bkey_val_invalid(c, u.s_c, READ, &buf); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - - btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bad_bkey, - "%s", buf.buf); - + !bversion_cmp(u.k->bversion, MAX_VERSION))) { btree_keys_account_key_drop(&b->nr, 0, k); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); @@ -1252,6 +1228,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, set_btree_bset_end(b, b->set); continue; } + if (ret) + goto fsck_err; if (u.k->type == KEY_TYPE_btree_ptr_v2) { struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); @@ -1688,7 +1666,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bch2_btree_pos_to_text(&buf, c, b); bch_err_ratelimited(c, "%s", buf.buf); - if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && + if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) bch2_fatal_error(c); @@ -1766,13 +1744,13 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, set_btree_node_read_in_flight(b); + /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */ + bch2_trans_unlock(trans); bch2_btree_node_read(trans, b, true); if (btree_node_read_error(b)) { - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_lock(&c->btree_cache.lock); - list_move(&b->list, &c->btree_cache.freeable); + bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); ret = -BCH_ERR_btree_node_read_error; @@ -1796,15 +1774,16 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, struct btree_write *w) { - unsigned long old, new, v = READ_ONCE(b->will_make_reachable); + unsigned long old, new; + old = READ_ONCE(b->will_make_reachable); do { - old = new = v; + new = old; if (!(old & 1)) break; new &= ~1UL; - } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); + } while (!try_cmpxchg(&b->will_make_reachable, &old, new)); if (old & 1) closure_put(&((struct btree_update *) new)->cl); @@ -1815,14 +1794,14 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, static void __btree_node_write_done(struct bch_fs *c, struct btree *b) { struct btree_write *w = btree_prev_write(b); - unsigned long old, new, v; + unsigned long old, new; unsigned type = 0; bch2_btree_complete_write(c, b, w); - v = READ_ONCE(b->flags); + old = READ_ONCE(b->flags); do { - old = new = v; + new = old; if ((old & (1U << BTREE_NODE_dirty)) && (old & (1U << BTREE_NODE_need_write)) && @@ -1842,7 +1821,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) new &= ~(1U << BTREE_NODE_write_in_flight); new &= ~(1U << BTREE_NODE_write_in_flight_inner); } - } while ((v = cmpxchg(&b->flags, old, new)) != old); + } while (!try_cmpxchg(&b->flags, &old, new)); if (new & (1U << BTREE_NODE_write_in_flight)) __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); @@ -1855,10 +1834,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) struct btree_trans *trans = bch2_trans_get(c); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - __btree_node_write_done(c, b); - six_unlock_read(&b->c.lock); + /* we don't need transaction context anymore after we got the lock. */ bch2_trans_put(trans); + __btree_node_write_done(c, b); + six_unlock_read(&b->c.lock); } static void btree_node_write_work(struct work_struct *work) @@ -1887,7 +1867,7 @@ static void btree_node_write_work(struct work_struct *work) } } else { - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_do(c, bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, BCH_WATERMARK_interior_updates| BCH_TRANS_COMMIT_journal_reclaim| @@ -1950,18 +1930,14 @@ static void btree_node_write_endio(struct bio *bio) static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { - struct printbuf buf = PRINTBUF; bool saw_error; - int ret; - ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), - BKEY_TYPE_btree, WRITE, &buf); - - if (ret) - bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); - printbuf_exit(&buf); - if (ret) + int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), + BKEY_TYPE_btree, WRITE); + if (ret) { + bch2_fs_inconsistent(c, "invalid btree node key before write"); return ret; + } ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); @@ -2014,8 +1990,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) * dirty bit requires a write lock, we can't race with other threads * redirtying it: */ + old = READ_ONCE(b->flags); do { - old = new = READ_ONCE(b->flags); + new = old; if (!(old & (1 << BTREE_NODE_dirty))) return; @@ -2046,14 +2023,14 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) new |= (1 << BTREE_NODE_write_in_flight_inner); new |= (1 << BTREE_NODE_just_written); new ^= (1 << BTREE_NODE_write_idx); - } while (cmpxchg_acquire(&b->flags, old, new) != old); + } while (!try_cmpxchg_acquire(&b->flags, &old, new)); if (new & (1U << BTREE_NODE_need_write)) return; do_write: BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); - atomic_dec(&c->btree_cache.dirty); + atomic_long_dec(&c->btree_cache.nr_dirty); BUG_ON(btree_node_fake(b)); BUG_ON((b->will_make_reachable != 0) != !b->written); @@ -2133,7 +2110,7 @@ do_write: if (!b->written && b->key.k.type == KEY_TYPE_btree_ptr_v2) - BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write); + BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write); memset(data + bytes_to_write, 0, (sectors_to_write << 9) - bytes_to_write); |