diff options
Diffstat (limited to 'fs/xfs')
100 files changed, 2861 insertions, 2024 deletions
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 436f686a9891..1193fd6e4bad 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -684,8 +684,10 @@ xfs_alloc_update_counters( xfs_trans_agblocks_delta(tp, len); if (unlikely(be32_to_cpu(agf->agf_freeblks) > - be32_to_cpu(agf->agf_length))) + be32_to_cpu(agf->agf_length))) { + xfs_buf_mark_corrupt(agbp); return -EFSCORRUPTED; + } xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); return 0; @@ -751,6 +753,7 @@ xfs_alloc_ag_vextent_small( bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno); if (!bp) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, args->mp); error = -EFSCORRUPTED; goto error; } @@ -1995,24 +1998,32 @@ xfs_alloc_longest_free_extent( return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } +/* + * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, + * return the largest possible minimum length. + */ unsigned int xfs_alloc_min_freelist( struct xfs_mount *mp, struct xfs_perag *pag) { + /* AG btrees have at least 1 level. */ + static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1}; + const uint8_t *levels = pag ? pag->pagf_levels : fake_levels; unsigned int min_free; + ASSERT(mp->m_ag_maxlevels > 0); + /* space needed by-bno freespace btree */ - min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1, + min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, mp->m_ag_maxlevels); /* space needed by-size freespace btree */ - min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, + min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, mp->m_ag_maxlevels); /* space needed reverse mapping used space btree */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - min_free += min_t(unsigned int, - pag->pagf_levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels); + min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, + mp->m_rmap_maxlevels); return min_free; } @@ -2087,8 +2098,10 @@ xfs_free_agfl_block( return error; bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno); - if (!bp) + if (!bp) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, tp->t_mountp); return -EFSCORRUPTED; + } xfs_trans_binval(tp, bp); return 0; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 510ca6974604..c83ff610ecb6 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1007,7 +1007,7 @@ restart: * The INCOMPLETE flag means that we will find the "old" * attr, not the "new" one. */ - args->flags |= XFS_ATTR_INCOMPLETE; + args->op_flags |= XFS_DA_OP_INCOMPLETE; state = xfs_da_state_alloc(); state->args = args; state->mp = mp; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index de33efc9b4f9..2b74b6e9a354 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -251,14 +251,6 @@ xfs_attr3_leaf_verify( return fa; /* - * In recovery there is a transient state where count == 0 is valid - * because we may have transitioned an empty shortform attr to a leaf - * if the attr didn't fit in shortform. - */ - if (!xfs_log_in_recovery(mp) && ichdr.count == 0) - return __this_address; - - /* * firstused is the block offset of the first name info structure. * Make sure it doesn't go off the block or crash into the header. */ @@ -443,7 +435,7 @@ xfs_attr_copy_value( *========================================================================*/ /* - * Query whether the requested number of additional bytes of extended + * Query whether the total requested number of attr fork bytes of extended * attribute space will be able to fit inline. * * Returns zero if not, else the di_forkoff fork offset to be used in the @@ -463,8 +455,14 @@ xfs_attr_shortform_bytesfit( int maxforkoff; int offset; + /* + * Check if the new size could fit at all first: + */ + if (bytes > XFS_LITINO(mp)) + return 0; + /* rounded down */ - offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3; + offset = (XFS_LITINO(mp) - bytes) >> 3; if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) { minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; @@ -531,8 +529,7 @@ xfs_attr_shortform_bytesfit( minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ - maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ if (offset >= maxforkoff) @@ -2287,8 +2284,10 @@ xfs_attr3_leaf_lookup_int( leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); - if (ichdr.count >= args->geo->blksize / 8) + if (ichdr.count >= args->geo->blksize / 8) { + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; + } /* * Binary search. (note: small blocks will skip this loop) @@ -2304,10 +2303,14 @@ xfs_attr3_leaf_lookup_int( else break; } - if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) + if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) { + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; - if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) + } + if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) { + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; + } /* * Since we may have duplicate hashval's, find the first matching @@ -2339,8 +2342,8 @@ xfs_attr3_leaf_lookup_int( * If we are looking for INCOMPLETE entries, show only those. * If we are looking for complete entries, show only those. */ - if ((args->flags & XFS_ATTR_INCOMPLETE) != - (entry->flags & XFS_ATTR_INCOMPLETE)) { + if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) != + !!(entry->flags & XFS_ATTR_INCOMPLETE)) { continue; } if (entry->flags & XFS_ATTR_LOCAL) { diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 7b74e18becff..38c05d6ae2aa 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -17,13 +17,27 @@ struct xfs_inode; struct xfs_trans; /* - * Used to keep a list of "remote value" extents when unlinking an inode. + * Incore version of the attribute leaf header. */ -typedef struct xfs_attr_inactive_list { - xfs_dablk_t valueblk; /* block number of value bytes */ - int valuelen; /* number of bytes in value */ -} xfs_attr_inactive_list_t; - +struct xfs_attr3_icleaf_hdr { + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t usedbytes; + /* + * Firstused is 32-bit here instead of 16-bit like the on-disk variant + * to support maximum fsb size of 64k without overflow issues throughout + * the attr code. Instead, the overflow condition is handled on + * conversion to/from disk. + */ + uint32_t firstused; + __u8 holes; + struct { + uint16_t base; + uint16_t size; + } freemap[XFS_ATTR_LEAF_MAPSIZE]; +}; /*======================================================================== * Function prototypes for the kernel. diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 3e39b7d40f25..de9096b8a47c 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -25,6 +25,23 @@ #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ /* + * Remote Attribute Values + * ======================= + * + * Remote extended attribute values are conceptually simple -- they're written + * to data blocks mapped by an inode's attribute fork, and they have an upper + * size limit of 64k. Setting a value does not involve the XFS log. + * + * However, on a v5 filesystem, maximally sized remote attr values require one + * block more than 64k worth of space to hold both the remote attribute value + * header (64 bytes). On a 4k block filesystem this results in a 68k buffer; + * on a 64k block filesystem, this would be a 128k buffer. Note that the log + * format can only handle a dirty buffer of XFS_MAX_BLOCKSIZE length (64k). + * Therefore, we /must/ ensure that remote attribute value buffers never touch + * the logging system and therefore never have a log item. + */ + +/* * Each contiguous block has a header, so it is not just a simple attribute * length to FSB conversion. */ @@ -400,17 +417,25 @@ xfs_attr_rmtval_get( (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_trans_read_buf(mp, args->trans, - mp->m_ddev_targp, - dblkno, dblkcnt, 0, &bp, - &xfs_attr3_rmt_buf_ops); - if (error) + bp = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0, + &xfs_attr3_rmt_buf_ops); + if (!bp) + return -ENOMEM; + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; return error; + } error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, &offset, &valuelen, &dst); - xfs_trans_brelse(args->trans, bp); + xfs_buf_relse(bp); if (error) return error; @@ -551,6 +576,32 @@ xfs_attr_rmtval_set( return 0; } +/* Mark stale any incore buffers for the remote value. */ +int +xfs_attr_rmtval_stale( + struct xfs_inode *ip, + struct xfs_bmbt_irec *map, + xfs_buf_flags_t incore_flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_buf *bp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + ASSERT((map->br_startblock != DELAYSTARTBLOCK) && + (map->br_startblock != HOLESTARTBLOCK)); + + bp = xfs_buf_incore(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map->br_startblock), + XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags); + if (bp) { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + } + + return 0; +} + /* * Remove the value associated with an attribute by deleting the * out-of-line buffer that it is stored on. @@ -559,7 +610,6 @@ int xfs_attr_rmtval_remove( struct xfs_da_args *args) { - struct xfs_mount *mp = args->dp->i_mount; xfs_dablk_t lblkno; int blkcnt; int error; @@ -574,9 +624,6 @@ xfs_attr_rmtval_remove( blkcnt = args->rmtblkcnt; while (blkcnt > 0) { struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_daddr_t dblkno; - int dblkcnt; int nmap; /* @@ -588,21 +635,9 @@ xfs_attr_rmtval_remove( if (error) return error; ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - /* - * If the "remote" value is in the cache, remove it. - */ - bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); - if (bp) { - xfs_buf_stale(bp); - xfs_buf_relse(bp); - bp = NULL; - } + error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK); + if (error) + return error; lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 9d20b66ad379..6fb4572845ce 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -11,5 +11,7 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_remove(struct xfs_da_args *args); +int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, + xfs_buf_flags_t incore_flags); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c114d24be619..1e0fab62cd7d 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -192,14 +192,12 @@ xfs_default_attroffset( struct xfs_mount *mp = ip->i_mount; uint offset; - if (mp->m_sb.sb_inodesize == 256) { - offset = XFS_LITINO(mp, ip->i_d.di_version) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); - } else { + if (mp->m_sb.sb_inodesize == 256) + offset = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + else offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); - } - ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version)); + ASSERT(offset < XFS_LITINO(mp)); return offset; } @@ -729,6 +727,7 @@ xfs_bmap_extents_to_btree( xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); abp = xfs_btree_get_bufl(mp, tp, args.fsbno); if (!abp) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); error = -EFSCORRUPTED; goto out_unreserve_dquot; } @@ -1084,6 +1083,7 @@ xfs_bmap_add_attrfork( if (XFS_IFORK_Q(ip)) goto trans_cancel; if (ip->i_d.di_anextents != 0) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); error = -EFSCORRUPTED; goto trans_cancel; } @@ -1374,7 +1374,8 @@ xfs_bmap_last_before( case XFS_DINODE_FMT_EXTENTS: break; default: - return -EIO; + ASSERT(0); + return -EFSCORRUPTED; } if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -1474,8 +1475,10 @@ xfs_bmap_last_offset( return 0; if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - return -EIO; + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) { + ASSERT(0); + return -EFSCORRUPTED; + } error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); if (error || is_empty) @@ -5871,8 +5874,9 @@ xfs_bmap_insert_extents( XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock), del_cursor); - if (stop_fsb >= got.br_startoff + got.br_blockcount) { - error = -EIO; + if (stop_fsb > got.br_startoff) { + ASSERT(0); + error = -EFSCORRUPTED; goto del_cursor; } @@ -5919,8 +5923,8 @@ del_cursor: * @split_fsb is a block where the extents is split. If split_fsb lies in a * hole or the first block of extents, just return 0. */ -STATIC int -xfs_bmap_split_extent_at( +int +xfs_bmap_split_extent( struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_fsb) @@ -6031,34 +6035,6 @@ del_cursor: return error; } -int -xfs_bmap_split_extent( - struct xfs_inode *ip, - xfs_fileoff_t split_fsb) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - error = xfs_bmap_split_extent_at(tp, ip, split_fsb); - if (error) - goto out; - - return xfs_trans_commit(tp); - -out: - xfs_trans_cancel(tp); - return error; -} - /* Deferred mapping is only for real extents in the data fork. */ static bool xfs_bmap_is_update_needed( diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 093716a074fb..a51c2b13a51a 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -158,17 +158,22 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) { BMAP_ATTRFORK, "ATTR" }, \ { BMAP_COWFORK, "COW" } +/* Return true if the extent is an allocated extent, written or not. */ +static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) +{ + return irec->br_startblock != HOLESTARTBLOCK && + irec->br_startblock != DELAYSTARTBLOCK && + !isnullstartblock(irec->br_startblock); +} /* * Return true if the extent is a real, allocated extent, or false if it is a * delayed allocation, and unwritten extent or a hole. */ -static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) +static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec) { - return irec->br_state != XFS_EXT_UNWRITTEN && - irec->br_startblock != HOLESTARTBLOCK && - irec->br_startblock != DELAYSTARTBLOCK && - !isnullstartblock(irec->br_startblock); + return xfs_bmap_is_real_extent(irec) && + irec->br_state != XFS_EXT_UNWRITTEN; } /* @@ -222,7 +227,8 @@ int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off, int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fileoff_t stop_fsb); -int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); +int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t split_offset); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 71de937f9e64..121251651fea 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -354,20 +354,17 @@ xfs_btree_free_block( */ void xfs_btree_del_cursor( - xfs_btree_cur_t *cur, /* btree cursor */ - int error) /* del because of error */ + struct xfs_btree_cur *cur, /* btree cursor */ + int error) /* del because of error */ { - int i; /* btree level */ + int i; /* btree level */ /* - * Clear the buffer pointers, and release the buffers. - * If we're doing this in the face of an error, we - * need to make sure to inspect all of the entries - * in the bc_bufs array for buffers to be unlocked. - * This is because some of the btree code works from - * level n down to 0, and if we get an error along - * the way we won't have initialized all the entries - * down to 0. + * Clear the buffer pointers and release the buffers. If we're doing + * this because of an error, inspect all of the entries in the bc_bufs + * array for buffers to be unlocked. This is because some of the btree + * code works from level n down to 0, and if we get an error along the + * way we won't have initialized all the entries down to 0. */ for (i = 0; i < cur->bc_nlevels; i++) { if (cur->bc_bufs[i]) @@ -375,15 +372,10 @@ xfs_btree_del_cursor( else if (!error) break; } - /* - * Can't free a bmap cursor without having dealt with the - * allocated indirect blocks' accounting. - */ + ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || - cur->bc_private.b.allocated == 0); - /* - * Free the cursor. - */ + cur->bc_private.b.allocated == 0 || + XFS_FORCED_SHUTDOWN(cur->bc_mp)); kmem_zone_free(xfs_btree_cur_zone, cur); } @@ -1820,6 +1812,7 @@ xfs_btree_lookup_get_block( out_bad: *blkp = NULL; + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(cur->bc_tp, bp); return -EFSCORRUPTED; } @@ -1867,8 +1860,10 @@ xfs_btree_lookup( XFS_BTREE_STATS_INC(cur, lookup); /* No such thing as a zero-level tree. */ - if (cur->bc_nlevels == 0) + if (cur->bc_nlevels == 0) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, cur->bc_mp); return -EFSCORRUPTED; + } block = NULL; keyno = 0; diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 4fd1223c1bd5..12ef16c157dc 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -504,6 +504,7 @@ xfs_da3_split( node = oldblk->bp->b_addr; if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) { + xfs_buf_mark_corrupt(oldblk->bp); error = -EFSCORRUPTED; goto out; } @@ -516,6 +517,7 @@ xfs_da3_split( node = oldblk->bp->b_addr; if (node->hdr.info.back) { if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) { + xfs_buf_mark_corrupt(oldblk->bp); error = -EFSCORRUPTED; goto out; } @@ -1541,8 +1543,10 @@ xfs_da3_node_lookup_int( break; } - if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) + if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) { + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; + } blk->magic = XFS_DA_NODE_MAGIC; @@ -1554,15 +1558,18 @@ xfs_da3_node_lookup_int( btree = dp->d_ops->node_tree_p(node); /* Tree taller than we can handle; bail out! */ - if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) + if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) { + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; + } /* Check the level from the root. */ if (blkno == args->geo->leafblk) expected_level = nodehdr.level - 1; - else if (expected_level != nodehdr.level) + else if (expected_level != nodehdr.level) { + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; - else + } else expected_level--; max = nodehdr.count; @@ -1612,12 +1619,17 @@ xfs_da3_node_lookup_int( } /* We can't point back to the root. */ - if (blkno == args->geo->leafblk) + if (blkno == args->geo->leafblk) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, + dp->i_mount); return -EFSCORRUPTED; + } } - if (expected_level != 0) + if (expected_level != 0) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, dp->i_mount); return -EFSCORRUPTED; + } /* * A leaf block that ends in the hashval that we are interested in diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ae0bbd20d9ca..588e4674e931 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -82,6 +82,7 @@ typedef struct xfs_da_args { #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ #define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ +#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -89,7 +90,8 @@ typedef struct xfs_da_args { { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" } + { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \ + { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" } /* * Storage for holding state during Btree searches and split/join ops. @@ -125,6 +127,19 @@ typedef struct xfs_da_state { } xfs_da_state_t; /* + * In-core version of the node header to abstract the differences in the v2 and + * v3 disk format of the headers. Callers need to convert to/from disk format as + * appropriate. + */ +struct xfs_da3_icnode_hdr { + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t level; +}; + +/* * Utility macros to aid in logging changed structure fields. */ #define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE)) diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index b1ae572496b6..31bb250c1899 100644 --- a/fs/xfs/libxfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c @@ -13,6 +13,7 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_dir2.h" +#include "xfs_dir2_priv.h" /* * Shortform directory ops diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index ae654e06b2fb..222ee48da5e8 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -94,19 +94,6 @@ struct xfs_da3_intnode { }; /* - * In-core version of the node header to abstract the differences in the v2 and - * v3 disk format of the headers. Callers need to convert to/from disk format as - * appropriate. - */ -struct xfs_da3_icnode_hdr { - uint32_t forw; - uint32_t back; - uint16_t magic; - uint16_t count; - uint16_t level; -}; - -/* * Directory version 2. * * There are 4 possible formats: @@ -434,14 +421,6 @@ struct xfs_dir3_leaf_hdr { __be32 pad; /* 64 bit alignment */ }; -struct xfs_dir3_icleaf_hdr { - uint32_t forw; - uint32_t back; - uint16_t magic; - uint16_t count; - uint16_t stale; -}; - /* * Leaf block entry. */ @@ -521,19 +500,6 @@ struct xfs_dir3_free { #define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc) /* - * In core version of the free block header, abstracted away from on-disk format - * differences. Use this in the code, and convert to/from the disk version using - * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk. - */ -struct xfs_dir3_icfree_hdr { - uint32_t magic; - uint32_t firstdb; - uint32_t nvalid; - uint32_t nused; - -}; - -/* * Single block format. * * The single block format looks like the following drawing on disk: @@ -710,29 +676,6 @@ struct xfs_attr3_leafblock { }; /* - * incore, neutral version of the attribute leaf header - */ -struct xfs_attr3_icleaf_hdr { - uint32_t forw; - uint32_t back; - uint16_t magic; - uint16_t count; - uint16_t usedbytes; - /* - * firstused is 32-bit here instead of 16-bit like the on-disk variant - * to support maximum fsb size of 64k without overflow issues throughout - * the attr code. Instead, the overflow condition is handled on - * conversion to/from disk. - */ - uint32_t firstused; - __u8 holes; - struct { - uint16_t base; - uint16_t size; - } freemap[XFS_ATTR_LEAF_MAPSIZE]; -}; - -/* * Special value to represent fs block size in the leaf header firstused field. * Only used when block size overflows the 2-bytes available on disk. */ @@ -740,8 +683,6 @@ struct xfs_attr3_icleaf_hdr { /* * Flags used in the leaf_entry[i].flags field. - * NOTE: the INCOMPLETE bit must not collide with the flags bits specified - * on the system call, they are "or"ed together for various operations. */ #define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 22557527cfdb..3a78a189ea01 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -16,6 +16,8 @@ #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_log.h" /* * Deferred Operations in XFS @@ -178,6 +180,19 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, }; +static void +xfs_defer_create_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp, + bool sort) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + + if (!dfp->dfp_intent) + dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, + dfp->dfp_count, sort); +} + /* * For each pending item in the intake list, log its intent item and the * associated extents, then add the entire intake list to the end of @@ -187,17 +202,11 @@ STATIC void xfs_defer_create_intents( struct xfs_trans *tp) { - struct list_head *li; struct xfs_defer_pending *dfp; - const struct xfs_defer_op_type *ops; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count); trace_xfs_defer_create_intent(tp->t_mountp, dfp); - list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items); - list_for_each(li, &dfp->dfp_work) - ops->log_item(tp, dfp->dfp_intent, li); + xfs_defer_create_intent(tp, dfp, true); } } @@ -234,10 +243,13 @@ xfs_defer_trans_roll( struct xfs_log_item *lip; struct xfs_buf *bplist[XFS_DEFER_OPS_NR_BUFS]; struct xfs_inode *iplist[XFS_DEFER_OPS_NR_INODES]; + unsigned int ordered = 0; /* bitmap */ int bpcount = 0, ipcount = 0; int i; int error; + BUILD_BUG_ON(NBBY * sizeof(ordered) < XFS_DEFER_OPS_NR_BUFS); + list_for_each_entry(lip, &tp->t_items, li_trans) { switch (lip->li_type) { case XFS_LI_BUF: @@ -248,7 +260,10 @@ xfs_defer_trans_roll( ASSERT(0); return -EFSCORRUPTED; } - xfs_trans_dirty_buf(tp, bli->bli_buf); + if (bli->bli_flags & XFS_BLI_ORDERED) + ordered |= (1U << bpcount); + else + xfs_trans_dirty_buf(tp, bli->bli_buf); bplist[bpcount++] = bli->bli_buf; } break; @@ -289,6 +304,8 @@ xfs_defer_trans_roll( /* Rejoin the buffers and dirty them so the log moves forward. */ for (i = 0; i < bpcount; i++) { xfs_trans_bjoin(tp, bplist[i]); + if (ordered & (1U << i)) + xfs_trans_ordered_buf(tp, bplist[i]); xfs_trans_bhold(tp, bplist[i]); } @@ -346,6 +363,106 @@ xfs_defer_cancel_list( } /* + * Prevent a log intent item from pinning the tail of the log by logging a + * done item to release the intent item; and then log a new intent item. + * The caller should provide a fresh transaction and roll it after we're done. + */ +static int +xfs_defer_relog( + struct xfs_trans **tpp, + struct list_head *dfops) +{ + struct xlog *log = (*tpp)->t_mountp->m_log; + struct xfs_defer_pending *dfp; + xfs_lsn_t threshold_lsn = NULLCOMMITLSN; + + + ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); + + list_for_each_entry(dfp, dfops, dfp_list) { + /* + * If the log intent item for this deferred op is not a part of + * the current log checkpoint, relog the intent item to keep + * the log tail moving forward. We're ok with this being racy + * because an incorrect decision means we'll be a little slower + * at pushing the tail. + */ + if (dfp->dfp_intent == NULL || + xfs_log_item_in_current_chkpt(dfp->dfp_intent)) + continue; + + /* + * Figure out where we need the tail to be in order to maintain + * the minimum required free space in the log. Only sample + * the log threshold once per call. + */ + if (threshold_lsn == NULLCOMMITLSN) { + threshold_lsn = xlog_grant_push_threshold(log, 0); + if (threshold_lsn == NULLCOMMITLSN) + break; + } + if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0) + continue; + + trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); + XFS_STATS_INC((*tpp)->t_mountp, defer_relog); + dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp); + } + + if ((*tpp)->t_flags & XFS_TRANS_DIRTY) + return xfs_defer_trans_roll(tpp); + return 0; +} + +/* + * Log an intent-done item for the first pending intent, and finish the work + * items. + */ +static int +xfs_defer_finish_one( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + void *state = NULL; + struct list_head *li, *n; + int error; + + trace_xfs_defer_pending_finish(tp->t_mountp, dfp); + + dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + list_for_each_safe(li, n, &dfp->dfp_work) { + list_del(li); + dfp->dfp_count--; + error = ops->finish_item(tp, li, dfp->dfp_done, &state); + if (error == -EAGAIN) { + /* + * Caller wants a fresh transaction; put the work item + * back on the list and log a new log intent item to + * replace the old one. See "Requesting a Fresh + * Transaction while Finishing Deferred Work" above. + */ + list_add(li, &dfp->dfp_work); + dfp->dfp_count++; + dfp->dfp_done = NULL; + dfp->dfp_intent = NULL; + xfs_defer_create_intent(tp, dfp, false); + } + + if (error) + goto out; + } + + /* Done with the dfp, free it. */ + list_del(&dfp->dfp_list); + kmem_free(dfp); +out: + if (ops->finish_cleanup) + ops->finish_cleanup(tp, state, error); + return error; +} + +/* * Finish all the pending work. This involves logging intent items for * any work items that wandered in since the last transaction roll (if * one has even happened), rolling the transaction, and finishing the @@ -358,11 +475,7 @@ xfs_defer_finish_noroll( struct xfs_trans **tp) { struct xfs_defer_pending *dfp; - struct list_head *li; - struct list_head *n; - void *state; int error = 0; - const struct xfs_defer_op_type *ops; LIST_HEAD(dop_pending); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -371,87 +484,44 @@ xfs_defer_finish_noroll( /* Until we run out of pending work to finish... */ while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { - /* log intents and pull in intake items */ - xfs_defer_create_intents(*tp); - list_splice_tail_init(&(*tp)->t_dfops, &dop_pending); - /* - * Roll the transaction. + * Deferred items that are created in the process of finishing + * other deferred work items should be queued at the head of + * the pending list, which puts them ahead of the deferred work + * that was created by the caller. This keeps the number of + * pending work items to a minimum, which decreases the amount + * of time that any one intent item can stick around in memory, + * pinning the log tail. */ + xfs_defer_create_intents(*tp); + list_splice_init(&(*tp)->t_dfops, &dop_pending); + error = xfs_defer_trans_roll(tp); if (error) - goto out; + goto out_shutdown; + + /* Possibly relog intent items to keep the log moving. */ + error = xfs_defer_relog(tp, &dop_pending); + if (error) + goto out_shutdown; - /* Log an intent-done item for the first pending item. */ dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, dfp_list); - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp); - dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent, - dfp->dfp_count); - - /* Finish the work items. */ - state = NULL; - list_for_each_safe(li, n, &dfp->dfp_work) { - list_del(li); - dfp->dfp_count--; - error = ops->finish_item(*tp, li, dfp->dfp_done, - &state); - if (error == -EAGAIN) { - /* - * Caller wants a fresh transaction; - * put the work item back on the list - * and jump out. - */ - list_add(li, &dfp->dfp_work); - dfp->dfp_count++; - break; - } else if (error) { - /* - * Clean up after ourselves and jump out. - * xfs_defer_cancel will take care of freeing - * all these lists and stuff. - */ - if (ops->finish_cleanup) - ops->finish_cleanup(*tp, state, error); - goto out; - } - } - if (error == -EAGAIN) { - /* - * Caller wants a fresh transaction, so log a - * new log intent item to replace the old one - * and roll the transaction. See "Requesting - * a Fresh Transaction while Finishing - * Deferred Work" above. - */ - dfp->dfp_intent = ops->create_intent(*tp, - dfp->dfp_count); - dfp->dfp_done = NULL; - list_for_each(li, &dfp->dfp_work) - ops->log_item(*tp, dfp->dfp_intent, li); - } else { - /* Done with the dfp, free it. */ - list_del(&dfp->dfp_list); - kmem_free(dfp); - } - - if (ops->finish_cleanup) - ops->finish_cleanup(*tp, state, error); - } - -out: - if (error) { - xfs_defer_trans_abort(*tp, &dop_pending); - xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); - trace_xfs_defer_finish_error(*tp, error); - xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); - xfs_defer_cancel(*tp); - return error; + error = xfs_defer_finish_one(*tp, dfp); + if (error && error != -EAGAIN) + goto out_shutdown; } trace_xfs_defer_finish_done(*tp, _RET_IP_); return 0; + +out_shutdown: + xfs_defer_trans_abort(*tp, &dop_pending); + xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); + trace_xfs_defer_finish_error(*tp, error); + xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); + xfs_defer_cancel(*tp); + return error; } int @@ -552,3 +622,137 @@ xfs_defer_move( xfs_defer_reset(stp); } + +/* + * Prepare a chain of fresh deferred ops work items to be completed later. Log + * recovery requires the ability to put off until later the actual finishing + * work so that it can process unfinished items recovered from the log in + * correct order. + * + * Create and log intent items for all the work that we're capturing so that we + * can be assured that the items will get replayed if the system goes down + * before log recovery gets a chance to finish the work it put off. The entire + * deferred ops state is transferred to the capture structure and the + * transaction is then ready for the caller to commit it. If there are no + * intent items to capture, this function returns NULL. + * + * If capture_ip is not NULL, the capture structure will obtain an extra + * reference to the inode. + */ +static struct xfs_defer_capture * +xfs_defer_ops_capture( + struct xfs_trans *tp, + struct xfs_inode *capture_ip) +{ + struct xfs_defer_capture *dfc; + + if (list_empty(&tp->t_dfops)) + return NULL; + + /* Create an object to capture the defer ops. */ + dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); + INIT_LIST_HEAD(&dfc->dfc_list); + INIT_LIST_HEAD(&dfc->dfc_dfops); + + xfs_defer_create_intents(tp); + + /* Move the dfops chain and transaction state to the capture struct. */ + list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); + dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; + tp->t_flags &= ~XFS_TRANS_LOWMODE; + + /* Capture the remaining block reservations along with the dfops. */ + dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; + dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used; + + /* Preserve the log reservation size. */ + dfc->dfc_logres = tp->t_log_res; + + /* + * Grab an extra reference to this inode and attach it to the capture + * structure. + */ + if (capture_ip) { + ihold(VFS_I(capture_ip)); + dfc->dfc_capture_ip = capture_ip; + } + + return dfc; +} + +/* Release all resources that we used to capture deferred ops. */ +void +xfs_defer_ops_release( + struct xfs_mount *mp, + struct xfs_defer_capture *dfc) +{ + xfs_defer_cancel_list(mp, &dfc->dfc_dfops); + if (dfc->dfc_capture_ip) + xfs_irele(dfc->dfc_capture_ip); + kmem_free(dfc); +} + +/* + * Capture any deferred ops and commit the transaction. This is the last step + * needed to finish a log intent item that we recovered from the log. If any + * of the deferred ops operate on an inode, the caller must pass in that inode + * so that the reference can be transferred to the capture structure. The + * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling + * xfs_defer_ops_continue. + */ +int +xfs_defer_ops_capture_and_commit( + struct xfs_trans *tp, + struct xfs_inode *capture_ip, + struct list_head *capture_list) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_defer_capture *dfc; + int error; + + ASSERT(!capture_ip || xfs_isilocked(capture_ip, XFS_ILOCK_EXCL)); + + /* If we don't capture anything, commit transaction and exit. */ + dfc = xfs_defer_ops_capture(tp, capture_ip); + if (!dfc) + return xfs_trans_commit(tp); + + /* Commit the transaction and add the capture structure to the list. */ + error = xfs_trans_commit(tp); + if (error) { + xfs_defer_ops_release(mp, dfc); + return error; + } + + list_add_tail(&dfc->dfc_list, capture_list); + return 0; +} + +/* + * Attach a chain of captured deferred ops to a new transaction and free the + * capture structure. If an inode was captured, it will be passed back to the + * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0. + * The caller now owns the inode reference. + */ +void +xfs_defer_ops_continue( + struct xfs_defer_capture *dfc, + struct xfs_trans *tp, + struct xfs_inode **captured_ipp) +{ + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); + + /* Lock and join the captured inode to the new transaction. */ + if (dfc->dfc_capture_ip) { + xfs_ilock(dfc->dfc_capture_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dfc->dfc_capture_ip, 0); + } + *captured_ipp = dfc->dfc_capture_ip; + + /* Move captured dfops chain and state to the transaction. */ + list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); + tp->t_flags |= dfc->dfc_tpflags; + + kmem_free(dfc); +} diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 7c28d7608ac6..4c3248d47a35 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -7,6 +7,7 @@ #define __XFS_DEFER_H__ struct xfs_defer_op_type; +struct xfs_defer_capture; /* * Header for deferred operation list. @@ -28,7 +29,7 @@ enum xfs_defer_ops_type { struct xfs_defer_pending { struct list_head dfp_list; /* pending items */ struct list_head dfp_work; /* work items */ - void *dfp_intent; /* log intent item */ + struct xfs_log_item *dfp_intent; /* log intent item */ void *dfp_done; /* log done item */ unsigned int dfp_count; /* # extent items */ enum xfs_defer_ops_type dfp_type; @@ -43,15 +44,15 @@ void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); /* Description of a deferred type. */ struct xfs_defer_op_type { - void (*abort_intent)(void *); - void *(*create_done)(struct xfs_trans *, void *, unsigned int); + struct xfs_log_item *(*create_intent)(struct xfs_trans *tp, + struct list_head *items, unsigned int count, bool sort); + void (*abort_intent)(struct xfs_log_item *intent); + void *(*create_done)(struct xfs_trans *tp, struct xfs_log_item *intent, + unsigned int count); int (*finish_item)(struct xfs_trans *, struct list_head *, void *, void **); void (*finish_cleanup)(struct xfs_trans *, void *, int); void (*cancel_item)(struct list_head *); - int (*diff_items)(void *, struct list_head *, struct list_head *); - void *(*create_intent)(struct xfs_trans *, uint); - void (*log_item)(struct xfs_trans *, void *, struct list_head *); unsigned int max_items; }; @@ -61,4 +62,40 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; +/* + * This structure enables a dfops user to detach the chain of deferred + * operations from a transaction so that they can be continued later. + */ +struct xfs_defer_capture { + /* List of other capture structures. */ + struct list_head dfc_list; + + /* Deferred ops state saved from the transaction. */ + struct list_head dfc_dfops; + unsigned int dfc_tpflags; + + /* Block reservations for the data and rt devices. */ + unsigned int dfc_blkres; + unsigned int dfc_rtxres; + + /* Log reservation saved from the transaction. */ + unsigned int dfc_logres; + + /* + * An inode reference that must be maintained to complete the deferred + * work. + */ + struct xfs_inode *dfc_capture_ip; +}; + +/* + * Functions to capture a chain of deferred operations and continue them later. + * This doesn't normally happen except log recovery. + */ +int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, + struct xfs_inode *capture_ip, struct list_head *capture_list); +void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp, + struct xfs_inode **captured_ipp); +void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d); + #endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 867c5dee0751..452d04ae10ce 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -600,8 +600,10 @@ xfs_dir2_isblock( if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) return rval; rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; - if (rval != 0 && args->dp->i_d.di_size != args->geo->blksize) + if (rval != 0 && args->dp->i_d.di_size != args->geo->blksize) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, args->dp->i_mount); return -EFSCORRUPTED; + } *vp = rval; return 0; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index f54244779492..e170792c0acc 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -18,6 +18,8 @@ struct xfs_dir2_sf_entry; struct xfs_dir2_data_hdr; struct xfs_dir2_data_entry; struct xfs_dir2_data_unused; +struct xfs_dir3_icfree_hdr; +struct xfs_dir3_icleaf_hdr; extern struct xfs_name xfs_name_dotdot; diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 49e4bc39e7bb..d034d661957c 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -114,6 +114,23 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .verify_struct = xfs_dir3_block_verify, }; +static xfs_failaddr_t +xfs_dir3_block_header_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (be64_to_cpu(hdr3->owner) != dp->i_ino) + return __this_address; + } + + return NULL; +} + int xfs_dir3_block_read( struct xfs_trans *tp, @@ -121,12 +138,24 @@ xfs_dir3_block_read( struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops); - if (!err && tp && *bpp) - xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. */ + fa = xfs_dir3_block_header_check(dp, *bpp); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + return -EFSCORRUPTED; + } + + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); return err; } diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 2c79be4c3153..2d92bcd8c801 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -348,6 +348,22 @@ static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .verify_write = xfs_dir3_data_write_verify, }; +static xfs_failaddr_t +xfs_dir3_data_header_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_data_hdr *hdr3 = bp->b_addr; + + if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + return __this_address; + } + + return NULL; +} int xfs_dir3_data_read( @@ -357,12 +373,24 @@ xfs_dir3_data_read( xfs_daddr_t mapped_bno, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, XFS_DATA_FORK, &xfs_dir3_data_buf_ops); - if (!err && tp && *bpp) - xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. */ + fa = xfs_dir3_data_header_check(dp, *bpp); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + return -EFSCORRUPTED; + } + + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); return err; } diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index a53e4585a2f3..c8ee3250b749 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -1343,8 +1343,10 @@ xfs_dir2_leaf_removename( oldbest = be16_to_cpu(bf[0].length); ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); - if (be16_to_cpu(bestsp[db]) != oldbest) + if (be16_to_cpu(bestsp[db]) != oldbest) { + xfs_buf_mark_corrupt(lbp); return -EFSCORRUPTED; + } /* * Mark the former data entry unused. */ diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 99d5b2ed67f2..c8c3c3af539f 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -208,7 +208,7 @@ __xfs_dir3_free_read( /* Check things that we can't do in the verifier. */ fa = xfs_dir3_free_header_check(dp, fbno, *bpp); if (fa) { - xfs_verifier_error(*bpp, -EFSCORRUPTED, fa); + __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); *bpp = NULL; return -EFSCORRUPTED; @@ -374,8 +374,10 @@ xfs_dir2_leaf_to_node( leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); if (be32_to_cpu(ltp->bestcount) > - (uint)dp->i_d.di_size / args->geo->blksize) + (uint)dp->i_d.di_size / args->geo->blksize) { + xfs_buf_mark_corrupt(lbp); return -EFSCORRUPTED; + } /* * Copy freespace entries from the leaf block to the new block. @@ -446,8 +448,10 @@ xfs_dir2_leafn_add( * Quick check just to make sure we are not going to index * into other peoples memory */ - if (index < 0) + if (index < 0) { + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; + } /* * If there are already the maximum number of leaf entries in @@ -740,8 +744,10 @@ xfs_dir2_leafn_lookup_for_entry( ents = dp->d_ops->leaf_ents_p(leaf); xfs_dir3_leaf_check(dp, bp); - if (leafhdr.count <= 0) + if (leafhdr.count <= 0) { + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; + } /* * Look up the hash value in the leaf entries. diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 59f9fb2241a5..d2eaea663e7f 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -8,6 +8,25 @@ struct dir_context; +/* + * In-core version of the leaf and free block headers to abstract the + * differences in the v2 and v3 disk format of the headers. + */ +struct xfs_dir3_icleaf_hdr { + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t stale; +}; + +struct xfs_dir3_icfree_hdr { + uint32_t magic; + uint32_t firstdb; + uint32_t nvalid; + uint32_t nused; +}; + /* xfs_dir2.c */ extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, xfs_dir2_db_t *dbp); diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index ae16ca7c422a..f980c3f3d2f6 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -945,6 +945,27 @@ xfs_dir2_sf_removename( } /* + * Check whether the sf dir replace operation need more blocks. + */ +static bool +xfs_dir2_sf_replace_needblock( + struct xfs_inode *dp, + xfs_ino_t inum) +{ + int newsize; + struct xfs_dir2_sf_hdr *sfp; + + if (dp->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return false; + + sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; + newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; + + return inum > XFS_DIR2_MAX_SHORT_INUM && + sfp->i8count == 0 && newsize > XFS_IFORK_DSIZE(dp); +} + +/* * Replace the inode number of an entry in a shortform directory. */ int /* error */ @@ -980,17 +1001,14 @@ xfs_dir2_sf_replace( */ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { int error; /* error return value */ - int newsize; /* new inode size */ - newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; /* * Won't fit as shortform, convert to block then do replace. */ - if (newsize > XFS_IFORK_DSIZE(dp)) { + if (xfs_dir2_sf_replace_needblock(dp, args->inumber)) { error = xfs_dir2_sf_to_block(args); - if (error) { + if (error) return error; - } return xfs_dir2_block_replace(args); } /* diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index e8bd688a4073..bedc1e752b60 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -35,10 +35,10 @@ xfs_calc_dquots_per_chunk( xfs_failaddr_t xfs_dquot_verify( - struct xfs_mount *mp, - xfs_disk_dquot_t *ddq, - xfs_dqid_t id, - uint type) /* used only during quotacheck */ + struct xfs_mount *mp, + struct xfs_disk_dquot *ddq, + xfs_dqid_t id, + uint type) /* used only during quotacheck */ { /* * We can encounter an uninitialized dquot buffer for 2 reasons: diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index c968b60cee15..31fa9ab2ab61 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -497,6 +497,23 @@ static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp) return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } +/* + * v5 file systems support V3 inodes only, earlier file systems support + * v2 and v1 inodes. + */ +static inline bool xfs_sb_version_has_v3inode(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline bool xfs_dinode_good_version(struct xfs_sb *sbp, + uint8_t version) +{ + if (xfs_sb_version_has_v3inode(sbp)) + return version == 3; + return version == 1 || version == 2; +} + static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; @@ -946,8 +963,12 @@ typedef enum xfs_dinode_fmt { /* * Inode size for given fs. */ -#define XFS_LITINO(mp, version) \ - ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) +#define XFS_DINODE_SIZE(sbp) \ + (xfs_sb_version_has_v3inode(sbp) ? \ + sizeof(struct xfs_dinode) : \ + offsetof(struct xfs_dinode, di_crc)) +#define XFS_LITINO(mp) \ + ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(&(mp)->m_sb)) /* * Inode data & attribute fork sizes, per inode. @@ -956,13 +977,9 @@ typedef enum xfs_dinode_fmt { #define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) #define XFS_DFORK_DSIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_DFORK_BOFF(dip) : \ - XFS_LITINO(mp, (dip)->di_version)) + (XFS_DFORK_Q(dip) ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp)) #define XFS_DFORK_ASIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ - 0) + (XFS_DFORK_Q(dip) ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0) #define XFS_DFORK_SIZE(dip,mp,w) \ ((w) == XFS_DATA_FORK ? \ XFS_DFORK_DSIZE(dip, mp) : \ @@ -1144,11 +1161,11 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) /* * This is the main portion of the on-disk representation of quota - * information for a user. This is the q_core of the xfs_dquot_t that + * information for a user. This is the q_core of the struct xfs_dquot that * is kept in kernel memory. We pad this with some more expansion room * to construct the on disk structure. */ -typedef struct xfs_disk_dquot { +struct xfs_disk_dquot { __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ __u8 d_version; /* dquot version */ __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ @@ -1171,15 +1188,15 @@ typedef struct xfs_disk_dquot { __be32 d_rtbtimer; /* similar to above; for RT disk blocks */ __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */ __be16 d_pad; -} xfs_disk_dquot_t; +}; /* * This is what goes on disk. This is separated from the xfs_disk_dquot because * carrying the unnecessary padding would be a waste of memory. */ typedef struct xfs_dqblk { - xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ - char dd_fill[4]; /* filling for posterity */ + struct xfs_disk_dquot dd_diskdq; /* portion living incore as well */ + char dd_fill[4];/* filling for posterity */ /* * These two are only present on filesystems with the CRC bits set. @@ -1540,6 +1557,13 @@ typedef struct xfs_bmdr_block { #define BMBT_BLOCKCOUNT_BITLEN 21 #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) +#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) + +/* + * bmbt records have a file offset (block) field that is 54 bits wide, so this + * is the largest xfs_fileoff_t that we ever expect to see. + */ +#define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK) typedef struct xfs_bmbt_rec { __be64 l0, l1; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 443cf33f6666..391e441d43a0 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -303,7 +303,7 @@ xfs_ialloc_inode_init( * That means for v3 inode we log the entire buffer rather than just the * inode cores. */ - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { version = 3; ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno)); @@ -339,7 +339,7 @@ xfs_ialloc_inode_init( xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; - uint isize = xfs_dinode_size(version); + uint isize = XFS_DINODE_SIZE(&mp->m_sb); free = xfs_make_iptr(mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); @@ -2818,7 +2818,7 @@ xfs_ialloc_setup_geometry( * cannot change the behavior. */ igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { int new_size = igeo->inode_cluster_size_raw; new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; @@ -2854,3 +2854,67 @@ xfs_ialloc_setup_geometry( else igeo->ialloc_align = 0; } + +/* Compute the location of the root directory inode that is laid out by mkfs. */ +xfs_ino_t +xfs_ialloc_calc_rootino( + struct xfs_mount *mp, + int sunit) +{ + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agblock_t first_bno; + + /* + * Pre-calculate the geometry of AG 0. We know what it looks like + * because libxfs knows how to create allocation groups now. + * + * first_bno is the first block in which mkfs could possibly have + * allocated the root directory inode, once we factor in the metadata + * that mkfs formats before it. Namely, the four AG headers... + */ + first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize); + + /* ...the two free space btree roots... */ + first_bno += 2; + + /* ...the inode btree root... */ + first_bno += 1; + + /* ...the initial AGFL... */ + first_bno += xfs_alloc_min_freelist(mp, NULL); + + /* ...the free inode btree root... */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + first_bno++; + + /* ...the reverse mapping btree root... */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + first_bno++; + + /* ...the reference count btree... */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + first_bno++; + + /* + * ...and the log, if it is allocated in the first allocation group. + * + * This can happen with filesystems that only have a single + * allocation group, or very odd geometries created by old mkfs + * versions on very small filesystems. + */ + if (mp->m_sb.sb_logstart && + XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) + first_bno += mp->m_sb.sb_logblocks; + + /* + * Now round first_bno up to whatever allocation alignment is given + * by the filesystem or was passed in. + */ + if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0) + first_bno = roundup(first_bno, sunit); + else if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt > 1) + first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt); + + return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno)); +} diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 323592d563d5..72b3468b97b1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -152,5 +152,6 @@ int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask, int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); void xfs_ialloc_setup_geometry(struct xfs_mount *mp); +xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 28ab3c5255e1..962e95dcdbff 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -44,17 +44,6 @@ xfs_inobp_check( } #endif -bool -xfs_dinode_good_version( - struct xfs_mount *mp, - __u8 version) -{ - if (xfs_sb_version_hascrc(&mp->m_sb)) - return version == 3; - - return version == 1 || version == 2; -} - /* * If we are doing readahead on an inode buffer, we might be in log recovery * reading an inode allocation buffer that hasn't yet been replayed, and hence @@ -93,7 +82,7 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = xfs_verify_magic16(bp, dip->di_magic) && - xfs_dinode_good_version(mp, dip->di_version) && + xfs_dinode_good_version(&mp->m_sb, dip->di_version) && xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { @@ -205,26 +194,23 @@ xfs_inode_from_disk( struct xfs_icdinode *to = &ip->i_d; struct inode *inode = VFS_I(ip); - /* * Convert v1 inodes immediately to v2 inode format as this is the * minimum inode version format we support in the rest of the code. + * They will also be unconditionally written back to disk as v2 inodes. */ - to->di_version = from->di_version; - if (to->di_version == 1) { + if (unlikely(from->di_version == 1)) { set_nlink(inode, be16_to_cpu(from->di_onlink)); - to->di_projid_lo = 0; - to->di_projid_hi = 0; - to->di_version = 2; + to->di_projid = 0; } else { set_nlink(inode, be32_to_cpu(from->di_nlink)); - to->di_projid_lo = be16_to_cpu(from->di_projid_lo); - to->di_projid_hi = be16_to_cpu(from->di_projid_hi); + to->di_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | + be16_to_cpu(from->di_projid_lo); } to->di_format = from->di_format; - to->di_uid = be32_to_cpu(from->di_uid); - to->di_gid = be32_to_cpu(from->di_gid); + i_uid_write(inode, be32_to_cpu(from->di_uid)); + i_gid_write(inode, be32_to_cpu(from->di_gid)); to->di_flushiter = be16_to_cpu(from->di_flushiter); /* @@ -253,7 +239,7 @@ xfs_inode_from_disk( to->di_dmstate = be16_to_cpu(from->di_dmstate); to->di_flags = be16_to_cpu(from->di_flags); - if (to->di_version == 3) { + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { inode_set_iversion_queried(inode, be64_to_cpu(from->di_changecount)); to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); @@ -275,12 +261,11 @@ xfs_inode_to_disk( to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); to->di_onlink = 0; - to->di_version = from->di_version; to->di_format = from->di_format; - to->di_uid = cpu_to_be32(from->di_uid); - to->di_gid = cpu_to_be32(from->di_gid); - to->di_projid_lo = cpu_to_be16(from->di_projid_lo); - to->di_projid_hi = cpu_to_be16(from->di_projid_hi); + to->di_uid = cpu_to_be32(i_uid_read(inode)); + to->di_gid = cpu_to_be32(i_gid_read(inode)); + to->di_projid_lo = cpu_to_be16(from->di_projid & 0xffff); + to->di_projid_hi = cpu_to_be16(from->di_projid >> 16); memset(to->di_pad, 0, sizeof(to->di_pad)); to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec); @@ -304,7 +289,8 @@ xfs_inode_to_disk( to->di_dmstate = cpu_to_be16(from->di_dmstate); to->di_flags = cpu_to_be16(from->di_flags); - if (from->di_version == 3) { + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + to->di_version = 3; to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); @@ -316,6 +302,7 @@ xfs_inode_to_disk( uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_flushiter = 0; } else { + to->di_version = 2; to->di_flushiter = cpu_to_be16(from->di_flushiter); } } @@ -429,7 +416,7 @@ xfs_dinode_verify_forkoff( case XFS_DINODE_FMT_LOCAL: /* fall through ... */ case XFS_DINODE_FMT_EXTENTS: /* fall through ... */ case XFS_DINODE_FMT_BTREE: - if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3)) + if (dip->di_forkoff >= (XFS_LITINO(mp) >> 3)) return __this_address; break; default: @@ -455,7 +442,7 @@ xfs_dinode_verify( /* Verify v3 integrity information first */ if (dip->di_version >= 3) { - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_sb_version_has_v3inode(&mp->m_sb)) return __this_address; if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF)) @@ -630,12 +617,11 @@ xfs_iread( /* shortcut IO on inode allocation if possible */ if ((iget_flags & XFS_IGET_CREATE) && - xfs_sb_version_hascrc(&mp->m_sb) && + xfs_sb_version_has_v3inode(&mp->m_sb) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { /* initialise the on-disk inode core */ memset(&ip->i_d, 0, sizeof(ip->i_d)); VFS_I(ip)->i_generation = prandom_u32(); - ip->i_d.di_version = 3; return 0; } @@ -677,7 +663,6 @@ xfs_iread( * Partial initialisation of the in-core inode. Just the bits * that xfs_ialloc won't overwrite or relies on being correct. */ - ip->i_d.di_version = dip->di_version; VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen); ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); @@ -691,7 +676,6 @@ xfs_iread( VFS_I(ip)->i_mode = 0; } - ASSERT(ip->i_d.di_version >= 2); ip->i_delayed_blks = 0; /* diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index ab0f84165317..80b574579a21 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -16,13 +16,9 @@ struct xfs_dinode; * format specific structures at the appropriate time. */ struct xfs_icdinode { - int8_t di_version; /* inode version */ int8_t di_format; /* format of di_c data */ uint16_t di_flushiter; /* incremented on flush */ - uint32_t di_uid; /* owner's user id */ - uint32_t di_gid; /* owner's group id */ - uint16_t di_projid_lo; /* lower part of owner's project id */ - uint16_t di_projid_hi; /* higher part of owner's project id */ + uint32_t di_projid; /* owner's project id */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ @@ -62,8 +58,6 @@ void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, struct xfs_dinode *to); -bool xfs_dinode_good_version(struct xfs_mount *mp, __u8 version); - #if defined(DEBUG) void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #else diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 8fdd0424070e..e758d74b2b62 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -75,11 +75,15 @@ xfs_iformat_fork( error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); break; default: + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, + dip, sizeof(*dip), __this_address); return -EFSCORRUPTED; } break; default: + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), __this_address); return -EFSCORRUPTED; } if (error) @@ -110,6 +114,8 @@ xfs_iformat_fork( error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); break; default: + xfs_inode_verifier_error(ip, error, __func__, dip, + sizeof(*dip), __this_address); error = -EFSCORRUPTED; break; } @@ -177,7 +183,7 @@ xfs_iformat_local( */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %d).", + "corrupt inode %Lu (bad size %d for local fork, size = %zd).", (unsigned long long) ip->i_ino, size, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); xfs_inode_verifier_error(ip, -EFSCORRUPTED, @@ -586,7 +592,7 @@ void xfs_iflush_fork( xfs_inode_t *ip, xfs_dinode_t *dip, - xfs_inode_log_item_t *iip, + struct xfs_inode_log_item *iip, int whichfork) { char *cp; diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7b845c052fb4..a84a1557d11c 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -46,14 +46,9 @@ struct xfs_ifork { (ip)->i_afp : \ (ip)->i_cowfp)) #define XFS_IFORK_DSIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_IFORK_BOFF(ip) : \ - XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version)) + (XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount)) #define XFS_IFORK_ASIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \ - XFS_IFORK_BOFF(ip) : \ - 0) + (XFS_IFORK_Q(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0) #define XFS_IFORK_SIZE(ip,w) \ ((w) == XFS_DATA_FORK ? \ XFS_IFORK_DSIZE(ip) : \ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index e5f97c69b320..d3b255f42789 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -424,12 +424,10 @@ struct xfs_log_dinode { /* structure must be padded to 64 bit alignment */ }; -static inline uint xfs_log_dinode_size(int version) -{ - if (version == 3) - return sizeof(struct xfs_log_dinode); - return offsetof(struct xfs_log_dinode, di_next_unlinked); -} +#define xfs_log_dinode_size(mp) \ + (xfs_sb_version_has_v3inode(&(mp)->m_sb) ? \ + sizeof(struct xfs_log_dinode) : \ + offsetof(struct xfs_log_dinode, di_next_unlinked)) /* * Buffer Log Format defintions diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 9a7fadb1361c..78236bd6c64f 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1591,8 +1591,10 @@ xfs_refcount_recover_extent( struct list_head *debris = priv; struct xfs_refcount_recovery *rr; - if (be32_to_cpu(rec->refc.rc_refcount) != 1) + if (be32_to_cpu(rec->refc.rc_refcount) != 1) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, cur->bc_mp); return -EFSCORRUPTED; + } rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 42085e70c01a..cf99e4cab627 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -15,7 +15,7 @@ #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_rtalloc.h" - +#include "xfs_error.h" /* * Realtime allocator bitmap functions shared with userspace. @@ -70,8 +70,10 @@ xfs_rtbuf_get( if (error) return error; - if (nmap == 0 || !xfs_bmap_is_real_extent(&map)) + if (nmap == 0 || !xfs_bmap_is_written_extent(&map)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; + } ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index c45acbd3add9..708feb8eac76 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -65,6 +65,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ #define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ +#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */ /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 0ba7368b9a5f..1d0e78e0099d 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -27,7 +27,7 @@ xfs_trans_ijoin( struct xfs_inode *ip, uint lock_flags) { - xfs_inode_log_item_t *iip; + struct xfs_inode_log_item *iip; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (ip->i_itemp == NULL) diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index b3584cd2cc16..8ece346def97 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -187,7 +187,7 @@ xfs_calc_inode_chunk_res( XFS_FSB_TO_B(mp, 1)); if (alloc) { /* icreate tx uses ordered buffers */ - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_sb_version_has_v3inode(&mp->m_sb)) return res; size = XFS_FSB_TO_B(mp, 1); } @@ -776,7 +776,7 @@ xfs_calc_clear_agi_bucket_reservation( /* * Adjusting quota limits. - * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) + * the disk quota buffer: sizeof(struct xfs_disk_dquot) */ STATIC uint xfs_calc_qm_setqlim_reservation(void) @@ -800,7 +800,7 @@ xfs_calc_qm_dqalloc_reservation( /* * Turning off quotas. - * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2 * the superblock for the quota flags: sector size */ STATIC uint @@ -813,7 +813,7 @@ xfs_calc_qm_quotaoff_reservation( /* * End of turning off quotas. - * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2 */ STATIC uint xfs_calc_qm_quotaoff_end_reservation(void) diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 96d7071cfa46..6788b0ca85eb 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -12,6 +12,7 @@ #include "xfs_inode.h" #include "xfs_attr.h" #include "xfs_trace.h" +#include "xfs_error.h" #include <linux/posix_acl_xattr.h> @@ -23,6 +24,7 @@ STATIC struct posix_acl * xfs_acl_from_disk( + struct xfs_mount *mp, const struct xfs_acl *aclp, int len, int max_entries) @@ -32,11 +34,18 @@ xfs_acl_from_disk( const struct xfs_acl_entry *ace; unsigned int count, i; - if (len < sizeof(*aclp)) + if (len < sizeof(*aclp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp, + len); return ERR_PTR(-EFSCORRUPTED); + } + count = be32_to_cpu(aclp->acl_cnt); - if (count > max_entries || XFS_ACL_SIZE(count) != len) + if (count > max_entries || XFS_ACL_SIZE(count) != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp, + len); return ERR_PTR(-EFSCORRUPTED); + } acl = posix_acl_alloc(count, GFP_KERNEL); if (!acl) @@ -57,10 +66,12 @@ xfs_acl_from_disk( switch (acl_e->e_tag) { case ACL_USER: - acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id)); + acl_e->e_uid = make_kuid(&init_user_ns, + be32_to_cpu(ace->ae_id)); break; case ACL_GROUP: - acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id)); + acl_e->e_gid = make_kgid(&init_user_ns, + be32_to_cpu(ace->ae_id)); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: @@ -93,10 +104,12 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) ace->ae_tag = cpu_to_be32(acl_e->e_tag); switch (acl_e->e_tag) { case ACL_USER: - ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid)); + ace->ae_id = cpu_to_be32( + from_kuid(&init_user_ns, acl_e->e_uid)); break; case ACL_GROUP: - ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid)); + ace->ae_id = cpu_to_be32( + from_kgid(&init_user_ns, acl_e->e_gid)); break; default: ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID); @@ -145,7 +158,7 @@ xfs_get_acl(struct inode *inode, int type) if (error != -ENOATTR) acl = ERR_PTR(error); } else { - acl = xfs_acl_from_disk(xfs_acl, len, + acl = xfs_acl_from_disk(ip->i_mount, xfs_acl, len, XFS_ACL_MAX_ENTRIES(ip->i_mount)); kmem_free(xfs_acl); } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f16d5f196c6b..5d9f8e4c4cde 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -495,7 +495,7 @@ xfs_map_blocks( ssize_t count = i_blocksize(inode); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); - xfs_fileoff_t cow_fsb = NULLFILEOFF; + xfs_fileoff_t cow_fsb; struct xfs_bmbt_irec imap; struct xfs_iext_cursor icur; int retries = 0; @@ -529,6 +529,8 @@ xfs_map_blocks( * landed in a hole and we skip the block. */ retry: + cow_fsb = NULLFILEOFF; + wpc->fork = XFS_DATA_FORK; xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index a640a285cc52..f052de128fa1 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -22,24 +22,21 @@ #include "xfs_attr_leaf.h" #include "xfs_quota.h" #include "xfs_dir2.h" +#include "xfs_error.h" /* - * Look at all the extents for this logical region, - * invalidate any buffers that are incore/in transactions. + * Invalidate any incore buffers associated with this remote attribute value + * extent. We never log remote attribute value buffers, which means that they + * won't be attached to a transaction and are therefore safe to mark stale. + * The actual bunmapi will be taken care of later. */ STATIC int -xfs_attr3_leaf_freextent( - struct xfs_trans **trans, +xfs_attr3_rmt_stale( struct xfs_inode *dp, xfs_dablk_t blkno, int blkcnt) { struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_dablk_t tblkno; - xfs_daddr_t dblkno; - int tblkcnt; - int dblkcnt; int nmap; int error; @@ -47,47 +44,28 @@ xfs_attr3_leaf_freextent( * Roll through the "value", invalidating the attribute value's * blocks. */ - tblkno = blkno; - tblkcnt = blkcnt; - while (tblkcnt > 0) { + while (blkcnt > 0) { /* * Try to remember where we decided to put the value. */ nmap = 1; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + error = xfs_bmapi_read(dp, (xfs_fileoff_t)blkno, blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); - if (error) { + if (error) return error; - } ASSERT(nmap == 1); - ASSERT(map.br_startblock != DELAYSTARTBLOCK); /* - * If it's a hole, these are already unmapped - * so there's nothing to invalidate. + * Mark any incore buffers for the remote value as stale. We + * never log remote attr value buffers, so the buffer should be + * easy to kill. */ - if (map.br_startblock != HOLESTARTBLOCK) { - - dblkno = XFS_FSB_TO_DADDR(dp->i_mount, - map.br_startblock); - dblkcnt = XFS_FSB_TO_BB(dp->i_mount, - map.br_blockcount); - bp = xfs_trans_get_buf(*trans, - dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, 0); - if (!bp) - return -ENOMEM; - xfs_trans_binval(*trans, bp); - /* - * Roll to next transaction. - */ - error = xfs_trans_roll_inode(trans, dp); - if (error) - return error; - } + error = xfs_attr_rmtval_stale(dp, &map, 0); + if (error) + return error; - tblkno += map.br_blockcount; - tblkcnt -= map.br_blockcount; + blkno += map.br_blockcount; + blkcnt -= map.br_blockcount; } return 0; @@ -101,86 +79,45 @@ xfs_attr3_leaf_freextent( */ STATIC int xfs_attr3_leaf_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) { - struct xfs_attr_leafblock *leaf; - struct xfs_attr3_icleaf_hdr ichdr; - struct xfs_attr_leaf_entry *entry; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_mount *mp = bp->b_mount; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_remote *name_rmt; - struct xfs_attr_inactive_list *list; - struct xfs_attr_inactive_list *lp; - int error; - int count; - int size; - int tmp; - int i; - struct xfs_mount *mp = bp->b_mount; + int error = 0; + int i; - leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); /* - * Count the number of "remote" value extents. + * Find the remote value extents for this leaf and invalidate their + * incore buffers. */ - count = 0; entry = xfs_attr3_leaf_entryp(leaf); for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) - count++; - } - } - - /* - * If there are no "remote" values, we're done. - */ - if (count == 0) { - xfs_trans_brelse(*trans, bp); - return 0; - } + int blkcnt; - /* - * Allocate storage for a list of all the "remote" value extents. - */ - size = count * sizeof(xfs_attr_inactive_list_t); - list = kmem_alloc(size, 0); - - /* - * Identify each of the "remote" value extents. - */ - lp = list; - entry = xfs_attr3_leaf_entryp(leaf); - for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) { - lp->valueblk = be32_to_cpu(name_rmt->valueblk); - lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); - lp++; - } - } - } - xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ + if (!entry->nameidx || (entry->flags & XFS_ATTR_LOCAL)) + continue; - /* - * Invalidate each of the "remote" value extents. - */ - error = 0; - for (lp = list, i = 0; i < count; i++, lp++) { - tmp = xfs_attr3_leaf_freextent(trans, dp, - lp->valueblk, lp->valuelen); + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (!name_rmt->valueblk) + continue; - if (error == 0) - error = tmp; /* save only the 1st errno */ + blkcnt = xfs_attr3_rmt_blocks(dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + error = xfs_attr3_rmt_stale(dp, + be32_to_cpu(name_rmt->valueblk), blkcnt); + if (error) + goto err; } - kmem_free(list); + xfs_trans_brelse(*trans, bp); +err: return error; } @@ -208,8 +145,9 @@ xfs_attr3_node_inactive( * Since this code is recursive (gasp!) we must protect ourselves. */ if (level > XFS_DA_NODE_MAXDEPTH) { + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ - return -EIO; + return -EFSCORRUPTED; } node = bp->b_addr; @@ -258,8 +196,9 @@ xfs_attr3_node_inactive( error = xfs_attr3_leaf_inactive(trans, dp, child_bp); break; default: - error = -EIO; + xfs_buf_mark_corrupt(child_bp); xfs_trans_brelse(*trans, child_bp); + error = -EFSCORRUPTED; break; } if (error) @@ -341,7 +280,8 @@ xfs_attr3_root_inactive( error = xfs_attr3_leaf_inactive(trans, dp, bp); break; default: - error = -EIO; + error = -EFSCORRUPTED; + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(*trans, bp); break; } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 00758fdc2fec..8c0972834449 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -258,8 +258,10 @@ xfs_attr_node_list_lookup( return 0; /* We can't point back to the root. */ - if (cursor->blkno == 0) + if (cursor->blkno == 0) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; + } } if (expected_level != 0) @@ -269,6 +271,7 @@ xfs_attr_node_list_lookup( return 0; out_corruptbuf: + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(tp, bp); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 83d24e983d4c..7b0c4d9679d9 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -21,7 +21,8 @@ #include "xfs_icache.h" #include "xfs_bmap_btree.h" #include "xfs_trans_space.h" - +#include "xfs_error.h" +#include "xfs_quota.h" kmem_zone_t *xfs_bui_zone; kmem_zone_t *xfs_bud_zone; @@ -124,34 +125,6 @@ xfs_bui_item_release( xfs_bui_release(BUI_ITEM(lip)); } -static const struct xfs_item_ops xfs_bui_item_ops = { - .iop_size = xfs_bui_item_size, - .iop_format = xfs_bui_item_format, - .iop_unpin = xfs_bui_item_unpin, - .iop_release = xfs_bui_item_release, -}; - -/* - * Allocate and initialize an bui item with the given number of extents. - */ -struct xfs_bui_log_item * -xfs_bui_init( - struct xfs_mount *mp) - -{ - struct xfs_bui_log_item *buip; - - buip = kmem_zone_zalloc(xfs_bui_zone, 0); - - xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); - buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; - buip->bui_format.bui_id = (uintptr_t)(void *)buip; - atomic_set(&buip->bui_next_extent, 0); - atomic_set(&buip->bui_refcount, 2); - - return buip; -} - static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_bud_log_item, bud_item); @@ -278,27 +251,6 @@ xfs_bmap_update_diff_items( return ba->bi_owner->i_ino - bb->bi_owner->i_ino; } -/* Get an BUI. */ -STATIC void * -xfs_bmap_update_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_bui_log_item *buip; - - ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); - ASSERT(tp != NULL); - - buip = xfs_bui_init(tp->t_mountp); - ASSERT(buip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &buip->bui_item); - return buip; -} - /* Set the map extent flags for this mapping. */ static void xfs_trans_set_bmap_flags( @@ -326,16 +278,12 @@ xfs_trans_set_bmap_flags( STATIC void xfs_bmap_update_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_bui_log_item *buip, + struct xfs_bmap_intent *bmap) { - struct xfs_bui_log_item *buip = intent; - struct xfs_bmap_intent *bmap; uint next_extent; struct xfs_map_extent *map; - bmap = container_of(item, struct xfs_bmap_intent, bi_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); @@ -355,14 +303,35 @@ xfs_bmap_update_log_item( bmap->bi_bmap.br_state); } +static struct xfs_log_item * +xfs_bmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_bui_log_item *buip = xfs_bui_init(mp); + struct xfs_bmap_intent *bmap; + + ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); + + xfs_trans_add_item(tp, &buip->bui_item); + if (sort) + list_sort(mp, items, xfs_bmap_update_diff_items); + list_for_each_entry(bmap, items, bi_list) + xfs_bmap_update_log_item(tp, buip, bmap); + return &buip->bui_item; +} + /* Get an BUD so we can process all the deferred rmap updates. */ STATIC void * xfs_bmap_update_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_bud(tp, intent); + return xfs_trans_get_bud(tp, BUI_ITEM(intent)); } /* Process a deferred rmap update. */ @@ -398,9 +367,9 @@ xfs_bmap_update_finish_item( /* Abort all pending BUIs. */ STATIC void xfs_bmap_update_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_bui_release(intent); + xfs_bui_release(BUI_ITEM(intent)); } /* Cancel a deferred rmap update. */ @@ -416,10 +385,8 @@ xfs_bmap_update_cancel_item( const struct xfs_defer_op_type xfs_bmap_update_defer_type = { .max_items = XFS_BUI_MAX_FAST_EXTENTS, - .diff_items = xfs_bmap_update_diff_items, .create_intent = xfs_bmap_update_create_intent, .abort_intent = xfs_bmap_update_abort_intent, - .log_item = xfs_bmap_update_log_item, .create_done = xfs_bmap_update_create_done, .finish_item = xfs_bmap_update_finish_item, .cancel_item = xfs_bmap_update_cancel_item, @@ -431,8 +398,8 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = { */ int xfs_bui_recover( - struct xfs_trans *parent_tp, - struct xfs_bui_log_item *buip) + struct xfs_bui_log_item *buip, + struct list_head *capture_list) { int error = 0; unsigned int bui_type; @@ -440,15 +407,13 @@ xfs_bui_recover( xfs_fsblock_t startblock_fsb; xfs_fsblock_t inode_fsb; xfs_filblks_t count; - bool op_ok; struct xfs_bud_log_item *budp; - enum xfs_bmap_intent_type type; int whichfork; xfs_exntst_t state; struct xfs_trans *tp; struct xfs_inode *ip = NULL; struct xfs_bmbt_irec irec; - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_mount *mp = buip->bui_item.li_mountp; ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); @@ -456,7 +421,7 @@ xfs_bui_recover( if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_bui_release(buip); - return -EIO; + return -EFSCORRUPTED; } /* @@ -468,16 +433,19 @@ xfs_bui_recover( XFS_FSB_TO_DADDR(mp, bmap->me_startblock)); inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp, XFS_INO_TO_FSB(mp, bmap->me_owner))); - switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { + state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + switch (bui_type) { case XFS_BMAP_MAP: case XFS_BMAP_UNMAP: - op_ok = true; break; default: - op_ok = false; - break; + return -EFSCORRUPTED; } - if (!op_ok || startblock_fsb == 0 || + if (startblock_fsb == 0 || bmap->me_len == 0 || inode_fsb == 0 || startblock_fsb >= mp->m_sb.sb_dblocks || @@ -490,54 +458,40 @@ xfs_bui_recover( */ set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_bui_release(buip); - return -EIO; + return -EFSCORRUPTED; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, - XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); + /* Grab the inode. */ + error = xfs_iget(mp, NULL, bmap->me_owner, 0, 0, &ip); if (error) return error; - /* - * Recovery stashes all deferred ops during intent processing and - * finishes them on completion. Transfer current dfops state to this - * transaction and transfer the result back before we return. - */ - xfs_defer_move(tp, parent_tp); - budp = xfs_trans_get_bud(tp, buip); - /* Grab the inode. */ - error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip); + error = xfs_qm_dqattach(ip); if (error) - goto err_inode; + goto err_rele; if (VFS_I(ip)->i_nlink == 0) xfs_iflags_set(ip, XFS_IRECOVERY); - /* Process deferred bmap item. */ - state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - switch (bui_type) { - case XFS_BMAP_MAP: - case XFS_BMAP_UNMAP: - type = bui_type; - break; - default: - error = -EFSCORRUPTED; - goto err_inode; - } + /* Allocate transaction and do the work. */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); + if (error) + goto err_rele; + + budp = xfs_trans_get_bud(tp, buip); + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); count = bmap->me_len; - error = xfs_trans_log_finish_bmap_update(tp, budp, type, ip, whichfork, - bmap->me_startoff, bmap->me_startblock, &count, state); + error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, + whichfork, bmap->me_startoff, bmap->me_startblock, + &count, state); if (error) - goto err_inode; + goto err_cancel; if (count > 0) { - ASSERT(type == XFS_BMAP_UNMAP); + ASSERT(bui_type == XFS_BMAP_UNMAP); irec.br_startblock = bmap->me_startblock; irec.br_blockcount = count; irec.br_startoff = bmap->me_startoff; @@ -546,19 +500,78 @@ xfs_bui_recover( } set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); - xfs_defer_move(parent_tp, tp); - error = xfs_trans_commit(tp); + /* + * Commit transaction, which frees the transaction and saves the inode + * for later replay activities. + */ + error = xfs_defer_ops_capture_and_commit(tp, ip, capture_list); + if (error) + goto err_unlock; + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); + return 0; - return error; - -err_inode: - xfs_defer_move(parent_tp, tp); +err_cancel: xfs_trans_cancel(tp); - if (ip) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_irele(ip); - } +err_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); +err_rele: + xfs_irele(ip); return error; } + +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_bui_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_bud_log_item *budp; + struct xfs_bui_log_item *buip; + struct xfs_map_extent *extp; + unsigned int count; + + count = BUI_ITEM(intent)->bui_format.bui_nextents; + extp = BUI_ITEM(intent)->bui_format.bui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); + + buip = xfs_bui_init(tp->t_mountp); + memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp)); + atomic_set(&buip->bui_next_extent, count); + xfs_trans_add_item(tp, &buip->bui_item); + set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); + return &buip->bui_item; +} + +static const struct xfs_item_ops xfs_bui_item_ops = { + .iop_size = xfs_bui_item_size, + .iop_format = xfs_bui_item_format, + .iop_unpin = xfs_bui_item_unpin, + .iop_release = xfs_bui_item_release, + .iop_relog = xfs_bui_item_relog, +}; + +/* + * Allocate and initialize an bui item with the given number of extents. + */ +struct xfs_bui_log_item * +xfs_bui_init( + struct xfs_mount *mp) + +{ + struct xfs_bui_log_item *buip; + + buip = kmem_zone_zalloc(xfs_bui_zone, 0); + + xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); + buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; + buip->bui_format.bui_id = (uintptr_t)(void *)buip; + atomic_set(&buip->bui_next_extent, 0); + atomic_set(&buip->bui_refcount, 2); + + return buip; +} diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index ad479cc73de8..a95e99c26979 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -77,6 +77,7 @@ extern struct kmem_zone *xfs_bud_zone; struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *); void xfs_bui_item_free(struct xfs_bui_log_item *); void xfs_bui_release(struct xfs_bui_log_item *); -int xfs_bui_recover(struct xfs_trans *parent_tp, struct xfs_bui_log_item *buip); +int xfs_bui_recover(struct xfs_bui_log_item *buip, + struct list_head *capture_list); #endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index d6d78e127625..32170ce2984c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -125,7 +125,7 @@ xfs_bmap_rtalloc( * pick an extent that will space things out in the rt area. */ if (ap->eof && ap->offset == 0) { - xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ + xfs_rtblock_t rtx; /* realtime extent no */ error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); if (error) @@ -1167,6 +1167,7 @@ xfs_prepare_shift( struct xfs_inode *ip, loff_t offset) { + struct xfs_mount *mp = ip->i_mount; int error; /* @@ -1180,6 +1181,17 @@ xfs_prepare_shift( } /* + * Shift operations must stabilize the start block offset boundary along + * with the full range of the operation. If we don't, a COW writeback + * completion could race with an insert, front merge with the start + * extent (after split) during the shift and corrupt the file. Start + * with the block just prior to the start to stabilize the boundary. + */ + offset = round_down(offset, 1 << mp->m_sb.sb_blocklog); + if (offset) + offset -= (1 << mp->m_sb.sb_blocklog); + + /* * Writeback and invalidate cache for the remainder of the file as we're * about to shift down every extent from offset to EOF. */ @@ -1225,7 +1237,6 @@ xfs_collapse_file_space( int error; xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len); xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); - uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); bool done = false; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); @@ -1241,32 +1252,34 @@ xfs_collapse_file_space( if (error) return error; - while (!error && !done) { - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, - &tp); - if (error) - break; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); + if (error) + return error; - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, - ip->i_gdquot, ip->i_pdquot, resblks, 0, - XFS_QMOPT_RES_REGBLKS); - if (error) - goto out_trans_cancel; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + while (!done) { error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb, &done); if (error) goto out_trans_cancel; + if (done) + break; - error = xfs_trans_commit(tp); + /* finish any deferred frees and roll the transaction */ + error = xfs_defer_finish(&tp); + if (error) + goto out_trans_cancel; } + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_trans_cancel: xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1309,35 +1322,41 @@ xfs_insert_file_space( if (error) return error; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + /* * The extent shifting code works on extent granularity. So, if stop_fsb * is not the starting block of extent, we need to split the extent at * stop_fsb. */ - error = xfs_bmap_split_extent(ip, stop_fsb); + error = xfs_bmap_split_extent(tp, ip, stop_fsb); if (error) - return error; + goto out_trans_cancel; - while (!error && !done) { - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, - &tp); + do { + error = xfs_defer_finish(&tp); if (error) - break; + goto out_trans_cancel; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb, &done, stop_fsb); if (error) goto out_trans_cancel; + } while (!done); - error = xfs_trans_commit(tp); - } - + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_trans_cancel: xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1605,12 +1624,12 @@ xfs_swap_extent_forks( * event of a crash. Set the owner change log flags now and leave the * bmbt scan as the last step. */ - if (ip->i_d.di_version == 3 && - ip->i_d.di_format == XFS_DINODE_FMT_BTREE) - (*target_log_flags) |= XFS_ILOG_DOWNER; - if (tip->i_d.di_version == 3 && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) - (*src_log_flags) |= XFS_ILOG_DOWNER; + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) + (*target_log_flags) |= XFS_ILOG_DOWNER; + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + (*src_log_flags) |= XFS_ILOG_DOWNER; + } /* * Swap the data forks of the inodes @@ -1645,7 +1664,7 @@ xfs_swap_extent_forks( (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: - ASSERT(ip->i_d.di_version < 3 || + ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) || (*src_log_flags & XFS_ILOG_DOWNER)); (*src_log_flags) |= XFS_ILOG_DBROOT; break; @@ -1657,7 +1676,7 @@ xfs_swap_extent_forks( break; case XFS_DINODE_FMT_BTREE: (*target_log_flags) |= XFS_ILOG_DBROOT; - ASSERT(tip->i_d.di_version < 3 || + ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) || (*target_log_flags & XFS_ILOG_DOWNER)); break; } @@ -1721,6 +1740,7 @@ xfs_swap_extents( int lock_flags; uint64_t f; int resblks = 0; + unsigned int flags = 0; /* * Lock the inodes against other IO, page faults and truncate to @@ -1776,17 +1796,16 @@ xfs_swap_extents( resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w); /* - * Handle the corner case where either inode might straddle the - * btree format boundary. If so, the inode could bounce between - * btree <-> extent format on unmap -> remap cycles, freeing and - * allocating a bmapbt block each time. + * If either inode straddles a bmapbt block allocation boundary, + * the rmapbt algorithm triggers repeated allocs and frees as + * extents are remapped. This can exhaust the block reservation + * prematurely and cause shutdown. Return freed blocks to the + * transaction reservation to counter this behavior. */ - if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1)) - resblks += XFS_IFORK_MAXEXT(ip, w); - if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1)) - resblks += XFS_IFORK_MAXEXT(tip, w); + flags |= XFS_TRANS_RES_FDBLKS; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags, + &tp); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1264ac63e4e5..4f18457aae2a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1547,6 +1547,28 @@ xfs_buf_zero( } /* + * Log a message about and stale a buffer that a caller has decided is corrupt. + * + * This function should be called for the kinds of metadata corruption that + * cannot be detect from a verifier, such as incorrect inter-block relationship + * data. Do /not/ call this function from a verifier function. + * + * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will + * be marked stale, but b_error will not be set. The caller is responsible for + * releasing the buffer or fixing it. + */ +void +__xfs_buf_mark_corrupt( + struct xfs_buf *bp, + xfs_failaddr_t fa) +{ + ASSERT(bp->b_flags & XBF_DONE); + + xfs_buf_corruption_error(bp, fa); + xfs_buf_stale(bp); +} + +/* * Handling of buffer targets (buftargs). */ diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index f6ce17d8d848..621467ab17c8 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -270,6 +270,8 @@ static inline int xfs_buf_submit(struct xfs_buf *bp) } void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); +void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); +#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) /* Buffer Utility Routines */ extern void *xfs_buf_offset(struct xfs_buf *, size_t); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index d74fbd1e9d3e..f98260ed6d51 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -328,7 +328,7 @@ xfs_buf_item_format( * occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) || + if (xfs_sb_version_has_v3inode(&lip->li_mountp->m_sb) || !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; @@ -956,7 +956,7 @@ xfs_buf_item_relse( struct xfs_buf_log_item *bip = bp->b_log_item; trace_xfs_buf_item_relse(bp, _RET_IP_); - ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); bp->b_log_item = NULL; if (list_empty(&bp->b_li_list)) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 3cbf248af51f..672286f1762f 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -48,7 +48,7 @@ static struct lock_class_key xfs_dquot_project_class; */ void xfs_qm_dqdestroy( - xfs_dquot_t *dqp) + struct xfs_dquot *dqp) { ASSERT(list_empty(&dqp->q_lru)); @@ -113,8 +113,8 @@ xfs_qm_adjust_dqlimits( */ void xfs_qm_adjust_dqtimers( - xfs_mount_t *mp, - xfs_disk_dquot_t *d) + struct xfs_mount *mp, + struct xfs_disk_dquot *d) { ASSERT(d->d_id); @@ -205,16 +205,18 @@ xfs_qm_adjust_dqtimers( */ STATIC void xfs_qm_init_dquot_blk( - xfs_trans_t *tp, - xfs_mount_t *mp, - xfs_dqid_t id, - uint type, - xfs_buf_t *bp) + struct xfs_trans *tp, + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct xfs_buf *bp) { struct xfs_quotainfo *q = mp->m_quotainfo; - xfs_dqblk_t *d; - xfs_dqid_t curid; - int i; + struct xfs_dqblk *d; + xfs_dqid_t curid; + unsigned int qflag; + unsigned int blftype; + int i; ASSERT(tp); ASSERT(xfs_buf_islocked(bp)); @@ -238,11 +240,39 @@ xfs_qm_init_dquot_blk( } } - xfs_trans_dquot_buf(tp, bp, - (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : - ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : - XFS_BLF_GDQUOT_BUF))); - xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); + if (type & XFS_DQ_USER) { + qflag = XFS_UQUOTA_CHKD; + blftype = XFS_BLF_UDQUOT_BUF; + } else if (type & XFS_DQ_PROJ) { + qflag = XFS_PQUOTA_CHKD; + blftype = XFS_BLF_PDQUOT_BUF; + } else { + qflag = XFS_GQUOTA_CHKD; + blftype = XFS_BLF_GDQUOT_BUF; + } + + xfs_trans_dquot_buf(tp, bp, blftype); + + /* + * quotacheck uses delayed writes to update all the dquots on disk in an + * efficient manner instead of logging the individual dquot changes as + * they are made. However if we log the buffer allocated here and crash + * after quotacheck while the logged initialisation is still in the + * active region of the log, log recovery can replay the dquot buffer + * initialisation over the top of the checked dquots and corrupt quota + * accounting. + * + * To avoid this problem, quotacheck cannot log the initialised buffer. + * We must still dirty the buffer and write it back before the + * allocation transaction clears the log. Therefore, mark the buffer as + * ordered instead of logging it directly. This is safe for quotacheck + * because it detects and repairs allocated but initialized dquot blocks + * in the quota inodes. + */ + if (!(mp->m_qflags & qflag)) + xfs_trans_ordered_buf(tp, bp); + else + xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } /* @@ -497,7 +527,7 @@ xfs_dquot_from_disk( struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset; /* copy everything from disk dquot to the incore dquot */ - memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); + memcpy(&dqp->q_core, ddqp, sizeof(struct xfs_disk_dquot)); /* * Reservation counters are defined as reservation plus current usage @@ -829,11 +859,11 @@ xfs_qm_id_for_quotatype( { switch (type) { case XFS_DQ_USER: - return ip->i_d.di_uid; + return i_uid_read(VFS_I(ip)); case XFS_DQ_GROUP: - return ip->i_d.di_gid; + return i_gid_read(VFS_I(ip)); case XFS_DQ_PROJ: - return xfs_get_projid(ip); + return ip->i_d.di_projid; } ASSERT(0); return 0; @@ -989,7 +1019,7 @@ xfs_qm_dqput( */ void xfs_qm_dqrele( - xfs_dquot_t *dqp) + struct xfs_dquot *dqp) { if (!dqp) return; @@ -1018,8 +1048,8 @@ xfs_qm_dqflush_done( struct xfs_buf *bp, struct xfs_log_item *lip) { - xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip; - xfs_dquot_t *dqp = qip->qli_dquot; + struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip; + struct xfs_dquot *dqp = qip->qli_dquot; struct xfs_ail *ailp = lip->li_ailp; /* @@ -1105,8 +1135,8 @@ xfs_qm_dqflush( * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp, - &xfs_dquot_buf_ops); + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, + &bp, &xfs_dquot_buf_ops); if (error) goto out_unlock; @@ -1125,11 +1155,11 @@ xfs_qm_dqflush( xfs_buf_relse(bp); xfs_dqfunlock(dqp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return -EIO; + return -EFSCORRUPTED; } /* This is the only portion of data that needs to persist */ - memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t)); + memcpy(ddqp, &dqp->q_core, sizeof(struct xfs_disk_dquot)); /* * Clear the dirty field and remember the flush lsn for later use. @@ -1176,7 +1206,7 @@ xfs_qm_dqflush( out_unlock: xfs_dqfunlock(dqp); - return -EIO; + return error; } /* @@ -1187,8 +1217,8 @@ out_unlock: */ void xfs_dqlock2( - xfs_dquot_t *d1, - xfs_dquot_t *d2) + struct xfs_dquot *d1, + struct xfs_dquot *d2) { if (d1 && d2) { ASSERT(d1 != d2); diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 4fe85709d55d..fe3e46df604b 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -30,33 +30,36 @@ enum { /* * The incore dquot structure */ -typedef struct xfs_dquot { - uint dq_flags; /* various flags (XFS_DQ_*) */ - struct list_head q_lru; /* global free list of dquots */ - struct xfs_mount*q_mount; /* filesystem this relates to */ - uint q_nrefs; /* # active refs from inodes */ - xfs_daddr_t q_blkno; /* blkno of dquot buffer */ - int q_bufoffset; /* off of dq in buffer (# dquots) */ - xfs_fileoff_t q_fileoffset; /* offset in quotas file */ - - xfs_disk_dquot_t q_core; /* actual usage & quotas */ - xfs_dq_logitem_t q_logitem; /* dquot log item */ - xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ - xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ - xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ - xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */ - xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */ - int64_t q_low_space[XFS_QLOWSP_MAX]; - struct mutex q_qlock; /* quota lock */ - struct completion q_flush; /* flush completion queue */ - atomic_t q_pincount; /* dquot pin count */ - wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ -} xfs_dquot_t; +struct xfs_dquot { + uint dq_flags; + struct list_head q_lru; + struct xfs_mount *q_mount; + uint q_nrefs; + xfs_daddr_t q_blkno; + int q_bufoffset; + xfs_fileoff_t q_fileoffset; + + struct xfs_disk_dquot q_core; + struct xfs_dq_logitem q_logitem; + /* total regular nblks used+reserved */ + xfs_qcnt_t q_res_bcount; + /* total inos allocd+reserved */ + xfs_qcnt_t q_res_icount; + /* total realtime blks used+reserved */ + xfs_qcnt_t q_res_rtbcount; + xfs_qcnt_t q_prealloc_lo_wmark; + xfs_qcnt_t q_prealloc_hi_wmark; + int64_t q_low_space[XFS_QLOWSP_MAX]; + struct mutex q_qlock; + struct completion q_flush; + atomic_t q_pincount; + struct wait_queue_head q_pinwait; +}; /* * Lock hierarchy for q_qlock: * XFS_QLOCK_NORMAL is the implicit default, - * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 + * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 */ enum { XFS_QLOCK_NORMAL = 0, @@ -64,21 +67,21 @@ enum { }; /* - * Manage the q_flush completion queue embedded in the dquot. This completion + * Manage the q_flush completion queue embedded in the dquot. This completion * queue synchronizes processes attempting to flush the in-core dquot back to * disk. */ -static inline void xfs_dqflock(xfs_dquot_t *dqp) +static inline void xfs_dqflock(struct xfs_dquot *dqp) { wait_for_completion(&dqp->q_flush); } -static inline bool xfs_dqflock_nowait(xfs_dquot_t *dqp) +static inline bool xfs_dqflock_nowait(struct xfs_dquot *dqp) { return try_wait_for_completion(&dqp->q_flush); } -static inline void xfs_dqfunlock(xfs_dquot_t *dqp) +static inline void xfs_dqfunlock(struct xfs_dquot *dqp) { complete(&dqp->q_flush); } @@ -112,7 +115,7 @@ static inline int xfs_this_quota_on(struct xfs_mount *mp, int type) } } -static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) +static inline struct xfs_dquot *xfs_inode_dquot(struct xfs_inode *ip, int type) { switch (type & XFS_DQ_ALLTYPES) { case XFS_DQ_USER: @@ -147,31 +150,30 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) #define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) #define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) -extern void xfs_qm_dqdestroy(xfs_dquot_t *); -extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); -extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); -extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, - xfs_disk_dquot_t *); -extern void xfs_qm_adjust_dqlimits(struct xfs_mount *, - struct xfs_dquot *); -extern xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, - uint type); -extern int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id, +void xfs_qm_dqdestroy(struct xfs_dquot *dqp); +int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp); +void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); +void xfs_qm_adjust_dqtimers(struct xfs_mount *mp, + struct xfs_disk_dquot *d); +void xfs_qm_adjust_dqlimits(struct xfs_mount *mp, + struct xfs_dquot *d); +xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, uint type); +int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id, uint type, bool can_alloc, struct xfs_dquot **dqpp); -extern int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type, - bool can_alloc, - struct xfs_dquot **dqpp); -extern int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, +int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type, + bool can_alloc, + struct xfs_dquot **dqpp); +int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, uint type, struct xfs_dquot **dqpp); -extern int xfs_qm_dqget_uncached(struct xfs_mount *mp, - xfs_dqid_t id, uint type, - struct xfs_dquot **dqpp); -extern void xfs_qm_dqput(xfs_dquot_t *); +int xfs_qm_dqget_uncached(struct xfs_mount *mp, + xfs_dqid_t id, uint type, + struct xfs_dquot **dqpp); +void xfs_qm_dqput(struct xfs_dquot *dqp); -extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); +void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); -extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); +void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index d60647d7197b..baad1748d0d1 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -189,7 +189,8 @@ xfs_qm_dquot_logitem_push( if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); - } + } else if (error == -EAGAIN) + rval = XFS_ITEM_LOCKED; spin_lock(&lip->li_ailp->ail_lock); out_unlock: @@ -307,36 +308,62 @@ xfs_qm_qoffend_logitem_committed( { struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip); struct xfs_qoff_logitem *qfs = qfe->qql_start_lip; - struct xfs_ail *ailp = qfs->qql_item.li_ailp; - /* - * Delete the qoff-start logitem from the AIL. - * xfs_trans_ail_delete() drops the AIL lock. - */ - spin_lock(&ailp->ail_lock); - xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); + xfs_qm_qoff_logitem_relse(qfs); - kmem_free(qfs->qql_item.li_lv_shadow); kmem_free(lip->li_lv_shadow); - kmem_free(qfs); kmem_free(qfe); return (xfs_lsn_t)-1; } +STATIC void +xfs_qm_qoff_logitem_release( + struct xfs_log_item *lip) +{ + struct xfs_qoff_logitem *qoff = QOFF_ITEM(lip); + + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { + if (qoff->qql_start_lip) + xfs_qm_qoff_logitem_relse(qoff->qql_start_lip); + xfs_qm_qoff_logitem_relse(qoff); + } +} + static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_committed = xfs_qm_qoffend_logitem_committed, .iop_push = xfs_qm_qoff_logitem_push, + .iop_release = xfs_qm_qoff_logitem_release, }; static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_push = xfs_qm_qoff_logitem_push, + .iop_release = xfs_qm_qoff_logitem_release, }; /* + * Delete the quotaoff intent from the AIL and free it. On success, + * this should only be called for the start item. It can be used for + * either on shutdown or abort. + */ +void +xfs_qm_qoff_logitem_relse( + struct xfs_qoff_logitem *qoff) +{ + struct xfs_log_item *lip = &qoff->qql_item; + + ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags) || + test_bit(XFS_LI_ABORTED, &lip->li_flags) || + XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + kmem_free(lip->li_lv_shadow); + kmem_free(qoff); +} + +/* * Allocate and initialize an quotaoff item of the correct quota type(s). */ struct xfs_qoff_logitem * diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 1aed34ccdabc..2b86a43d7ce2 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -11,25 +11,28 @@ struct xfs_trans; struct xfs_mount; struct xfs_qoff_logitem; -typedef struct xfs_dq_logitem { - struct xfs_log_item qli_item; /* common portion */ - struct xfs_dquot *qli_dquot; /* dquot ptr */ - xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ -} xfs_dq_logitem_t; +struct xfs_dq_logitem { + struct xfs_log_item qli_item; /* common portion */ + struct xfs_dquot *qli_dquot; /* dquot ptr */ + xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ +}; -typedef struct xfs_qoff_logitem { - struct xfs_log_item qql_item; /* common portion */ - struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ +struct xfs_qoff_logitem { + struct xfs_log_item qql_item; /* common portion */ + struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ unsigned int qql_flags; -} xfs_qoff_logitem_t; +}; -extern void xfs_qm_dquot_logitem_init(struct xfs_dquot *); -extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *, - struct xfs_qoff_logitem *, uint); -extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *, - struct xfs_qoff_logitem *, uint); -extern void xfs_trans_log_quotaoff_item(struct xfs_trans *, - struct xfs_qoff_logitem *); +void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp); +struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp, + struct xfs_qoff_logitem *start, + uint flags); +void xfs_qm_qoff_logitem_relse(struct xfs_qoff_logitem *); +struct xfs_qoff_logitem *xfs_trans_get_qoff_item(struct xfs_trans *tp, + struct xfs_qoff_logitem *startqoff, + uint flags); +void xfs_trans_log_quotaoff_item(struct xfs_trans *tp, + struct xfs_qoff_logitem *qlp); #endif /* __XFS_DQUOT_ITEM_H__ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 849fd4476950..182b70464b71 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -329,19 +329,43 @@ xfs_corruption_error( const char *tag, int level, struct xfs_mount *mp, - void *buf, + const void *buf, size_t bufsize, const char *filename, int linenum, xfs_failaddr_t failaddr) { - if (level <= xfs_error_level) + if (buf && level <= xfs_error_level) xfs_hex_dump(buf, bufsize); xfs_error_report(tag, level, mp, filename, linenum, failaddr); xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); } /* + * Complain about the kinds of metadata corruption that we can't detect from a + * verifier, such as incorrect inter-block relationship data. Does not set + * bp->b_error. + * + * Call xfs_buf_mark_corrupt, not this function. + */ +void +xfs_buf_corruption_error( + struct xfs_buf *bp, + xfs_failaddr_t fa) +{ + struct xfs_mount *mp = bp->b_mount; + + xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, + "Metadata corruption detected at %pS, %s block 0x%llx", + fa, bp->b_ops->name, bp->b_bn); + + xfs_alert(mp, "Unmount and run xfs_repair"); + + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); +} + +/* * Warnings specifically for verifier errors. Differentiate CRC vs. invalid * values, and omit the stack trace unless the error level is tuned high. */ @@ -350,7 +374,7 @@ xfs_buf_verifier_error( struct xfs_buf *bp, int error, const char *name, - void *buf, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr) { @@ -402,7 +426,7 @@ xfs_inode_verifier_error( struct xfs_inode *ip, int error, const char *name, - void *buf, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr) { diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 602aa7d62b66..c6bb7d7a2161 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -12,16 +12,17 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, const char *filename, int linenum, xfs_failaddr_t failaddr); extern void xfs_corruption_error(const char *tag, int level, - struct xfs_mount *mp, void *buf, size_t bufsize, + struct xfs_mount *mp, const void *buf, size_t bufsize, const char *filename, int linenum, xfs_failaddr_t failaddr); +void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa); extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error, - const char *name, void *buf, size_t bufsz, + const char *name, const void *buf, size_t bufsz, xfs_failaddr_t failaddr); extern void xfs_verifier_error(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, - const char *name, void *buf, size_t bufsz, + const char *name, const void *buf, size_t bufsz, xfs_failaddr_t failaddr); #define XFS_ERROR_REPORT(e, lvl, mp) \ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index f1372f9046e3..5a4b0119143a 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -15,7 +15,6 @@ #include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_icache.h" -#include "xfs_log.h" #include "xfs_pnfs.h" /* @@ -221,18 +220,7 @@ STATIC int xfs_fs_nfs_commit_metadata( struct inode *inode) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!lsn) - return 0; - return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_inode(XFS_I(inode)); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 2183d87be4cf..ef17c1f6db32 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -344,7 +344,6 @@ xfs_extent_busy_trim( ASSERT(*len > 0); spin_lock(&args->pag->pagb_lock); -restart: fbno = *bno; flen = *len; rbp = args->pag->pagb_tree.rb_node; @@ -363,19 +362,6 @@ restart: continue; } - /* - * If this is a metadata allocation, try to reuse the busy - * extent instead of trimming the allocation. - */ - if (!xfs_alloc_is_userdata(args->datatype) && - !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { - if (!xfs_extent_busy_update_extent(args->mp, args->pag, - busyp, fbno, flen, - false)) - goto restart; - continue; - } - if (bbno <= fbno) { /* start overlap */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index e44efc41a041..de3cdce892fd 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -21,7 +21,7 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_trace.h" - +#include "xfs_error.h" kmem_zone_t *xfs_efi_zone; kmem_zone_t *xfs_efd_zone; @@ -139,44 +139,6 @@ xfs_efi_item_release( xfs_efi_release(EFI_ITEM(lip)); } -static const struct xfs_item_ops xfs_efi_item_ops = { - .iop_size = xfs_efi_item_size, - .iop_format = xfs_efi_item_format, - .iop_unpin = xfs_efi_item_unpin, - .iop_release = xfs_efi_item_release, -}; - - -/* - * Allocate and initialize an efi item with the given number of extents. - */ -struct xfs_efi_log_item * -xfs_efi_init( - struct xfs_mount *mp, - uint nextents) - -{ - struct xfs_efi_log_item *efip; - uint size; - - ASSERT(nextents > 0); - if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { - size = (uint)(sizeof(xfs_efi_log_item_t) + - ((nextents - 1) * sizeof(xfs_extent_t))); - efip = kmem_zalloc(size, 0); - } else { - efip = kmem_zone_zalloc(xfs_efi_zone, 0); - } - - xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); - efip->efi_format.efi_nextents = nextents; - efip->efi_format.efi_id = (uintptr_t)(void *)efip; - atomic_set(&efip->efi_next_extent, 0); - atomic_set(&efip->efi_refcount, 2); - - return efip; -} - /* * Copy an EFI format buffer from the given buf, and into the destination * EFI format structure. @@ -228,6 +190,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } return 0; } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; } @@ -411,41 +374,16 @@ xfs_extent_free_diff_items( XFS_FSB_TO_AGNO(mp, rb->xefi_startblock); } -/* Get an EFI. */ -STATIC void * -xfs_extent_free_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_efi_log_item *efip; - - ASSERT(tp != NULL); - ASSERT(count > 0); - - efip = xfs_efi_init(tp->t_mountp, count); - ASSERT(efip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &efip->efi_item); - return efip; -} - /* Log a free extent to the intent item. */ STATIC void xfs_extent_free_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_efi_log_item *efip, + struct xfs_extent_free_item *free) { - struct xfs_efi_log_item *efip = intent; - struct xfs_extent_free_item *free; uint next_extent; struct xfs_extent *extp; - free = container_of(item, struct xfs_extent_free_item, xefi_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); @@ -461,14 +399,35 @@ xfs_extent_free_log_item( extp->ext_len = free->xefi_blockcount; } +static struct xfs_log_item * +xfs_extent_free_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_efi_log_item *efip = xfs_efi_init(mp, count); + struct xfs_extent_free_item *free; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &efip->efi_item); + if (sort) + list_sort(mp, items, xfs_extent_free_diff_items); + list_for_each_entry(free, items, xefi_list) + xfs_extent_free_log_item(tp, efip, free); + return &efip->efi_item; +} + /* Get an EFD so we can process all the free extents. */ STATIC void * xfs_extent_free_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_efd(tp, intent, count); + return xfs_trans_get_efd(tp, EFI_ITEM(intent), count); } /* Process a free extent. */ @@ -494,9 +453,9 @@ xfs_extent_free_finish_item( /* Abort all pending EFIs. */ STATIC void xfs_extent_free_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_efi_release(intent); + xfs_efi_release(EFI_ITEM(intent)); } /* Cancel a free extent. */ @@ -512,10 +471,8 @@ xfs_extent_free_cancel_item( const struct xfs_defer_op_type xfs_extent_free_defer_type = { .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .diff_items = xfs_extent_free_diff_items, .create_intent = xfs_extent_free_create_intent, .abort_intent = xfs_extent_free_abort_intent, - .log_item = xfs_extent_free_log_item, .create_done = xfs_extent_free_create_done, .finish_item = xfs_extent_free_finish_item, .cancel_item = xfs_extent_free_cancel_item, @@ -578,10 +535,8 @@ xfs_agfl_free_finish_item( /* sub-type with special handling for AGFL deferred frees */ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .diff_items = xfs_extent_free_diff_items, .create_intent = xfs_extent_free_create_intent, .abort_intent = xfs_extent_free_abort_intent, - .log_item = xfs_extent_free_log_item, .create_done = xfs_extent_free_create_done, .finish_item = xfs_agfl_free_finish_item, .cancel_item = xfs_extent_free_cancel_item, @@ -593,9 +548,10 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { */ int xfs_efi_recover( - struct xfs_mount *mp, - struct xfs_efi_log_item *efip) + struct xfs_efi_log_item *efip, + struct list_head *capture_list) { + struct xfs_mount *mp = efip->efi_item.li_mountp; struct xfs_efd_log_item *efdp; struct xfs_trans *tp; int i; @@ -624,7 +580,7 @@ xfs_efi_recover( */ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip); - return -EIO; + return -EFSCORRUPTED; } } @@ -644,10 +600,76 @@ xfs_efi_recover( } set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); - error = xfs_trans_commit(tp); - return error; + + return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); abort_error: xfs_trans_cancel(tp); return error; } + +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_efi_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_efd_log_item *efdp; + struct xfs_efi_log_item *efip; + struct xfs_extent *extp; + unsigned int count; + + count = EFI_ITEM(intent)->efi_format.efi_nextents; + extp = EFI_ITEM(intent)->efi_format.efi_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count); + efdp->efd_next_extent = count; + memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp)); + set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); + + efip = xfs_efi_init(tp->t_mountp, count); + memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); + atomic_set(&efip->efi_next_extent, count); + xfs_trans_add_item(tp, &efip->efi_item); + set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); + return &efip->efi_item; +} + +static const struct xfs_item_ops xfs_efi_item_ops = { + .iop_size = xfs_efi_item_size, + .iop_format = xfs_efi_item_format, + .iop_unpin = xfs_efi_item_unpin, + .iop_release = xfs_efi_item_release, + .iop_relog = xfs_efi_item_relog, +}; + +/* + * Allocate and initialize an efi item with the given number of extents. + */ +struct xfs_efi_log_item * +xfs_efi_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_efi_log_item *efip; + uint size; + + ASSERT(nextents > 0); + if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { + size = (uint)(sizeof(struct xfs_efi_log_item) + + ((nextents - 1) * sizeof(xfs_extent_t))); + efip = kmem_zalloc(size, 0); + } else { + efip = kmem_zone_zalloc(xfs_efi_zone, 0); + } + + xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); + efip->efi_format.efi_nextents = nextents; + efip->efi_format.efi_id = (uintptr_t)(void *)efip; + atomic_set(&efip->efi_next_extent, 0); + atomic_set(&efip->efi_refcount, 2); + + return efip; +} diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 16aaab06d4ec..883f0f1d8989 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -50,25 +50,25 @@ struct kmem_zone; * of commit failure or log I/O errors. Note that the EFD is not inserted in the * AIL, so at this point both the EFI and EFD are freed. */ -typedef struct xfs_efi_log_item { +struct xfs_efi_log_item { struct xfs_log_item efi_item; atomic_t efi_refcount; atomic_t efi_next_extent; unsigned long efi_flags; /* misc flags */ xfs_efi_log_format_t efi_format; -} xfs_efi_log_item_t; +}; /* * This is the "extent free done" log item. It is used to log * the fact that some extents earlier mentioned in an efi item * have been freed. */ -typedef struct xfs_efd_log_item { +struct xfs_efd_log_item { struct xfs_log_item efd_item; - xfs_efi_log_item_t *efd_efip; + struct xfs_efi_log_item *efd_efip; uint efd_next_extent; xfs_efd_log_format_t efd_format; -} xfs_efd_log_item_t; +}; /* * Max number of extents in fast allocation path. @@ -78,13 +78,13 @@ typedef struct xfs_efd_log_item { extern struct kmem_zone *xfs_efi_zone; extern struct kmem_zone *xfs_efd_zone; -xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint); +struct xfs_efi_log_item *xfs_efi_init(struct xfs_mount *, uint); int xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt); -void xfs_efi_item_free(xfs_efi_log_item_t *); +void xfs_efi_item_free(struct xfs_efi_log_item *); void xfs_efi_release(struct xfs_efi_log_item *); -int xfs_efi_recover(struct xfs_mount *mp, - struct xfs_efi_log_item *efip); +int xfs_efi_recover(struct xfs_efi_log_item *efip, + struct list_head *capture_list); #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 203065a64765..b651715da8c6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -80,19 +80,9 @@ xfs_dir_fsync( int datasync) { struct xfs_inode *ip = XFS_I(file->f_mapping->host); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; trace_xfs_dir_fsync(ip); - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!lsn) - return 0; - return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_inode(ip); } STATIC int @@ -187,7 +177,12 @@ xfs_file_dio_aio_read( file_accessed(iocb->ki_filp); - xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + return -EAGAIN; + } else { + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -995,6 +990,21 @@ xfs_file_fadvise( return ret; } +/* Does this file, inode, or mount want synchronous writes? */ +static inline bool xfs_file_sync_writes(struct file *filp) +{ + struct xfs_inode *ip = XFS_I(file_inode(filp)); + + if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC) + return true; + if (filp->f_flags & (__O_SYNC | O_DSYNC)) + return true; + if (IS_SYNC(file_inode(filp))) + return true; + + return false; +} + STATIC loff_t xfs_file_remap_range( struct file *file_in, @@ -1049,7 +1059,11 @@ xfs_file_remap_range( ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, remap_flags); + if (ret) + goto out_unlock; + if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) + xfs_log_force_inode(dest); out_unlock: xfs_reflink_remap_unlock(file_in, file_out); if (ret) @@ -1253,10 +1267,23 @@ xfs_filemap_pfn_mkwrite( return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } +static void +xfs_filemap_map_pages( + struct vm_fault *vmf, + pgoff_t start_pgoff, + pgoff_t end_pgoff) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + filemap_map_pages(vmf, start_pgoff, end_pgoff); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); +} + static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, .huge_fault = xfs_filemap_huge_fault, - .map_pages = filemap_map_pages, + .map_pages = xfs_filemap_map_pages, .page_mkwrite = xfs_filemap_page_mkwrite, .pfn_mkwrite = xfs_filemap_pfn_mkwrite, }; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 01c0933a4d10..79e8af8f4669 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -146,6 +146,7 @@ xfs_fsmap_owner_from_rmap( dest->fmr_owner = XFS_FMR_OWN_FREE; break; default: + ASSERT(0); return -EFSCORRUPTED; } return 0; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index a1135b86e79f..f1451642ce38 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -289,6 +289,8 @@ xfs_reinit_inode( uint64_t version = inode_peek_iversion(inode); umode_t mode = inode->i_mode; dev_t dev = inode->i_rdev; + kuid_t uid = inode->i_uid; + kgid_t gid = inode->i_gid; error = inode_init_always(mp->m_super, inode); @@ -297,6 +299,8 @@ xfs_reinit_inode( inode_set_iversion_queried(inode, version); inode->i_mode = mode; inode->i_rdev = dev; + inode->i_uid = uid; + inode->i_gid = gid; return error; } @@ -1430,7 +1434,7 @@ xfs_inode_match_id( return 0; if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && - xfs_get_projid(ip) != eofb->eof_prid) + ip->i_d.di_projid != eofb->eof_prid) return 0; return 1; @@ -1454,7 +1458,7 @@ xfs_inode_match_id_union( return 1; if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && - xfs_get_projid(ip) == eofb->eof_prid) + ip->i_d.di_projid == eofb->eof_prid) return 1; return 0; diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 3ebd1b7f49d8..7d940b289db5 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -10,6 +10,7 @@ #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_icreate_item.h" +#include "xfs_log_priv.h" #include "xfs_log.h" kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7a9048c4c2f9..568a9332efd2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -750,6 +750,7 @@ xfs_ialloc( xfs_buf_t **ialloc_context, xfs_inode_t **ipp) { + struct inode *dir = pip ? VFS_I(pip) : NULL; struct xfs_mount *mp = tp->t_mountp; xfs_ino_t ino; xfs_inode_t *ip; @@ -795,26 +796,17 @@ xfs_ialloc( return error; ASSERT(ip != NULL); inode = VFS_I(ip); - - /* - * We always convert v1 inodes to v2 now - we only support filesystems - * with >= v2 inode capability, so there is no reason for ever leaving - * an inode in v1 format. - */ - if (ip->i_d.di_version == 1) - ip->i_d.di_version = 2; - - inode->i_mode = mode; set_nlink(inode, nlink); - ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); - ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); inode->i_rdev = rdev; - xfs_set_projid(ip, prid); + ip->i_d.di_projid = prid; - if (pip && XFS_INHERIT_GID(pip)) { - ip->i_d.di_gid = pip->i_d.di_gid; - if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode)) - inode->i_mode |= S_ISGID; + if (dir && !(dir->i_mode & S_ISGID) && + (mp->m_flags & XFS_MOUNT_GRPID)) { + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + inode->i_mode = mode; + } else { + inode_init_owner(inode, dir, mode); } /* @@ -822,9 +814,8 @@ xfs_ialloc( * ID or one of the supplementary group IDs, the S_ISGID bit is cleared * (and only if the irix_sgid_inherit compatibility variable is set). */ - if ((irix_sgid_inherit) && - (inode->i_mode & S_ISGID) && - (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) + if (irix_sgid_inherit && + (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid)) inode->i_mode &= ~S_ISGID; ip->i_d.di_size = 0; @@ -841,7 +832,7 @@ xfs_ialloc( ip->i_d.di_dmstate = 0; ip->i_d.di_flags = 0; - if (ip->i_d.di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { inode_set_iversion(inode, 1); ip->i_d.di_flags2 = 0; ip->i_d.di_cowextsize = 0; @@ -849,7 +840,6 @@ xfs_ialloc( ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec; } - flags = XFS_ILOG_CORE; switch (mode & S_IFMT) { case S_IFIFO: @@ -902,20 +892,13 @@ xfs_ialloc( ip->i_d.di_flags |= di_flags; } - if (pip && - (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && - pip->i_d.di_version == 3 && - ip->i_d.di_version == 3) { - uint64_t di_flags2 = 0; - + if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) { if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; } if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - di_flags2 |= XFS_DIFLAG2_DAX; - - ip->i_d.di_flags2 |= di_flags2; + ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; } /* FALLTHROUGH */ case S_IFLNK: @@ -1117,7 +1100,6 @@ xfs_bumplink( { xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - ASSERT(ip->i_d.di_version > 1); inc_nlink(VFS_I(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -1153,8 +1135,7 @@ xfs_create( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1304,8 +1285,7 @@ xfs_create_tmpfile( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1418,7 +1398,7 @@ xfs_link( * the tree quota mechanism could be circumvented. */ if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { + tdp->i_d.di_projid != sip->i_d.di_projid)) { error = -EXDEV; goto error_return; } @@ -1513,10 +1493,8 @@ xfs_itruncate_extents_flags( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; xfs_fileoff_t first_unmap_block; - xfs_fileoff_t last_block; xfs_filblks_t unmap_len; int error = 0; - int done = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!atomic_read(&VFS_I(ip)->i_count) || @@ -1536,21 +1514,22 @@ xfs_itruncate_extents_flags( * the end of the file (in a crash where the space is allocated * but the inode size is not yet updated), simply remove any * blocks which show up between the new EOF and the maximum - * possible file size. If the first block to be removed is - * beyond the maximum file size (ie it is the same as last_block), - * then there is nothing to do. + * possible file size. + * + * We have to free all the blocks to the bmbt maximum offset, even if + * the page cache can't scale that far. */ first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - if (first_unmap_block == last_block) + if (first_unmap_block >= XFS_MAX_FILEOFF) { + WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF); return 0; + } - ASSERT(first_unmap_block < last_block); - unmap_len = last_block - first_unmap_block + 1; - while (!done) { + unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; + while (unmap_len > 0) { ASSERT(tp->t_firstblock == NULLFSBLOCK); - error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, - XFS_ITRUNC_MAX_EXTENTS, &done); + error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, + flags, XFS_ITRUNC_MAX_EXTENTS); if (error) goto out; @@ -1570,7 +1549,7 @@ xfs_itruncate_extents_flags( if (whichfork == XFS_DATA_FORK) { /* Remove all pending CoW reservations. */ error = xfs_reflink_cancel_cow_blocks(ip, &tp, - first_unmap_block, last_block, true); + first_unmap_block, XFS_MAX_FILEOFF, true); if (error) goto out; @@ -2149,8 +2128,10 @@ xfs_iunlink_update_bucket( * passed in because either we're adding or removing ourselves from the * head of the list. */ - if (old_value == new_agino) + if (old_value == new_agino) { + xfs_buf_mark_corrupt(agibp); return -EFSCORRUPTED; + } agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); offset = offsetof(struct xfs_agi, agi_unlinked) + @@ -2213,6 +2194,8 @@ xfs_iunlink_update_inode( /* Make sure the old pointer isn't garbage. */ old_value = be32_to_cpu(dip->di_next_unlinked); if (!xfs_verify_agino_or_null(mp, agno, old_value)) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), __this_address); error = -EFSCORRUPTED; goto out; } @@ -2224,8 +2207,11 @@ xfs_iunlink_update_inode( */ *old_next_agino = old_value; if (old_value == next_agino) { - if (next_agino != NULLAGINO) + if (next_agino != NULLAGINO) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, + dip, sizeof(*dip), __this_address); error = -EFSCORRUPTED; + } goto out; } @@ -2276,8 +2262,10 @@ xfs_iunlink( */ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); if (next_agino == agino || - !xfs_verify_agino_or_null(mp, agno, next_agino)) + !xfs_verify_agino_or_null(mp, agno, next_agino)) { + xfs_buf_mark_corrupt(agibp); return -EFSCORRUPTED; + } if (next_agino != NULLAGINO) { struct xfs_perag *pag; @@ -2547,7 +2535,7 @@ xfs_ifree_cluster( xfs_daddr_t blkno; xfs_buf_t *bp; xfs_inode_t *ip; - xfs_inode_log_item_t *iip; + struct xfs_inode_log_item *iip; struct xfs_log_item *lip; struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); @@ -2584,8 +2572,10 @@ xfs_ifree_cluster( mp->m_bsize * igeo->blocks_per_cluster, XBF_UNMAPPED); - if (!bp) + if (!bp) { + xfs_perag_put(pag); return -ENOMEM; + } /* * This buffer may not have been correctly initialised as we @@ -2607,7 +2597,7 @@ xfs_ifree_cluster( */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { if (lip->li_type == XFS_LI_INODE) { - iip = (xfs_inode_log_item_t *)lip; + iip = (struct xfs_inode_log_item *)lip; ASSERT(iip->ili_logged == 1); lip->li_cb = xfs_istale_done; xfs_trans_ail_copy_lsn(mp->m_ail, @@ -3215,6 +3205,7 @@ xfs_rename( struct xfs_trans *tp; struct xfs_inode *wip = NULL; /* whiteout inode */ struct xfs_inode *inodes[__XFS_SORT_INODES]; + int i; int num_inodes = __XFS_SORT_INODES; bool new_parent = (src_dp != target_dp); bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); @@ -3288,7 +3279,7 @@ xfs_rename( * tree quota mechanism would be circumvented. */ if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { + target_dp->i_d.di_projid != src_ip->i_d.di_projid)) { error = -EXDEV; goto out_trans_cancel; } @@ -3327,6 +3318,30 @@ xfs_rename( } /* + * Lock the AGI buffers we need to handle bumping the nlink of the + * whiteout inode off the unlinked list and to handle dropping the + * nlink of the target inode. Per locking order rules, do this in + * increasing AG order and before directory block allocation tries to + * grab AGFs because we grab AGIs before AGFs. + * + * The (vfs) caller must ensure that if src is a directory then + * target_ip is either null or an empty directory. + */ + for (i = 0; i < num_inodes && inodes[i] != NULL; i++) { + if (inodes[i] == wip || + (inodes[i] == target_ip && + (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) { + struct xfs_buf *bp; + xfs_agnumber_t agno; + + agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino); + error = xfs_read_agi(mp, tp, agno, &bp); + if (error) + goto out_trans_cancel; + } + } + + /* * Directory entry creation below may acquire the AGF. Remove * the whiteout from the unlinked list first to preserve correct * AGI/AGF locking order. This dirties the transaction so failures @@ -3796,7 +3811,6 @@ xfs_iflush_int( ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip != NULL && iip->ili_fields != 0); - ASSERT(ip->i_d.di_version > 1); /* set *dip = inode's place in the buffer */ dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -3857,7 +3871,7 @@ xfs_iflush_int( * backwards compatibility with old kernels that predate logging all * inode changes. */ - if (ip->i_d.di_version < 3) + if (!xfs_sb_version_has_v3inode(&mp->m_sb)) ip->i_d.di_flushiter++; /* Check the inline fork data before we write out. */ @@ -3940,3 +3954,22 @@ xfs_irele( trace_xfs_irele(ip, _RET_IP_); iput(VFS_I(ip)); } + +/* + * Ensure all commited transactions touching the inode are written to the log. + */ +int +xfs_log_force_inode( + struct xfs_inode *ip) +{ + xfs_lsn_t lsn = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 558173f95a03..62b963d3b23d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -177,30 +177,11 @@ xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) return ret; } -/* - * Project quota id helpers (previously projid was 16bit only - * and using two 16bit values to hold new 32bit projid was chosen - * to retain compatibility with "old" filesystems). - */ -static inline prid_t -xfs_get_projid(struct xfs_inode *ip) -{ - return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo; -} - -static inline void -xfs_set_projid(struct xfs_inode *ip, - prid_t projid) -{ - ip->i_d.di_projid_hi = (uint16_t) (projid >> 16); - ip->i_d.di_projid_lo = (uint16_t) (projid & 0xffff); -} - static inline prid_t xfs_get_initial_prid(struct xfs_inode *dp) { if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - return xfs_get_projid(dp); + return dp->i_d.di_projid; return XFS_PROJID_DEFAULT; } @@ -441,6 +422,7 @@ int xfs_itruncate_extents_flags(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t, int); void xfs_iext_realloc(xfs_inode_t *, int, int); +int xfs_log_force_inode(struct xfs_inode *ip); void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index bb8f076805b9..83bf96b6cf5d 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -17,6 +17,7 @@ #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_error.h" #include <linux/iversion.h> @@ -124,7 +125,7 @@ xfs_inode_item_size( *nvecs += 2; *nbytes += sizeof(struct xfs_inode_log_format) + - xfs_log_dinode_size(ip->i_d.di_version); + xfs_log_dinode_size(ip->i_mount); xfs_inode_item_data_fork_size(iip, nvecs, nbytes); if (XFS_IFORK_Q(ip)) @@ -304,13 +305,11 @@ xfs_inode_to_log_dinode( struct inode *inode = VFS_I(ip); to->di_magic = XFS_DINODE_MAGIC; - - to->di_version = from->di_version; to->di_format = from->di_format; - to->di_uid = from->di_uid; - to->di_gid = from->di_gid; - to->di_projid_lo = from->di_projid_lo; - to->di_projid_hi = from->di_projid_hi; + to->di_uid = i_uid_read(inode); + to->di_gid = i_gid_read(inode); + to->di_projid_lo = from->di_projid & 0xffff; + to->di_projid_hi = from->di_projid >> 16; memset(to->di_pad, 0, sizeof(to->di_pad)); memset(to->di_pad3, 0, sizeof(to->di_pad3)); @@ -338,7 +337,8 @@ xfs_inode_to_log_dinode( /* log a dummy value to ensure log structure is fully initialised */ to->di_next_unlinked = NULLAGINO; - if (from->di_version == 3) { + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + to->di_version = 3; to->di_changecount = inode_peek_iversion(inode); to->di_crtime.t_sec = from->di_crtime.t_sec; to->di_crtime.t_nsec = from->di_crtime.t_nsec; @@ -350,6 +350,7 @@ xfs_inode_to_log_dinode( uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_flushiter = 0; } else { + to->di_version = 2; to->di_flushiter = from->di_flushiter; } } @@ -369,7 +370,7 @@ xfs_inode_item_format_core( dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE); xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn); - xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version)); + xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_mount)); } /* @@ -394,8 +395,6 @@ xfs_inode_item_format( struct xfs_log_iovec *vecp = NULL; struct xfs_inode_log_format *ilf; - ASSERT(ip->i_d.di_version > 1); - ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT); ilf->ilf_type = XFS_LI_INODE; ilf->ilf_ino = ip->i_ino; @@ -731,29 +730,27 @@ xfs_iflush_done( * holding the lock before removing the inode from the AIL. */ if (need_ail) { - bool mlip_changed = false; + xfs_lsn_t tail_lsn = 0; /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->ail_lock); list_for_each_entry(blip, &tmp, li_bio_list) { if (INODE_ITEM(blip)->ili_logged && - blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) - mlip_changed |= xfs_ail_delete_one(ailp, blip); - else { + blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) { + /* + * xfs_ail_update_finish() only cares about the + * lsn of the first tail item removed, any + * others will be at the same or higher lsn so + * we just ignore them. + */ + xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip); + if (!tail_lsn && lsn) + tail_lsn = lsn; + } else { xfs_clear_li_failed(blip); } } - - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) - xlog_assign_tail_lsn_locked(ailp->ail_mount); - if (list_empty(&ailp->ail_head)) - wake_up_all(&ailp->ail_empty); - } - spin_unlock(&ailp->ail_lock); - - if (mlip_changed) - xfs_log_space_wake(ailp->ail_mount); + xfs_ail_update_finish(ailp, tail_lsn); } /* @@ -782,7 +779,7 @@ xfs_iflush_abort( xfs_inode_t *ip, bool stale) { - xfs_inode_log_item_t *iip = ip->i_itemp; + struct xfs_inode_log_item *iip = ip->i_itemp; if (iip) { if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) { @@ -828,8 +825,10 @@ xfs_inode_item_format_convert( { struct xfs_inode_log_format_32 *in_f32 = buf->i_addr; - if (buf->i_len != sizeof(*in_f32)) + if (buf->i_len != sizeof(*in_f32)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; + } in_f->ilf_type = in_f32->ilf_type; in_f->ilf_size = in_f32->ilf_size; diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 07a60e74c39c..ad667fd4ae62 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -13,7 +13,7 @@ struct xfs_bmbt_rec; struct xfs_inode; struct xfs_mount; -typedef struct xfs_inode_log_item { +struct xfs_inode_log_item { struct xfs_log_item ili_item; /* common portion */ struct xfs_inode *ili_inode; /* inode ptr */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ @@ -23,7 +23,7 @@ typedef struct xfs_inode_log_item { unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ unsigned int ili_fsync_fields; /* logged since last fsync */ -} xfs_inode_log_item_t; +}; static inline int xfs_inode_clean(xfs_inode_t *ip) { diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 7b7a009425e2..e7356e527260 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1144,7 +1144,7 @@ xfs_fill_fsxattr( fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; fa->fsx_cowextsize = ip->i_d.di_cowextsize << ip->i_mount->m_sb.sb_blocklog; - fa->fsx_projid = xfs_get_projid(ip); + fa->fsx_projid = ip->i_d.di_projid; if (attr) { if (ip->i_afp) { @@ -1299,7 +1299,7 @@ xfs_ioctl_setattr_xflags( /* diflags2 only valid for v3 inodes. */ di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); - if (di_flags2 && ip->i_d.di_version < 3) + if (di_flags2 && !xfs_sb_version_has_v3inode(&mp->m_sb)) return -EINVAL; ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); @@ -1510,8 +1510,7 @@ xfs_ioctl_setattr_check_cowextsize( if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)) return 0; - if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) || - ip->i_d.di_version != 3) + if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb)) return -EINVAL; if (fa->fsx_cowextsize == 0) @@ -1572,9 +1571,9 @@ xfs_ioctl_setattr( * because the i_*dquot fields will get updated anyway. */ if (XFS_IS_QUOTA_ON(mp)) { - code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, - ip->i_d.di_gid, fa->fsx_projid, - XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); + code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid, + VFS_I(ip)->i_gid, fa->fsx_projid, + XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); if (code) return code; } @@ -1597,7 +1596,7 @@ xfs_ioctl_setattr( } if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && - xfs_get_projid(ip) != fa->fsx_projid) { + ip->i_d.di_projid != fa->fsx_projid) { code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (code) /* out of quota */ @@ -1634,13 +1633,12 @@ xfs_ioctl_setattr( VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID); /* Change the ownerships and register project quota modifications */ - if (xfs_get_projid(ip) != fa->fsx_projid) { + if (ip->i_d.di_projid != fa->fsx_projid) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_pdquot, pdqp); } - ASSERT(ip->i_d.di_version > 1); - xfs_set_projid(ip, fa->fsx_projid); + ip->i_d.di_projid = fa->fsx_projid; } /* @@ -1652,7 +1650,7 @@ xfs_ioctl_setattr( ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; else ip->i_d.di_extsize = 0; - if (ip->i_d.di_version == 3 && + if (xfs_sb_version_has_v3inode(&mp->m_sb) && (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) ip->i_d.di_cowextsize = fa->fsx_cowextsize >> mp->m_sb.sb_blocklog; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 239c9548b156..70880422057d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -765,6 +765,11 @@ xfs_iomap_write_unwritten( */ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + /* Attach dquots so that bmbt splits are accounted correctly. */ + error = xfs_qm_dqattach(ip); + if (error) + return error; + do { /* * Set up a transaction to convert the range of extents @@ -783,6 +788,11 @@ xfs_iomap_write_unwritten( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, + XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES); + if (error) + goto error_on_bmapi_transaction; + /* * Modify the unwritten extent state of the buffer. */ @@ -1055,6 +1065,13 @@ xfs_file_iomap_begin( trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); out_finish: + /* + * Writes that span EOF might trigger an IO size update on completion, + * so consider them to be dirty for the purposes of O_DSYNC even if + * there is no other metadata changes pending or have been made here. + */ + if ((flags & IOMAP_WRITE) && offset + length > i_size_read(inode)) + iomap->flags |= IOMAP_F_DIRTY; return xfs_bmbt_to_iomap(ip, iomap, &imap, shared); out_found: diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ca8c763902b9..a7efc8896e5e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -20,6 +20,7 @@ #include "xfs_symlink.h" #include "xfs_dir2.h" #include "xfs_iomap.h" +#include "xfs_error.h" #include <linux/xattr.h> #include <linux/posix_acl.h> @@ -470,17 +471,20 @@ xfs_vn_get_link_inline( struct inode *inode, struct delayed_call *done) { + struct xfs_inode *ip = XFS_I(inode); char *link; - ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE); + ASSERT(ip->i_df.if_flags & XFS_IFINLINE); /* * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if * if_data is junk. */ - link = XFS_I(inode)->i_df.if_u1.if_data; - if (!link) + link = ip->i_df.if_u1.if_data; + if (!link) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, ip->i_mount); return ERR_PTR(-EFSCORRUPTED); + } return link; } @@ -513,7 +517,7 @@ xfs_vn_getattr( stat->blocks = XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); - if (ip->i_d.di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ip->i_d.di_crtime.t_sec; @@ -662,9 +666,7 @@ xfs_setattr_nonsize( */ ASSERT(udqp == NULL); ASSERT(gdqp == NULL); - error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid), - xfs_kgid_to_gid(gid), - xfs_get_projid(ip), + error = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid, qflags, &udqp, &gdqp, NULL); if (error) return error; @@ -733,7 +735,6 @@ xfs_setattr_nonsize( olddquot1 = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } - ip->i_d.di_uid = xfs_kuid_to_uid(uid); inode->i_uid = uid; } if (!gid_eq(igid, gid)) { @@ -745,7 +746,6 @@ xfs_setattr_nonsize( olddquot2 = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - ip->i_d.di_gid = xfs_kgid_to_gid(gid); inode->i_gid = gid; } } @@ -1284,9 +1284,6 @@ xfs_setup_inode( /* make the inode look hashed for the writeback code */ inode_fake_hash(inode); - inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); - inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); - i_size_write(inode, ip->i_d.di_size); xfs_diflags_to_iflags(inode, ip); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 884950adbd16..42e93779374c 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -84,10 +84,10 @@ xfs_bulkstat_one_int( /* xfs_iget returns the following without needing * further change. */ - buf->bs_projectid = xfs_get_projid(ip); + buf->bs_projectid = ip->i_d.di_projid; buf->bs_ino = ino; - buf->bs_uid = dic->di_uid; - buf->bs_gid = dic->di_gid; + buf->bs_uid = i_uid_read(inode); + buf->bs_gid = i_gid_read(inode); buf->bs_size = dic->di_size; buf->bs_nlink = inode->i_nlink; @@ -110,7 +110,7 @@ xfs_bulkstat_one_int( buf->bs_forkoff = XFS_IFORK_BOFF(ip); buf->bs_version = XFS_BULKSTAT_VERSION_V5; - if (dic->di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE) buf->bs_cowextsize_blks = dic->di_cowextsize; } diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index aa375cf53021..cc5c0c835884 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -55,6 +55,9 @@ struct xfs_iwalk_ag { /* Where do we start the traversal? */ xfs_ino_t startino; + /* What was the last inode number we saw when iterating the inobt? */ + xfs_ino_t lastino; + /* Array of inobt records we cache. */ struct xfs_inobt_rec_incore *recs; @@ -300,6 +303,9 @@ xfs_iwalk_ag_start( return error; XFS_WANT_CORRUPTED_RETURN(mp, *has_more == 1); + iwag->lastino = XFS_AGINO_TO_INO(mp, agno, + irec->ir_startino + XFS_INODES_PER_CHUNK - 1); + /* * If the LE lookup yielded an inobt record before the cursor position, * skip it and see if there's another one after it. @@ -346,15 +352,17 @@ xfs_iwalk_run_callbacks( struct xfs_mount *mp = iwag->mp; struct xfs_trans *tp = iwag->tp; struct xfs_inobt_rec_incore *irec; - xfs_agino_t restart; + xfs_agino_t next_agino; int error; + next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1; + ASSERT(iwag->nr_recs > 0); /* Delete cursor but remember the last record we cached... */ xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0); irec = &iwag->recs[iwag->nr_recs - 1]; - restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1; + ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK); error = xfs_iwalk_ag_recs(iwag); if (error) @@ -371,7 +379,7 @@ xfs_iwalk_run_callbacks( if (error) return error; - return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more); + return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more); } /* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */ @@ -395,6 +403,7 @@ xfs_iwalk_ag( while (!error && has_more) { struct xfs_inobt_rec_incore *irec; + xfs_ino_t rec_fsino; cond_resched(); if (xfs_pwork_want_abort(&iwag->pwork)) @@ -406,6 +415,15 @@ xfs_iwalk_ag( if (error || !has_more) break; + /* Make sure that we always move forward. */ + rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino); + if (iwag->lastino != NULLFSINO && iwag->lastino >= rec_fsino) { + ASSERT(iwag->lastino < rec_fsino); + error = -EFSCORRUPTED; + goto out; + } + iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1; + /* No allocated inodes in this chunk; skip it. */ if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) { error = xfs_btree_increment(cur, 0, &has_more); @@ -534,6 +552,7 @@ xfs_iwalk( .trim_start = 1, .skip_empty = 1, .pwork = XFS_PWORK_SINGLE_THREADED, + .lastino = NULLFSINO, }; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); int error; @@ -622,6 +641,7 @@ xfs_iwalk_threaded( iwag->data = data; iwag->startino = startino; iwag->sz_recs = xfs_iwalk_prefetch(inode_records); + iwag->lastino = NULLFSINO; xfs_pwork_queue(&pctl, &iwag->pwork); startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); if (flags & XFS_INOBT_WALK_SAME_AG) @@ -695,6 +715,7 @@ xfs_inobt_walk( .startino = startino, .sz_recs = xfs_inobt_walk_prefetch(inobt_records), .pwork = XFS_PWORK_SINGLE_THREADED, + .lastino = NULLFSINO, }; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); int error; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index ca15105681ca..4f6f09157f0d 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -163,32 +163,6 @@ struct xstats { extern struct xstats xfsstats; -/* Kernel uid/gid conversion. These are used to convert to/from the on disk - * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally. - * The conversion here is type only, the value will remain the same since we - * are converting to the init_user_ns. The uid is later mapped to a particular - * user namespace value when crossing the kernel/user boundary. - */ -static inline uint32_t xfs_kuid_to_uid(kuid_t uid) -{ - return from_kuid(&init_user_ns, uid); -} - -static inline kuid_t xfs_uid_to_kuid(uint32_t uid) -{ - return make_kuid(&init_user_ns, uid); -} - -static inline uint32_t xfs_kgid_to_gid(kgid_t gid) -{ - return from_kgid(&init_user_ns, gid); -} - -static inline kgid_t xfs_gid_to_kgid(uint32_t gid) -{ - return make_kgid(&init_user_ns, gid); -} - static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev) { return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev)); @@ -243,6 +217,12 @@ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, #endif /* XFS_WARN */ #endif /* DEBUG */ +#define XFS_IS_CORRUPT(mp, expr) \ + (unlikely(expr) ? xfs_corruption_error(#expr, XFS_ERRLEVEL_LOW, (mp), \ + NULL, 0, __FILE__, __LINE__, \ + __this_address), \ + true : false) + #define STATIC static noinline #ifdef CONFIG_XFS_RT diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7b0d9ad8cb1a..03a52b3919b8 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -369,6 +369,25 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) tic->t_res_num++; } +bool +xfs_log_writable( + struct xfs_mount *mp) +{ + /* + * Never write to the log on norecovery mounts, if the block device is + * read-only, or if the filesystem is shutdown. Read-only mounts still + * allow internal writes for log recovery and unmount purposes, so don't + * restrict that case here. + */ + if (mp->m_flags & XFS_MOUNT_NORECOVERY) + return false; + if (xfs_readonly_buftarg(mp->m_log->l_targ)) + return false; + if (XFS_FORCED_SHUTDOWN(mp)) + return false; + return true; +} + /* * Replenish the byte reservation required by moving the grant write head. */ @@ -837,19 +856,6 @@ xfs_log_write_unmount_record( if (error) goto out_err; - /* - * If we think the summary counters are bad, clear the unmount header - * flag in the unmount record so that the summary counters will be - * recalculated during log recovery at next mount. Refer to - * xlog_check_unmount_rec for more details. - */ - if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { - xfs_alert(mp, "%s: will fix summary counters at next mount", - __func__); - flags &= ~XLOG_UNMOUNT_TRANS; - } - /* remove inited flag, and account for space used */ tic->t_flags = 0; tic->t_curr_res -= sizeof(magic); @@ -908,15 +914,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) #endif int error; - /* - * Don't write out unmount record on norecovery mounts or ro devices. - * Or, if we are doing a forced umount (typically because of IO errors). - */ - if (mp->m_flags & XFS_MOUNT_NORECOVERY || - xfs_readonly_buftarg(log->l_targ)) { - ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + if (!xfs_log_writable(mp)) return 0; - } error = xfs_log_force(mp, XFS_LOG_SYNC); ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); @@ -932,6 +931,19 @@ xfs_log_unmount_write(xfs_mount_t *mp) } while (iclog != first_iclog); #endif if (! (XLOG_FORCED_SHUTDOWN(log))) { + /* + * If we think the summary counters are bad, avoid writing the + * unmount record to force log recovery at next mount, after + * which the summary counters will be recalculated. Refer to + * xlog_check_unmount_rec for more details. + */ + if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), + mp, XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + xfs_alert(mp, + "%s: will fix summary counters at next mount", + __func__); + return 0; + } xfs_log_write_unmount_record(mp); } else { /* @@ -1537,14 +1549,14 @@ xlog_commit_record( } /* - * Push on the buffer cache code if we ever use more than 75% of the on-disk - * log space. This code pushes on the lsn which would supposedly free up - * the 25% which we want to leave free. We may need to adopt a policy which - * pushes on an lsn which is further along in the log once we reach the high - * water mark. In this manner, we would be creating a low water mark. + * Compute the LSN that we'd need to push the log tail towards in order to have + * (a) enough on-disk log space to log the number of bytes specified, (b) at + * least 25% of the log space free, and (c) at least 256 blocks free. If the + * log free space already meets all three thresholds, this function returns + * NULLCOMMITLSN. */ -STATIC void -xlog_grant_push_ail( +xfs_lsn_t +xlog_grant_push_threshold( struct xlog *log, int need_bytes) { @@ -1570,7 +1582,7 @@ xlog_grant_push_ail( free_threshold = max(free_threshold, (log->l_logBBsize >> 2)); free_threshold = max(free_threshold, 256); if (free_blocks >= free_threshold) - return; + return NULLCOMMITLSN; xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, &threshold_block); @@ -1590,13 +1602,33 @@ xlog_grant_push_ail( if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) threshold_lsn = last_sync_lsn; + return threshold_lsn; +} + +/* + * Push the tail of the log if we need to do so to maintain the free log space + * thresholds set out by xlog_grant_push_threshold. We may need to adopt a + * policy which pushes on an lsn which is further along in the log once we + * reach the high water mark. In this manner, we would be creating a low water + * mark. + */ +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int need_bytes) +{ + xfs_lsn_t threshold_lsn; + + threshold_lsn = xlog_grant_push_threshold(log, need_bytes); + if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log)) + return; + /* * Get the transaction layer to kick the dirty buffers out to * disk asynchronously. No point in trying to do this if * the filesystem is shutting down. */ - if (!XLOG_FORCED_SHUTDOWN(log)) - xfs_ail_push(log->l_ailp, threshold_lsn); + xfs_ail_push(log->l_ailp, threshold_lsn); } /* diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 84e06805160f..dc9229e7ddaa 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -132,6 +132,7 @@ int xfs_log_reserve(struct xfs_mount *mp, int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); +bool xfs_log_writable(struct xfs_mount *mp); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); @@ -146,4 +147,6 @@ void xfs_log_quiesce(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); bool xfs_log_in_recovery(struct xfs_mount *); +xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); + #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ef652abd112c..ae9b8efcfa54 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -671,6 +671,12 @@ xlog_cil_push( ASSERT(push_seq <= ctx->sequence); /* + * Wake up any background push waiters now this context is being pushed. + */ + if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) + wake_up_all(&cil->xc_push_wait); + + /* * Check if we've anything to push. If there is nothing, then we don't * move on to a new sequence number and so we have to be able to push * this sequence again later. @@ -740,7 +746,7 @@ xlog_cil_push( /* * initialise the new context and attach it to the CIL. Then attach - * the current context to the CIL committing lsit so it can be found + * the current context to the CIL committing list so it can be found * during log forces to extract the commit lsn of the sequence that * needs to be forced. */ @@ -900,7 +906,7 @@ xlog_cil_push_work( */ static void xlog_cil_push_background( - struct xlog *log) + struct xlog *log) __releases(cil->xc_ctx_lock) { struct xfs_cil *cil = log->l_cilp; @@ -914,14 +920,36 @@ xlog_cil_push_background( * don't do a background push if we haven't used up all the * space available yet. */ - if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) { + up_read(&cil->xc_ctx_lock); return; + } spin_lock(&cil->xc_push_lock); if (cil->xc_push_seq < cil->xc_current_sequence) { cil->xc_push_seq = cil->xc_current_sequence; queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); } + + /* + * Drop the context lock now, we can't hold that if we need to sleep + * because we are over the blocking threshold. The push_lock is still + * held, so blocking threshold sleep/wakeup is still correctly + * serialised here. + */ + up_read(&cil->xc_ctx_lock); + + /* + * If we are well over the space limit, throttle the work that is being + * done until the push work on this context has begun. + */ + if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) { + trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket); + ASSERT(cil->xc_ctx->space_used < log->l_logsize); + xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock); + return; + } + spin_unlock(&cil->xc_push_lock); } @@ -1038,9 +1066,9 @@ xfs_log_commit_cil( if (lip->li_ops->iop_committing) lip->li_ops->iop_committing(lip, xc_commit_lsn); } - xlog_cil_push_background(log); - up_read(&cil->xc_ctx_lock); + /* xlog_cil_push_background() releases cil->xc_ctx_lock */ + xlog_cil_push_background(log); } /* @@ -1150,21 +1178,19 @@ out_shutdown: */ bool xfs_log_item_in_current_chkpt( - struct xfs_log_item *lip) + struct xfs_log_item *lip) { - struct xfs_cil_ctx *ctx; + struct xfs_cil *cil = lip->li_mountp->m_log->l_cilp; if (list_empty(&lip->li_cil)) return false; - ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; - /* * li_seq is written on the first commit of a log item to record the * first checkpoint it is written to. Hence if it is different to the * current sequence, we're in a new checkpoint. */ - if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) + if (XFS_LSN_CMP(lip->li_seq, READ_ONCE(cil->xc_current_sequence)) != 0) return false; return true; } @@ -1194,6 +1220,7 @@ xlog_cil_init( INIT_LIST_HEAD(&cil->xc_committing); spin_lock_init(&cil->xc_cil_lock); spin_lock_init(&cil->xc_push_lock); + init_waitqueue_head(&cil->xc_push_wait); init_rwsem(&cil->xc_ctx_lock); init_waitqueue_head(&cil->xc_commit_wait); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index b880c23cb6e4..3a5d7fb09c43 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -280,6 +280,7 @@ struct xfs_cil { wait_queue_head_t xc_commit_wait; xfs_lsn_t xc_current_sequence; struct work_struct xc_push_work; + wait_queue_head_t xc_push_wait; /* background push throttle */ } ____cacheline_aligned_in_smp; /* @@ -323,13 +324,53 @@ struct xfs_cil { * tries to keep 25% of the log free, so we need to keep below that limit or we * risk running out of free log space to start any new transactions. * - * In order to keep background CIL push efficient, we will set a lower - * threshold at which background pushing is attempted without blocking current - * transaction commits. A separate, higher bound defines when CIL pushes are - * enforced to ensure we stay within our maximum checkpoint size bounds. - * threshold, yet give us plenty of space for aggregation on large logs. + * In order to keep background CIL push efficient, we only need to ensure the + * CIL is large enough to maintain sufficient in-memory relogging to avoid + * repeated physical writes of frequently modified metadata. If we allow the CIL + * to grow to a substantial fraction of the log, then we may be pinning hundreds + * of megabytes of metadata in memory until the CIL flushes. This can cause + * issues when we are running low on memory - pinned memory cannot be reclaimed, + * and the CIL consumes a lot of memory. Hence we need to set an upper physical + * size limit for the CIL that limits the maximum amount of memory pinned by the + * CIL but does not limit performance by reducing relogging efficiency + * significantly. + * + * As such, the CIL push threshold ends up being the smaller of two thresholds: + * - a threshold large enough that it allows CIL to be pushed and progress to be + * made without excessive blocking of incoming transaction commits. This is + * defined to be 12.5% of the log space - half the 25% push threshold of the + * AIL. + * - small enough that it doesn't pin excessive amounts of memory but maintains + * close to peak relogging efficiency. This is defined to be 16x the iclog + * buffer window (32MB) as measurements have shown this to be roughly the + * point of diminishing performance increases under highly concurrent + * modification workloads. + * + * To prevent the CIL from overflowing upper commit size bounds, we introduce a + * new threshold at which we block committing transactions until the background + * CIL commit commences and switches to a new context. While this is not a hard + * limit, it forces the process committing a transaction to the CIL to block and + * yeild the CPU, giving the CIL push work a chance to be scheduled and start + * work. This prevents a process running lots of transactions from overfilling + * the CIL because it is not yielding the CPU. We set the blocking limit at + * twice the background push space threshold so we keep in line with the AIL + * push thresholds. + * + * Note: this is not a -hard- limit as blocking is applied after the transaction + * is inserted into the CIL and the push has been triggered. It is largely a + * throttling mechanism that allows the CIL push to be scheduled and run. A hard + * limit will be difficult to implement without introducing global serialisation + * in the CIL commit fast path, and it's not at all clear that we actually need + * such hard limits given the ~7 years we've run without a hard limit before + * finding the first situation where a checkpoint size overflow actually + * occurred. Hence the simple throttle, and an ASSERT check to tell us that + * we've overrun the max size. */ -#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) +#define XLOG_CIL_SPACE_LIMIT(log) \ + min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4) + +#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \ + (XLOG_CIL_SPACE_LIMIT(log) * 2) /* * ticket grant locks, queues and accounting have their own cachlines diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index c1a514ffff55..d9b906d75dfa 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -471,7 +471,7 @@ xlog_find_verify_log_record( xfs_warn(log->l_mp, "Log inconsistent (didn't find previous header)"); ASSERT(0); - error = -EIO; + error = -EFSCORRUPTED; goto out; } @@ -1347,10 +1347,11 @@ xlog_find_tail( error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer, &rhead_blk, &rhead, &wrapped); if (error < 0) - return error; + goto done; if (!error) { xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); - return -EIO; + error = -EFSCORRUPTED; + goto done; } *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); @@ -2205,6 +2206,8 @@ xlog_recover_get_buf_lsn( case XFS_ABTC_MAGIC: case XFS_RMAP_CRC_MAGIC: case XFS_REFC_CRC_MAGIC: + case XFS_FIBT_CRC_MAGIC: + case XFS_FIBT_MAGIC: case XFS_IBT_CRC_MAGIC: case XFS_IBT_MAGIC: { struct xfs_btree_block *btb = blk; @@ -2576,6 +2579,7 @@ xlog_recover_do_reg_buffer( int bit; int nbits; xfs_failaddr_t fa; + const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot); trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); @@ -2618,7 +2622,7 @@ xlog_recover_do_reg_buffer( "XFS: NULL dquot in %s.", __func__); goto next; } - if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { + if (item->ri_buf[i].i_len < size_disk_dquot) { xfs_alert(mp, "XFS: dquot too small (%d) in %s.", item->ri_buf[i].i_len, __func__); @@ -2779,6 +2783,16 @@ xlog_recover_buffer_pass2( if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { trace_xfs_log_recover_buf_skip(log, buf_f); xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); + + /* + * We're skipping replay of this buffer log item due to the log + * item LSN being behind the ondisk buffer. Verify the buffer + * contents since we aren't going to run the write verifier. + */ + if (bp->b_ops) { + bp->b_ops->verify_read(bp); + error = bp->b_error; + } goto out_release; } @@ -2875,8 +2889,8 @@ xfs_recover_inode_owner_change( return -ENOMEM; /* instantiate the inode */ + ASSERT(dip->di_version >= 3); xfs_inode_from_disk(ip, dip); - ASSERT(ip->i_d.di_version >= 3); error = xfs_iformat_fork(ip, dip); if (error) @@ -3014,7 +3028,7 @@ xlog_recover_inode_pass2( * superblock flag to determine whether we need to look at di_flushiter * to skip replay when the on disk inode is newer than the log one */ - if (!xfs_sb_version_hascrc(&mp->m_sb) && + if (!xfs_sb_version_has_v3inode(&mp->m_sb) && ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less @@ -3085,7 +3099,7 @@ xlog_recover_inode_pass2( error = -EFSCORRUPTED; goto out_release; } - isize = xfs_log_dinode_size(ldip->di_version); + isize = xfs_log_dinode_size(mp); if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, ldip, @@ -3166,7 +3180,7 @@ xlog_recover_inode_pass2( default: xfs_warn(log->l_mp, "%s: Invalid flag", __func__); ASSERT(0); - error = -EIO; + error = -EFSCORRUPTED; goto out_release; } } @@ -3247,12 +3261,12 @@ xlog_recover_dquot_pass2( recddq = item->ri_buf[1].i_addr; if (recddq == NULL) { xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); - return -EIO; + return -EFSCORRUPTED; } - if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) { xfs_alert(log->l_mp, "dquot too small (%d) in %s.", item->ri_buf[1].i_len, __func__); - return -EIO; + return -EFSCORRUPTED; } /* @@ -3279,7 +3293,7 @@ xlog_recover_dquot_pass2( if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", dq_f->qlf_id, fa); - return -EIO; + return -EFSCORRUPTED; } ASSERT(dq_f->qlf_len == 1); @@ -3382,7 +3396,7 @@ xlog_recover_efd_pass2( struct xlog_recover_item *item) { xfs_efd_log_format_t *efd_formatp; - xfs_efi_log_item_t *efip = NULL; + struct xfs_efi_log_item *efip = NULL; struct xfs_log_item *lip; uint64_t efi_id; struct xfs_ail_cursor cur; @@ -3403,7 +3417,7 @@ xlog_recover_efd_pass2( lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { if (lip->li_type == XFS_LI_EFI) { - efip = (xfs_efi_log_item_t *)lip; + efip = (struct xfs_efi_log_item *)lip; if (efip->efi_format.efi_id == efi_id) { /* * Drop the EFD reference to the EFI. This @@ -3537,6 +3551,7 @@ xfs_cui_copy_format( memcpy(dst_cui_fmt, src_cui_fmt, len); return 0; } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; } @@ -3601,8 +3616,10 @@ xlog_recover_cud_pass2( struct xfs_ail *ailp = log->l_ailp; cud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) + if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; + } cui_id = cud_formatp->cud_cui_id; /* @@ -3654,6 +3671,7 @@ xfs_bui_copy_format( memcpy(dst_bui_fmt, src_bui_fmt, len); return 0; } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; } @@ -3677,8 +3695,10 @@ xlog_recover_bui_pass2( bui_formatp = item->ri_buf[0].i_addr; - if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) + if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; + } buip = xfs_bui_init(mp); error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); if (error) { @@ -3720,8 +3740,10 @@ xlog_recover_bud_pass2( struct xfs_ail *ailp = log->l_ailp; bud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) + if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; + } bui_id = bud_formatp->bud_bui_id; /* @@ -4018,7 +4040,7 @@ xlog_recover_commit_pass1( xfs_warn(log->l_mp, "%s: invalid item type (%d)", __func__, ITEM_TYPE(item)); ASSERT(0); - return -EIO; + return -EFSCORRUPTED; } } @@ -4066,7 +4088,7 @@ xlog_recover_commit_pass2( xfs_warn(log->l_mp, "%s: invalid item type (%d)", __func__, ITEM_TYPE(item)); ASSERT(0); - return -EIO; + return -EFSCORRUPTED; } } @@ -4187,7 +4209,7 @@ xlog_recover_add_to_cont_trans( ASSERT(len <= sizeof(struct xfs_trans_header)); if (len > sizeof(struct xfs_trans_header)) { xfs_warn(log->l_mp, "%s: bad header length", __func__); - return -EIO; + return -EFSCORRUPTED; } xlog_recover_add_item(&trans->r_itemq); @@ -4243,13 +4265,13 @@ xlog_recover_add_to_trans( xfs_warn(log->l_mp, "%s: bad header magic number", __func__); ASSERT(0); - return -EIO; + return -EFSCORRUPTED; } if (len > sizeof(struct xfs_trans_header)) { xfs_warn(log->l_mp, "%s: bad header length", __func__); ASSERT(0); - return -EIO; + return -EFSCORRUPTED; } /* @@ -4285,7 +4307,7 @@ xlog_recover_add_to_trans( in_f->ilf_size); ASSERT(0); kmem_free(ptr); - return -EIO; + return -EFSCORRUPTED; } item->ri_total = in_f->ilf_size; @@ -4293,7 +4315,16 @@ xlog_recover_add_to_trans( kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), 0); } - ASSERT(item->ri_total > item->ri_cnt); + + if (item->ri_total <= item->ri_cnt) { + xfs_warn(log->l_mp, + "log item region count (%d) overflowed size (%d)", + item->ri_cnt, item->ri_total); + ASSERT(0); + kmem_free(ptr); + return -EFSCORRUPTED; + } + /* Description region is ri_buf[0] */ item->ri_buf[item->ri_cnt].i_addr = ptr; item->ri_buf[item->ri_cnt].i_len = len; @@ -4380,7 +4411,7 @@ xlog_recovery_process_trans( default: xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags); ASSERT(0); - error = -EIO; + error = -EFSCORRUPTED; break; } if (error || freeit) @@ -4460,7 +4491,7 @@ xlog_recover_process_ophdr( xfs_warn(log->l_mp, "%s: bad clientid 0x%x", __func__, ohead->oh_clientid); ASSERT(0); - return -EIO; + return -EFSCORRUPTED; } /* @@ -4470,7 +4501,7 @@ xlog_recover_process_ophdr( if (dp + len > end) { xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len); WARN_ON(1); - return -EIO; + return -EFSCORRUPTED; } trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead); @@ -4566,9 +4597,9 @@ xlog_recover_process_data( /* Recover the EFI if necessary. */ STATIC int xlog_recover_process_efi( - struct xfs_mount *mp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct list_head *capture_list) { struct xfs_efi_log_item *efip; int error; @@ -4581,7 +4612,7 @@ xlog_recover_process_efi( return 0; spin_unlock(&ailp->ail_lock); - error = xfs_efi_recover(mp, efip); + error = xfs_efi_recover(efip, capture_list); spin_lock(&ailp->ail_lock); return error; @@ -4606,9 +4637,9 @@ xlog_recover_cancel_efi( /* Recover the RUI if necessary. */ STATIC int xlog_recover_process_rui( - struct xfs_mount *mp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct list_head *capture_list) { struct xfs_rui_log_item *ruip; int error; @@ -4621,7 +4652,7 @@ xlog_recover_process_rui( return 0; spin_unlock(&ailp->ail_lock); - error = xfs_rui_recover(mp, ruip); + error = xfs_rui_recover(ruip, capture_list); spin_lock(&ailp->ail_lock); return error; @@ -4646,9 +4677,9 @@ xlog_recover_cancel_rui( /* Recover the CUI if necessary. */ STATIC int xlog_recover_process_cui( - struct xfs_trans *parent_tp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct list_head *capture_list) { struct xfs_cui_log_item *cuip; int error; @@ -4661,7 +4692,7 @@ xlog_recover_process_cui( return 0; spin_unlock(&ailp->ail_lock); - error = xfs_cui_recover(parent_tp, cuip); + error = xfs_cui_recover(cuip, capture_list); spin_lock(&ailp->ail_lock); return error; @@ -4686,9 +4717,9 @@ xlog_recover_cancel_cui( /* Recover the BUI if necessary. */ STATIC int xlog_recover_process_bui( - struct xfs_trans *parent_tp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct list_head *capture_list) { struct xfs_bui_log_item *buip; int error; @@ -4701,7 +4732,7 @@ xlog_recover_process_bui( return 0; spin_unlock(&ailp->ail_lock); - error = xfs_bui_recover(parent_tp, buip); + error = xfs_bui_recover(buip, capture_list); spin_lock(&ailp->ail_lock); return error; @@ -4740,37 +4771,66 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip) /* Take all the collected deferred ops and finish them in order. */ static int xlog_finish_defer_ops( - struct xfs_trans *parent_tp) + struct xfs_mount *mp, + struct list_head *capture_list) { - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_defer_capture *dfc, *next; struct xfs_trans *tp; - int64_t freeblks; - uint resblks; - int error; + struct xfs_inode *ip; + int error = 0; - /* - * We're finishing the defer_ops that accumulated as a result of - * recovering unfinished intent items during log recovery. We - * reserve an itruncate transaction because it is the largest - * permanent transaction type. Since we're the only user of the fs - * right now, take 93% (15/16) of the available free blocks. Use - * weird math to avoid a 64-bit division. - */ - freeblks = percpu_counter_sum(&mp->m_fdblocks); - if (freeblks <= 0) - return -ENOSPC; - resblks = min_t(int64_t, UINT_MAX, freeblks); - resblks = (resblks * 15) >> 4; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, - 0, XFS_TRANS_RESERVE, &tp); - if (error) - return error; - /* transfer all collected dfops to this transaction */ - xfs_defer_move(tp, parent_tp); + list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { + struct xfs_trans_res resv; + + /* + * Create a new transaction reservation from the captured + * information. Set logcount to 1 to force the new transaction + * to regrant every roll so that we can make forward progress + * in recovery no matter how full the log might be. + */ + resv.tr_logres = dfc->dfc_logres; + resv.tr_logcount = 1; + resv.tr_logflags = XFS_TRANS_PERM_LOG_RES; + + error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, + dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); + if (error) + return error; - return xfs_trans_commit(tp); + /* + * Transfer to this new transaction all the dfops we captured + * from recovering a single intent item. + */ + list_del_init(&dfc->dfc_list); + xfs_defer_ops_continue(dfc, tp, &ip); + + error = xfs_trans_commit(tp); + if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); + } + if (error) + return error; + } + + ASSERT(list_empty(capture_list)); + return 0; } +/* Release all the captured defer ops and capture structures in this list. */ +static void +xlog_abort_defer_ops( + struct xfs_mount *mp, + struct list_head *capture_list) +{ + struct xfs_defer_capture *dfc; + struct xfs_defer_capture *next; + + list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { + list_del_init(&dfc->dfc_list); + xfs_defer_ops_release(mp, dfc); + } +} /* * When this is called, all of the log intent items which did not have * corresponding log done items should be in the AIL. What we do now @@ -4791,35 +4851,23 @@ STATIC int xlog_recover_process_intents( struct xlog *log) { - struct xfs_trans *parent_tp; + LIST_HEAD(capture_list); struct xfs_ail_cursor cur; struct xfs_log_item *lip; struct xfs_ail *ailp; - int error; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; #endif - /* - * The intent recovery handlers commit transactions to complete recovery - * for individual intents, but any new deferred operations that are - * queued during that process are held off until the very end. The - * purpose of this transaction is to serve as a container for deferred - * operations. Each intent recovery handler must transfer dfops here - * before its local transaction commits, and we'll finish the entire - * list below. - */ - error = xfs_trans_alloc_empty(log->l_mp, &parent_tp); - if (error) - return error; - ailp = log->l_ailp; spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); #if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif - while (lip != NULL) { + for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + lip != NULL; + lip = xfs_trans_ail_cursor_next(ailp, &cur)) { /* * We're done when we see something other than an intent. * There should be no intents left in the AIL now. @@ -4841,35 +4889,40 @@ xlog_recover_process_intents( /* * NOTE: If your intent processing routine can create more - * deferred ops, you /must/ attach them to the dfops in this - * routine or else those subsequent intents will get + * deferred ops, you /must/ attach them to the capture list in + * the recover routine or else those subsequent intents will be * replayed in the wrong order! */ switch (lip->li_type) { case XFS_LI_EFI: - error = xlog_recover_process_efi(log->l_mp, ailp, lip); + error = xlog_recover_process_efi(ailp, lip, &capture_list); break; case XFS_LI_RUI: - error = xlog_recover_process_rui(log->l_mp, ailp, lip); + error = xlog_recover_process_rui(ailp, lip, &capture_list); break; case XFS_LI_CUI: - error = xlog_recover_process_cui(parent_tp, ailp, lip); + error = xlog_recover_process_cui(ailp, lip, &capture_list); break; case XFS_LI_BUI: - error = xlog_recover_process_bui(parent_tp, ailp, lip); + error = xlog_recover_process_bui(ailp, lip, &capture_list); break; } if (error) - goto out; - lip = xfs_trans_ail_cursor_next(ailp, &cur); + break; } -out: + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->ail_lock); - if (!error) - error = xlog_finish_defer_ops(parent_tp); - xfs_trans_cancel(parent_tp); + if (error) + goto err; + + error = xlog_finish_defer_ops(log->l_mp, &capture_list); + if (error) + goto err; + return 0; +err: + xlog_abort_defer_ops(log->l_mp, &capture_list); return error; } @@ -5172,8 +5225,10 @@ xlog_recover_process( * If the filesystem is CRC enabled, this mismatch becomes a * fatal log corruption failure. */ - if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) + if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; + } } xlog_unpack_data(rhead, dp, log); @@ -5200,7 +5255,7 @@ xlog_valid_rec_header( (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", __func__, be32_to_cpu(rhead->h_version)); - return -EIO; + return -EFSCORRUPTED; } /* LR body must have data or it wouldn't have been written */ @@ -5296,8 +5351,12 @@ xlog_do_recovery_pass( "invalid iclog size (%d bytes), using lsunit (%d bytes)", h_size, log->l_mp->m_logbsize); h_size = log->l_mp->m_logbsize; - } else - return -EFSCORRUPTED; + } else { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, + log->l_mp); + error = -EFSCORRUPTED; + goto bread_err1; + } } if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 9804efe525a9..c57e8ad39712 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -105,7 +105,7 @@ assfail(char *expr, char *file, int line) } void -xfs_hex_dump(void *p, int length) +xfs_hex_dump(const void *p, int length) { print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1); } diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 34447dca97d1..7f040b04b739 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -60,6 +60,6 @@ do { \ extern void assfail(char *expr, char *f, int l); extern void asswarn(char *expr, char *f, int l); -extern void xfs_hex_dump(void *p, int length); +extern void xfs_hex_dump(const void *p, int length); #endif /* __XFS_MESSAGE_H */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 5a0ce0c2c4bb..2277f21c4f14 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -31,7 +31,7 @@ #include "xfs_reflink.h" #include "xfs_extent_busy.h" #include "xfs_health.h" - +#include "xfs_trace.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -365,66 +365,119 @@ release_buf: } /* - * Update alignment values based on mount options and sb values + * If the sunit/swidth change would move the precomputed root inode value, we + * must reject the ondisk change because repair will stumble over that. + * However, we allow the mount to proceed because we never rejected this + * combination before. Returns true to update the sb, false otherwise. + */ +static inline int +xfs_check_new_dalign( + struct xfs_mount *mp, + int new_dalign, + bool *update_sb) +{ + struct xfs_sb *sbp = &mp->m_sb; + xfs_ino_t calc_ino; + + calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign); + trace_xfs_check_new_dalign(mp, new_dalign, calc_ino); + + if (sbp->sb_rootino == calc_ino) { + *update_sb = true; + return 0; + } + + xfs_warn(mp, +"Cannot change stripe alignment; would require moving root inode."); + + /* + * XXX: Next time we add a new incompat feature, this should start + * returning -EINVAL to fail the mount. Until then, spit out a warning + * that we're ignoring the administrator's instructions. + */ + xfs_warn(mp, "Skipping superblock stripe alignment update."); + *update_sb = false; + return 0; +} + +/* + * If we were provided with new sunit/swidth values as mount options, make sure + * that they pass basic alignment and superblock feature checks, and convert + * them into the same units (FSB) that everything else expects. This step + * /must/ be done before computing the inode geometry. */ STATIC int -xfs_update_alignment(xfs_mount_t *mp) +xfs_validate_new_dalign( + struct xfs_mount *mp) { - xfs_sb_t *sbp = &(mp->m_sb); + if (mp->m_dalign == 0) + return 0; - if (mp->m_dalign) { + /* + * If stripe unit and stripe width are not multiples + * of the fs blocksize turn off alignment. + */ + if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || + (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. blocksize(%d)", + mp->m_sb.sb_blocksize); + return -EINVAL; + } else { /* - * If stripe unit and stripe width are not multiples - * of the fs blocksize turn off alignment. + * Convert the stripe unit and width to FSBs. */ - if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || - (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { xfs_warn(mp, - "alignment check failed: sunit/swidth vs. blocksize(%d)", - sbp->sb_blocksize); + "alignment check failed: sunit/swidth vs. agsize(%d)", + mp->m_sb.sb_agblocks); return -EINVAL; - } else { - /* - * Convert the stripe unit and width to FSBs. - */ - mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); - if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { - xfs_warn(mp, - "alignment check failed: sunit/swidth vs. agsize(%d)", - sbp->sb_agblocks); - return -EINVAL; - } else if (mp->m_dalign) { - mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); - } else { - xfs_warn(mp, - "alignment check failed: sunit(%d) less than bsize(%d)", - mp->m_dalign, sbp->sb_blocksize); - return -EINVAL; - } - } - - /* - * Update superblock with new values - * and log changes - */ - if (xfs_sb_version_hasdalign(sbp)) { - if (sbp->sb_unit != mp->m_dalign) { - sbp->sb_unit = mp->m_dalign; - mp->m_update_sb = true; - } - if (sbp->sb_width != mp->m_swidth) { - sbp->sb_width = mp->m_swidth; - mp->m_update_sb = true; - } + } else if (mp->m_dalign) { + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); } else { xfs_warn(mp, - "cannot change alignment: superblock does not support data alignment"); + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, mp->m_sb.sb_blocksize); return -EINVAL; } + } + + if (!xfs_sb_version_hasdalign(&mp->m_sb)) { + xfs_warn(mp, +"cannot change alignment: superblock does not support data alignment"); + return -EINVAL; + } + + return 0; +} + +/* Update alignment values based on mount options and sb values. */ +STATIC int +xfs_update_alignment( + struct xfs_mount *mp) +{ + struct xfs_sb *sbp = &mp->m_sb; + + if (mp->m_dalign) { + bool update_sb; + int error; + + if (sbp->sb_unit == mp->m_dalign && + sbp->sb_width == mp->m_swidth) + return 0; + + error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb); + if (error || !update_sb) + return error; + + sbp->sb_unit = mp->m_dalign; + sbp->sb_width = mp->m_swidth; + mp->m_update_sb = true; } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && xfs_sb_version_hasdalign(&mp->m_sb)) { - mp->m_dalign = sbp->sb_unit; - mp->m_swidth = sbp->sb_width; + mp->m_dalign = sbp->sb_unit; + mp->m_swidth = sbp->sb_width; } return 0; @@ -622,6 +675,47 @@ xfs_check_summary_counts( } /* + * Flush and reclaim dirty inodes in preparation for unmount. Inodes and + * internal inode structures can be sitting in the CIL and AIL at this point, + * so we need to unpin them, write them back and/or reclaim them before unmount + * can proceed. + * + * An inode cluster that has been freed can have its buffer still pinned in + * memory because the transaction is still sitting in a iclog. The stale inodes + * on that buffer will be pinned to the buffer until the transaction hits the + * disk and the callbacks run. Pushing the AIL will skip the stale inodes and + * may never see the pinned buffer, so nothing will push out the iclog and + * unpin the buffer. + * + * Hence we need to force the log to unpin everything first. However, log + * forces don't wait for the discards they issue to complete, so we have to + * explicitly wait for them to complete here as well. + * + * Then we can tell the world we are unmounting so that error handling knows + * that the filesystem is going away and we should error out anything that we + * have been retrying in the background. This will prevent never-ending + * retries in AIL pushing from hanging the unmount. + * + * Finally, we can push the AIL to clean all the remaining dirty objects, then + * reclaim the remaining inodes that are still in memory at this point in time. + */ +static void +xfs_unmount_flush_inodes( + struct xfs_mount *mp) +{ + xfs_log_force(mp, XFS_LOG_SYNC); + xfs_extent_busy_wait_all(mp); + flush_workqueue(xfs_discard_wq); + + mp->m_flags |= XFS_MOUNT_UNMOUNTING; + + xfs_ail_push_all_sync(mp->m_ail); + cancel_delayed_work_sync(&mp->m_reclaim_work); + xfs_reclaim_inodes(mp, SYNC_WAIT); + xfs_health_unmount(mp); +} + +/* * This function does the following on an initial mount of a file system: * - reads the superblock from disk and init the mount struct * - if we're a 32-bit kernel, do a size check on the superblock @@ -692,12 +786,12 @@ xfs_mountfs( } /* - * Check if sb_agblocks is aligned at stripe boundary - * If sb_agblocks is NOT aligned turn off m_dalign since - * allocator alignment is within an ag, therefore ag has - * to be aligned at stripe boundary. + * If we were given new sunit/swidth options, do some basic validation + * checks and convert the incore dalign and swidth values to the + * same units (FSB) that everything else uses. This /must/ happen + * before computing the inode geometry. */ - error = xfs_update_alignment(mp); + error = xfs_validate_new_dalign(mp); if (error) goto out; @@ -708,6 +802,17 @@ xfs_mountfs( xfs_rmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); + /* + * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks + * is NOT aligned turn off m_dalign since allocator alignment is within + * an ag, therefore ag has to be aligned at stripe boundary. Note that + * we must compute the free space and rmap btree geometry before doing + * this. + */ + error = xfs_update_alignment(mp); + if (error) + goto out; + /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; @@ -983,7 +1088,7 @@ xfs_mountfs( /* Clean out dquots that might be in memory after quotacheck. */ xfs_qm_unmount(mp); /* - * Cancel all delayed reclaim work and reclaim the inodes directly. + * Flush all inode reclamation work and flush the log. * We have to do this /after/ rtunmount and qm_unmount because those * two will have scheduled delayed reclaim for the rt/quota inodes. * @@ -993,11 +1098,8 @@ xfs_mountfs( * qm_unmount_quotas and therefore rely on qm_unmount to release the * quota inodes. */ - cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp, SYNC_WAIT); - xfs_health_unmount(mp); + xfs_unmount_flush_inodes(mp); out_log_dealloc: - mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) @@ -1038,47 +1140,7 @@ xfs_unmountfs( xfs_rtunmount_inodes(mp); xfs_irele(mp->m_rootip); - /* - * We can potentially deadlock here if we have an inode cluster - * that has been freed has its buffer still pinned in memory because - * the transaction is still sitting in a iclog. The stale inodes - * on that buffer will have their flush locks held until the - * transaction hits the disk and the callbacks run. the inode - * flush takes the flush lock unconditionally and with nothing to - * push out the iclog we will never get that unlocked. hence we - * need to force the log first. - */ - xfs_log_force(mp, XFS_LOG_SYNC); - - /* - * Wait for all busy extents to be freed, including completion of - * any discard operation. - */ - xfs_extent_busy_wait_all(mp); - flush_workqueue(xfs_discard_wq); - - /* - * We now need to tell the world we are unmounting. This will allow - * us to detect that the filesystem is going away and we should error - * out anything that we have been retrying in the background. This will - * prevent neverending retries in AIL pushing from hanging the unmount. - */ - mp->m_flags |= XFS_MOUNT_UNMOUNTING; - - /* - * Flush all pending changes from the AIL. - */ - xfs_ail_push_all_sync(mp->m_ail); - - /* - * And reclaim all inodes. At this point there should be no dirty - * inodes and none should be pinned or locked, but use synchronous - * reclaim just to be sure. We can stop background inode reclaim - * here as well if it is still running. - */ - cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp, SYNC_WAIT); - xfs_health_unmount(mp); + xfs_unmount_flush_inodes(mp); xfs_qm_unmount(mp); @@ -1154,8 +1216,7 @@ xfs_fs_writable( int xfs_log_sbcount(xfs_mount_t *mp) { - /* allow this to proceed during the freeze sequence... */ - if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) + if (!xfs_log_writable(mp)) return 0; /* diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index fdb60e09a9c5..ca7e0c656cee 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -179,6 +179,11 @@ typedef struct xfs_mount { struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ + /* + * Workqueue item so that we can coalesce multiple inode flush attempts + * into a single flush. + */ + struct work_struct m_flush_inodes_work; struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index f63fe8d924a3..058af699e046 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -147,11 +147,11 @@ xfs_fs_map_blocks( if (error) goto out_unlock; + ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK); + if (write) { enum xfs_prealloc_flags flags = 0; - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) { /* * xfs_iomap_write_direct() expects to take ownership of diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index ecd8ce152ab1..6b108a4de08f 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -22,6 +22,7 @@ #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_error.h" /* * The global quota manager. There is only one of these for the entire @@ -120,12 +121,11 @@ xfs_qm_dqpurge( { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; + int error = -EAGAIN; xfs_dqlock(dqp); - if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { - xfs_dqunlock(dqp); - return -EAGAIN; - } + if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) + goto out_unlock; dqp->dq_flags |= XFS_DQ_FREEING; @@ -138,7 +138,6 @@ xfs_qm_dqpurge( */ if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; - int error; /* * We don't care about getting disk errors here. We need @@ -148,6 +147,9 @@ xfs_qm_dqpurge( if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); + } else if (error == -EAGAIN) { + dqp->dq_flags &= ~XFS_DQ_FREEING; + goto out_unlock; } xfs_dqflock(dqp); } @@ -173,6 +175,10 @@ xfs_qm_dqpurge( xfs_qm_dqdestroy(dqp); return 0; + +out_unlock: + xfs_dqunlock(dqp); + return error; } /* @@ -243,14 +249,14 @@ xfs_qm_unmount_quotas( STATIC int xfs_qm_dqattach_one( - xfs_inode_t *ip, - xfs_dqid_t id, - uint type, - bool doalloc, - xfs_dquot_t **IO_idqpp) + struct xfs_inode *ip, + xfs_dqid_t id, + uint type, + bool doalloc, + struct xfs_dquot **IO_idqpp) { - xfs_dquot_t *dqp; - int error; + struct xfs_dquot *dqp; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); error = 0; @@ -325,23 +331,23 @@ xfs_qm_dqattach_locked( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, - doalloc, &ip->i_udquot); + error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)), + XFS_DQ_USER, doalloc, &ip->i_udquot); if (error) goto done; ASSERT(ip->i_udquot); } if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, - doalloc, &ip->i_gdquot); + error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)), + XFS_DQ_GROUP, doalloc, &ip->i_gdquot); if (error) goto done; ASSERT(ip->i_gdquot); } if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { - error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, + error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, doalloc, &ip->i_pdquot); if (error) goto done; @@ -543,8 +549,8 @@ xfs_qm_set_defquota( uint type, xfs_quotainfo_t *qinf) { - xfs_dquot_t *dqp; - struct xfs_def_quota *defq; + struct xfs_dquot *dqp; + struct xfs_def_quota *defq; struct xfs_disk_dquot *ddqp; int error; @@ -754,11 +760,19 @@ xfs_qm_qino_alloc( if ((flags & XFS_QMOPT_PQUOTA) && (mp->m_sb.sb_gquotino != NULLFSINO)) { ino = mp->m_sb.sb_gquotino; - ASSERT(mp->m_sb.sb_pquotino == NULLFSINO); + if (mp->m_sb.sb_pquotino != NULLFSINO) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, + mp); + return -EFSCORRUPTED; + } } else if ((flags & XFS_QMOPT_GQUOTA) && (mp->m_sb.sb_pquotino != NULLFSINO)) { ino = mp->m_sb.sb_pquotino; - ASSERT(mp->m_sb.sb_gquotino == NULLFSINO); + if (mp->m_sb.sb_gquotino != NULLFSINO) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, + mp); + return -EFSCORRUPTED; + } } if (ino != NULLFSINO) { error = xfs_iget(mp, NULL, ino, 0, 0, ip); @@ -866,12 +880,20 @@ xfs_qm_reset_dqcounts( ddq->d_bcount = 0; ddq->d_icount = 0; ddq->d_rtbcount = 0; - ddq->d_btimer = 0; - ddq->d_itimer = 0; - ddq->d_rtbtimer = 0; - ddq->d_bwarns = 0; - ddq->d_iwarns = 0; - ddq->d_rtbwarns = 0; + + /* + * dquot id 0 stores the default grace period and the maximum + * warning limit that were set by the administrator, so we + * should not reset them. + */ + if (ddq->d_id != 0) { + ddq->d_btimer = 0; + ddq->d_itimer = 0; + ddq->d_rtbtimer = 0; + ddq->d_bwarns = 0; + ddq->d_iwarns = 0; + ddq->d_rtbwarns = 0; + } if (xfs_sb_version_hascrc(&mp->m_sb)) { xfs_update_cksum((char *)&dqb[j], @@ -1608,8 +1630,8 @@ xfs_qm_dqfree_one( int xfs_qm_vop_dqalloc( struct xfs_inode *ip, - xfs_dqid_t uid, - xfs_dqid_t gid, + kuid_t uid, + kgid_t gid, prid_t prid, uint flags, struct xfs_dquot **O_udqpp, @@ -1617,6 +1639,8 @@ xfs_qm_vop_dqalloc( struct xfs_dquot **O_pdqpp) { struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + struct user_namespace *user_ns = inode->i_sb->s_user_ns; struct xfs_dquot *uq = NULL; struct xfs_dquot *gq = NULL; struct xfs_dquot *pq = NULL; @@ -1630,7 +1654,7 @@ xfs_qm_vop_dqalloc( xfs_ilock(ip, lockflags); if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) - gid = ip->i_d.di_gid; + gid = inode->i_gid; /* * Attach the dquot(s) to this inode, doing a dquot allocation @@ -1645,7 +1669,7 @@ xfs_qm_vop_dqalloc( } if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { - if (ip->i_d.di_uid != uid) { + if (!uid_eq(inode->i_uid, uid)) { /* * What we need is the dquot that has this uid, and * if we send the inode to dqget, the uid of the inode @@ -1656,7 +1680,8 @@ xfs_qm_vop_dqalloc( * holding ilock. */ xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq); + error = xfs_qm_dqget(mp, from_kuid(user_ns, uid), + XFS_DQ_USER, true, &uq); if (error) { ASSERT(error != -ENOENT); return error; @@ -1677,9 +1702,10 @@ xfs_qm_vop_dqalloc( } } if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { - if (ip->i_d.di_gid != gid) { + if (!gid_eq(inode->i_gid, gid)) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq); + error = xfs_qm_dqget(mp, from_kgid(user_ns, gid), + XFS_DQ_GROUP, true, &gq); if (error) { ASSERT(error != -ENOENT); goto error_rele; @@ -1693,7 +1719,7 @@ xfs_qm_vop_dqalloc( } } if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { - if (xfs_get_projid(ip) != prid) { + if (ip->i_d.di_projid != prid) { xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ, true, &pq); @@ -1737,14 +1763,14 @@ error_rele: * Actually transfer ownership, and do dquot modifications. * These were already reserved. */ -xfs_dquot_t * +struct xfs_dquot * xfs_qm_vop_chown( - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_dquot_t **IO_olddq, - xfs_dquot_t *newdq) + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_dquot **IO_olddq, + struct xfs_dquot *newdq) { - xfs_dquot_t *prevdq; + struct xfs_dquot *prevdq; uint bfield = XFS_IS_REALTIME_INODE(ip) ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; @@ -1805,7 +1831,7 @@ xfs_qm_vop_chown_reserve( XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; if (XFS_IS_UQUOTA_ON(mp) && udqp && - ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) { + i_uid_read(VFS_I(ip)) != be32_to_cpu(udqp->q_core.d_id)) { udq_delblks = udqp; /* * If there are delayed allocation blocks, then we have to @@ -1818,7 +1844,7 @@ xfs_qm_vop_chown_reserve( } } if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && - ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) { + i_gid_read(VFS_I(ip)) != be32_to_cpu(gdqp->q_core.d_id)) { gdq_delblks = gdqp; if (delblks) { ASSERT(ip->i_gdquot); @@ -1827,7 +1853,7 @@ xfs_qm_vop_chown_reserve( } if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && - xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) { + ip->i_d.di_projid != be32_to_cpu(pdqp->q_core.d_id)) { prjflags = XFS_QMOPT_ENOSPC; pdq_delblks = pdqp; if (delblks) { @@ -1915,20 +1941,21 @@ xfs_qm_vop_create_dqattach( if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); - ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); + ASSERT(i_uid_read(VFS_I(ip)) == be32_to_cpu(udqp->q_core.d_id)); ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); - ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); + ASSERT(i_gid_read(VFS_I(ip)) == be32_to_cpu(gdqp->q_core.d_id)); + ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); - ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id)); + ASSERT(ip->i_d.di_projid == be32_to_cpu(pdqp->q_core.d_id)); ip->i_pdquot = xfs_qm_dqhold(pdqp); xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1); diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 5d72e88598b4..fc2fa418919f 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -54,13 +54,13 @@ xfs_fill_statvfs_from_dquot( */ void xfs_qm_statvfs( - xfs_inode_t *ip, + struct xfs_inode *ip, struct kstatfs *statp) { - xfs_mount_t *mp = ip->i_mount; - xfs_dquot_t *dqp; + struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *dqp; - if (!xfs_qm_dqget(mp, xfs_get_projid(ip), XFS_DQ_PROJ, false, &dqp)) { + if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQ_PROJ, false, &dqp)) { xfs_fill_statvfs_from_dquot(statp, dqp); xfs_qm_dqput(dqp); } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index da7ad0383037..5d5ac65aa1cc 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -19,9 +19,71 @@ #include "xfs_qm.h" #include "xfs_icache.h" -STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); -STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, - uint); +STATIC int +xfs_qm_log_quotaoff( + struct xfs_mount *mp, + struct xfs_qoff_logitem **qoffstartp, + uint flags) +{ + struct xfs_trans *tp; + int error; + struct xfs_qoff_logitem *qoffi; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); + if (error) + goto out; + + qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; + spin_unlock(&mp->m_sb_lock); + + xfs_log_sb(tp); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotoff's performance. + */ + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp); + if (error) + goto out; + + *qoffstartp = qoffi; +out: + return error; +} + +STATIC int +xfs_qm_log_quotaoff_end( + struct xfs_mount *mp, + struct xfs_qoff_logitem **startqoff, + uint flags) +{ + struct xfs_trans *tp; + int error; + struct xfs_qoff_logitem *qoffi; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp); + if (error) + return error; + + qoffi = xfs_trans_get_qoff_item(tp, *startqoff, + flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + *startqoff = NULL; + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotoff's performance. + */ + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp); +} /* * Turn off quota accounting and/or enforcement for all udquots and/or @@ -40,7 +102,7 @@ xfs_qm_scall_quotaoff( uint dqtype; int error; uint inactivate_flags; - xfs_qoff_logitem_t *qoffstart; + struct xfs_qoff_logitem *qoffstart = NULL; /* * No file system can have quotas enabled on disk but not in core. @@ -165,7 +227,7 @@ xfs_qm_scall_quotaoff( * So, we have QUOTAOFF start and end logitems; the start * logitem won't get overwritten until the end logitem appears... */ - error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags); + error = xfs_qm_log_quotaoff_end(mp, &qoffstart, flags); if (error) { /* We're screwed now. Shutdown is the only option. */ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); @@ -198,6 +260,8 @@ xfs_qm_scall_quotaoff( } out_unlock: + if (error && qoffstart) + xfs_qm_qoff_logitem_relse(qoffstart); mutex_unlock(&q->qi_quotaofflock); return error; } @@ -538,74 +602,6 @@ out_unlock: return error; } -STATIC int -xfs_qm_log_quotaoff_end( - xfs_mount_t *mp, - xfs_qoff_logitem_t *startqoff, - uint flags) -{ - xfs_trans_t *tp; - int error; - xfs_qoff_logitem_t *qoffi; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp); - if (error) - return error; - - qoffi = xfs_trans_get_qoff_item(tp, startqoff, - flags & XFS_ALL_QUOTA_ACCT); - xfs_trans_log_quotaoff_item(tp, qoffi); - - /* - * We have to make sure that the transaction is secure on disk before we - * return and actually stop quota accounting. So, make it synchronous. - * We don't care about quotoff's performance. - */ - xfs_trans_set_sync(tp); - return xfs_trans_commit(tp); -} - - -STATIC int -xfs_qm_log_quotaoff( - xfs_mount_t *mp, - xfs_qoff_logitem_t **qoffstartp, - uint flags) -{ - xfs_trans_t *tp; - int error; - xfs_qoff_logitem_t *qoffi; - - *qoffstartp = NULL; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); - if (error) - goto out; - - qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); - xfs_trans_log_quotaoff_item(tp, qoffi); - - spin_lock(&mp->m_sb_lock); - mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; - spin_unlock(&mp->m_sb_lock); - - xfs_log_sb(tp); - - /* - * We have to make sure that the transaction is secure on disk before we - * return and actually stop quota accounting. So, make it synchronous. - * We don't care about quotoff's performance. - */ - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp); - if (error) - goto out; - - *qoffstartp = qoffi; -out: - return error; -} - /* Fill out the quota context. */ static void xfs_qm_scall_getquota_fill_qc( diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index efe42ae7a2f3..aa8fc1f55fbd 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -86,7 +86,7 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct xfs_mount *, struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint); -extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t, +extern int xfs_qm_vop_dqalloc(struct xfs_inode *, kuid_t, kgid_t, prid_t, uint, struct xfs_dquot **, struct xfs_dquot **, struct xfs_dquot **); extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, @@ -109,7 +109,7 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *); #else static inline int -xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid, +xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, prid_t prid, uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp, struct xfs_dquot **pdqp) { diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 2328268e6245..fa1018a6e677 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -17,7 +17,7 @@ #include "xfs_refcount_item.h" #include "xfs_log.h" #include "xfs_refcount.h" - +#include "xfs_error.h" kmem_zone_t *xfs_cui_zone; kmem_zone_t *xfs_cud_zone; @@ -123,40 +123,6 @@ xfs_cui_item_release( xfs_cui_release(CUI_ITEM(lip)); } -static const struct xfs_item_ops xfs_cui_item_ops = { - .iop_size = xfs_cui_item_size, - .iop_format = xfs_cui_item_format, - .iop_unpin = xfs_cui_item_unpin, - .iop_release = xfs_cui_item_release, -}; - -/* - * Allocate and initialize an cui item with the given number of extents. - */ -struct xfs_cui_log_item * -xfs_cui_init( - struct xfs_mount *mp, - uint nextents) - -{ - struct xfs_cui_log_item *cuip; - - ASSERT(nextents > 0); - if (nextents > XFS_CUI_MAX_FAST_EXTENTS) - cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), - 0); - else - cuip = kmem_zone_zalloc(xfs_cui_zone, 0); - - xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); - cuip->cui_format.cui_nextents = nextents; - cuip->cui_format.cui_id = (uintptr_t)(void *)cuip; - atomic_set(&cuip->cui_next_extent, 0); - atomic_set(&cuip->cui_refcount, 2); - - return cuip; -} - static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_cud_log_item, cud_item); @@ -284,27 +250,6 @@ xfs_refcount_update_diff_items( XFS_FSB_TO_AGNO(mp, rb->ri_startblock); } -/* Get an CUI. */ -STATIC void * -xfs_refcount_update_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_cui_log_item *cuip; - - ASSERT(tp != NULL); - ASSERT(count > 0); - - cuip = xfs_cui_init(tp->t_mountp, count); - ASSERT(cuip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &cuip->cui_item); - return cuip; -} - /* Set the phys extent flags for this reverse mapping. */ static void xfs_trans_set_refcount_flags( @@ -328,16 +273,12 @@ xfs_trans_set_refcount_flags( STATIC void xfs_refcount_update_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_cui_log_item *cuip, + struct xfs_refcount_intent *refc) { - struct xfs_cui_log_item *cuip = intent; - struct xfs_refcount_intent *refc; uint next_extent; struct xfs_phys_extent *ext; - refc = container_of(item, struct xfs_refcount_intent, ri_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); @@ -354,14 +295,35 @@ xfs_refcount_update_log_item( xfs_trans_set_refcount_flags(ext, refc->ri_type); } +static struct xfs_log_item * +xfs_refcount_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); + struct xfs_refcount_intent *refc; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &cuip->cui_item); + if (sort) + list_sort(mp, items, xfs_refcount_update_diff_items); + list_for_each_entry(refc, items, ri_list) + xfs_refcount_update_log_item(tp, cuip, refc); + return &cuip->cui_item; +} + /* Get an CUD so we can process all the deferred refcount updates. */ STATIC void * xfs_refcount_update_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_cud(tp, intent); + return xfs_trans_get_cud(tp, CUI_ITEM(intent)); } /* Process a deferred refcount update. */ @@ -411,9 +373,9 @@ xfs_refcount_update_finish_cleanup( /* Abort all pending CUIs. */ STATIC void xfs_refcount_update_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_cui_release(intent); + xfs_cui_release(CUI_ITEM(intent)); } /* Cancel a deferred refcount update. */ @@ -429,10 +391,8 @@ xfs_refcount_update_cancel_item( const struct xfs_defer_op_type xfs_refcount_update_defer_type = { .max_items = XFS_CUI_MAX_FAST_EXTENTS, - .diff_items = xfs_refcount_update_diff_items, .create_intent = xfs_refcount_update_create_intent, .abort_intent = xfs_refcount_update_abort_intent, - .log_item = xfs_refcount_update_log_item, .create_done = xfs_refcount_update_create_done, .finish_item = xfs_refcount_update_finish_item, .finish_cleanup = xfs_refcount_update_finish_cleanup, @@ -445,8 +405,8 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { */ int xfs_cui_recover( - struct xfs_trans *parent_tp, - struct xfs_cui_log_item *cuip) + struct xfs_cui_log_item *cuip, + struct list_head *capture_list) { int i; int error = 0; @@ -462,7 +422,7 @@ xfs_cui_recover( xfs_extlen_t new_len; struct xfs_bmbt_irec irec; bool requeue_only = false; - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_mount *mp = cuip->cui_item.li_mountp; ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); @@ -497,7 +457,7 @@ xfs_cui_recover( */ set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); xfs_cui_release(cuip); - return -EIO; + return -EFSCORRUPTED; } } @@ -517,12 +477,7 @@ xfs_cui_recover( mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; - /* - * Recovery stashes all deferred ops during intent processing and - * finishes them on completion. Transfer current dfops state to this - * transaction and transfer the result back before we return. - */ - xfs_defer_move(tp, parent_tp); + cudp = xfs_trans_get_cud(tp, cuip); for (i = 0; i < cuip->cui_format.cui_nextents; i++) { @@ -536,6 +491,7 @@ xfs_cui_recover( type = refc_type; break; default: + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); error = -EFSCORRUPTED; goto abort_error; } @@ -579,13 +535,71 @@ xfs_cui_recover( xfs_refcount_finish_one_cleanup(tp, rcur, error); set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); - xfs_defer_move(parent_tp, tp); - error = xfs_trans_commit(tp); - return error; + return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); abort_error: xfs_refcount_finish_one_cleanup(tp, rcur, error); - xfs_defer_move(parent_tp, tp); xfs_trans_cancel(tp); return error; } + +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_cui_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_cud_log_item *cudp; + struct xfs_cui_log_item *cuip; + struct xfs_phys_extent *extp; + unsigned int count; + + count = CUI_ITEM(intent)->cui_format.cui_nextents; + extp = CUI_ITEM(intent)->cui_format.cui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); + + cuip = xfs_cui_init(tp->t_mountp, count); + memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp)); + atomic_set(&cuip->cui_next_extent, count); + xfs_trans_add_item(tp, &cuip->cui_item); + set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); + return &cuip->cui_item; +} + +static const struct xfs_item_ops xfs_cui_item_ops = { + .iop_size = xfs_cui_item_size, + .iop_format = xfs_cui_item_format, + .iop_unpin = xfs_cui_item_unpin, + .iop_release = xfs_cui_item_release, + .iop_relog = xfs_cui_item_relog, +}; + +/* + * Allocate and initialize an cui item with the given number of extents. + */ +struct xfs_cui_log_item * +xfs_cui_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_cui_log_item *cuip; + + ASSERT(nextents > 0); + if (nextents > XFS_CUI_MAX_FAST_EXTENTS) + cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), + 0); + else + cuip = kmem_zone_zalloc(xfs_cui_zone, 0); + + xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); + cuip->cui_format.cui_nextents = nextents; + cuip->cui_format.cui_id = (uintptr_t)(void *)cuip; + atomic_set(&cuip->cui_next_extent, 0); + atomic_set(&cuip->cui_refcount, 2); + + return cuip; +} diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index e47530f30489..de5f48ff4f74 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -80,6 +80,7 @@ extern struct kmem_zone *xfs_cud_zone; struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint); void xfs_cui_item_free(struct xfs_cui_log_item *); void xfs_cui_release(struct xfs_cui_log_item *); -int xfs_cui_recover(struct xfs_trans *parent_tp, struct xfs_cui_log_item *cuip); +int xfs_cui_recover(struct xfs_cui_log_item *cuip, + struct list_head *capture_list); #endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 904d8285c226..01191ff46647 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -181,7 +181,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) { + if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) { *shared = false; return 0; } @@ -657,7 +657,7 @@ xfs_reflink_end_cow_extent( * preallocations can leak into the range we are called upon, and we * need to skip them. */ - if (!xfs_bmap_is_real_extent(&got)) { + if (!xfs_bmap_is_written_extent(&got)) { *end_fsb = del.br_startoff; goto out_cancel; } @@ -986,41 +986,28 @@ xfs_reflink_ag_has_free_space( } /* - * Unmap a range of blocks from a file, then map other blocks into the hole. - * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). - * The extent irec is mapped into dest at irec->br_startoff. + * Remap the given extent into the file. The dmap blockcount will be set to + * the number of blocks that were actually remapped. */ STATIC int xfs_reflink_remap_extent( struct xfs_inode *ip, - struct xfs_bmbt_irec *irec, - xfs_fileoff_t destoff, + struct xfs_bmbt_irec *dmap, xfs_off_t new_isize) { + struct xfs_bmbt_irec smap; struct xfs_mount *mp = ip->i_mount; - bool real_extent = xfs_bmap_is_real_extent(irec); struct xfs_trans *tp; - unsigned int resblks; - struct xfs_bmbt_irec uirec; - xfs_filblks_t rlen; - xfs_filblks_t unmap_len; xfs_off_t newlen; - int64_t qres; + int64_t qres, qdelta; + unsigned int resblks; + bool smap_real; + bool dmap_written = xfs_bmap_is_written_extent(dmap); + int nimaps; int error; - unmap_len = irec->br_startoff + irec->br_blockcount - destoff; - trace_xfs_reflink_punch_range(ip, destoff, unmap_len); - - /* No reflinking if we're low on space */ - if (real_extent) { - error = xfs_reflink_ag_has_free_space(mp, - XFS_FSB_TO_AGNO(mp, irec->br_startblock)); - if (error) - goto out; - } - /* Start a rolling transaction to switch the mappings */ - resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) goto out; @@ -1029,92 +1016,121 @@ xfs_reflink_remap_extent( xfs_trans_ijoin(tp, ip, 0); /* - * Reserve quota for this operation. We don't know if the first unmap - * in the dest file will cause a bmap btree split, so we always reserve - * at least enough blocks for that split. If the extent being mapped - * in is written, we need to reserve quota for that too. + * Read what's currently mapped in the destination file into smap. + * If smap isn't a hole, we will have to remove it before we can add + * dmap to the destination file. */ - qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - if (real_extent) - qres += irec->br_blockcount; - error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, - XFS_QMOPT_RES_REGBLKS); + nimaps = 1; + error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount, + &smap, &nimaps, 0); if (error) goto out_cancel; + ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff); + smap_real = xfs_bmap_is_real_extent(&smap); - trace_xfs_reflink_remap(ip, irec->br_startoff, - irec->br_blockcount, irec->br_startblock); + /* + * We can only remap as many blocks as the smaller of the two extent + * maps, because we can only remap one extent at a time. + */ + dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount); + ASSERT(dmap->br_blockcount == smap.br_blockcount); - /* Unmap the old blocks in the data fork. */ - rlen = unmap_len; - while (rlen) { - ASSERT(tp->t_firstblock == NULLFSBLOCK); - error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1); + trace_xfs_reflink_remap_extent_dest(ip, &smap); + + /* No reflinking if the AG of the dest mapping is low on space. */ + if (dmap_written) { + error = xfs_reflink_ag_has_free_space(mp, + XFS_FSB_TO_AGNO(mp, dmap->br_startblock)); if (error) goto out_cancel; + } + /* + * Compute quota reservation if we think the quota block counter for + * this file could increase. + * + * We start by reserving enough blocks to handle a bmbt split. + * + * If we are mapping a written extent into the file, we need to have + * enough quota block count reservation to handle the blocks in that + * extent. + * + * Note that if we're replacing a delalloc reservation with a written + * extent, we have to take the full quota reservation because removing + * the delalloc reservation gives the block count back to the quota + * count. This is suboptimal, but the VFS flushed the dest range + * before we started. That should have removed all the delalloc + * reservations, but we code defensively. + */ + qdelta = 0; + qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + if (dmap_written) + qres += dmap->br_blockcount; + error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_cancel; + + if (smap_real) { /* - * Trim the extent to whatever got unmapped. - * Remember, bunmapi works backwards. + * If the extent we're unmapping is backed by storage (written + * or not), unmap the extent and drop its refcount. */ - uirec.br_startblock = irec->br_startblock + rlen; - uirec.br_startoff = irec->br_startoff + rlen; - uirec.br_blockcount = unmap_len - rlen; - uirec.br_state = irec->br_state; - unmap_len = rlen; - - /* If this isn't a real mapping, we're done. */ - if (!real_extent || uirec.br_blockcount == 0) - goto next_extent; - - trace_xfs_reflink_remap(ip, uirec.br_startoff, - uirec.br_blockcount, uirec.br_startblock); + xfs_bmap_unmap_extent(tp, ip, &smap); + xfs_refcount_decrease_extent(tp, &smap); + qdelta -= smap.br_blockcount; + } else if (smap.br_startblock == DELAYSTARTBLOCK) { + xfs_filblks_t len = smap.br_blockcount; - /* Update the refcount tree */ - xfs_refcount_increase_extent(tp, &uirec); - - /* Map the new blocks into the data fork. */ - xfs_bmap_map_extent(tp, ip, &uirec); + /* + * If the extent we're unmapping is a delalloc reservation, + * we can use the regular bunmapi function to release the + * incore state. Dropping the delalloc reservation takes care + * of the quota reservation for us. + */ + error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1); + if (error) + goto out_cancel; + ASSERT(len == 0); + } - /* Update quota accounting. */ - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, - uirec.br_blockcount); + /* + * If the extent we're sharing is backed by written storage, increase + * its refcount and map it into the file. + */ + if (dmap_written) { + xfs_refcount_increase_extent(tp, dmap); + xfs_bmap_map_extent(tp, ip, dmap); + qdelta += dmap->br_blockcount; + } - /* Update dest isize if needed. */ - newlen = XFS_FSB_TO_B(mp, - uirec.br_startoff + uirec.br_blockcount); - newlen = min_t(xfs_off_t, newlen, new_isize); - if (newlen > i_size_read(VFS_I(ip))) { - trace_xfs_reflink_update_inode_size(ip, newlen); - i_size_write(VFS_I(ip), newlen); - ip->i_d.di_size = newlen; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - } + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta); -next_extent: - /* Process all the deferred stuff. */ - error = xfs_defer_finish(&tp); - if (error) - goto out_cancel; + /* Update dest isize if needed. */ + newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount); + newlen = min_t(xfs_off_t, newlen, new_isize); + if (newlen > i_size_read(VFS_I(ip))) { + trace_xfs_reflink_update_inode_size(ip, newlen); + i_size_write(VFS_I(ip), newlen); + ip->i_d.di_size = newlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } + /* Commit everything and unlock. */ error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - goto out; - return 0; + goto out_unlock; out_cancel: xfs_trans_cancel(tp); +out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); out: - trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); + if (error) + trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); return error; } -/* - * Iteratively remap one file's extents (and holes) to another's. - */ +/* Remap a range of one file to the other. */ int xfs_reflink_remap_blocks( struct xfs_inode *src, @@ -1125,25 +1141,22 @@ xfs_reflink_remap_blocks( loff_t *remapped) { struct xfs_bmbt_irec imap; - xfs_fileoff_t srcoff; - xfs_fileoff_t destoff; + struct xfs_mount *mp = src->i_mount; + xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in); + xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out); xfs_filblks_t len; - xfs_filblks_t range_len; xfs_filblks_t remapped_len = 0; xfs_off_t new_isize = pos_out + remap_len; int nimaps; int error = 0; - destoff = XFS_B_TO_FSBT(src->i_mount, pos_out); - srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in); - len = XFS_B_TO_FSB(src->i_mount, remap_len); + len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len), + XFS_MAX_FILEOFF); - /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ - while (len) { - uint lock_mode; + trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff); - trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, - dest, destoff); + while (len > 0) { + unsigned int lock_mode; /* Read extent from the source file */ nimaps = 1; @@ -1152,18 +1165,25 @@ xfs_reflink_remap_blocks( xfs_iunlock(src, lock_mode); if (error) break; - ASSERT(nimaps == 1); - - trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK, - &imap); + /* + * The caller supposedly flushed all dirty pages in the source + * file range, which means that writeback should have allocated + * or deleted all delalloc reservations in that range. If we + * find one, that's a good sign that something is seriously + * wrong here. + */ + ASSERT(nimaps == 1 && imap.br_startoff == srcoff); + if (imap.br_startblock == DELAYSTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + error = -EFSCORRUPTED; + break; + } - /* Translate imap into the destination file. */ - range_len = imap.br_startoff + imap.br_blockcount - srcoff; - imap.br_startoff += destoff - srcoff; + trace_xfs_reflink_remap_extent_src(src, &imap); - /* Clear dest from destoff to the end of imap and map it in. */ - error = xfs_reflink_remap_extent(dest, &imap, destoff, - new_isize); + /* Remap into the destination file at the given offset. */ + imap.br_startoff = destoff; + error = xfs_reflink_remap_extent(dest, &imap, new_isize); if (error) break; @@ -1173,10 +1193,10 @@ xfs_reflink_remap_blocks( } /* Advance drange/srange */ - srcoff += range_len; - destoff += range_len; - len -= range_len; - remapped_len += range_len; + srcoff += imap.br_blockcount; + destoff += imap.br_blockcount; + len -= imap.br_blockcount; + remapped_len += imap.br_blockcount; } if (error) @@ -1427,7 +1447,7 @@ xfs_reflink_dirty_extents( goto out; if (nmaps == 0) break; - if (!xfs_bmap_is_real_extent(&map[0])) + if (!xfs_bmap_is_written_extent(&map[0])) goto next; map[1] = map[0]; @@ -1544,7 +1564,8 @@ xfs_reflink_clear_inode_flag( * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. */ - error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF, + true); if (error) return error; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 8939e0ea09cd..ba1dbb6c4063 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -17,7 +17,7 @@ #include "xfs_rmap_item.h" #include "xfs_log.h" #include "xfs_rmap.h" - +#include "xfs_error.h" kmem_zone_t *xfs_rui_zone; kmem_zone_t *xfs_rud_zone; @@ -122,39 +122,6 @@ xfs_rui_item_release( xfs_rui_release(RUI_ITEM(lip)); } -static const struct xfs_item_ops xfs_rui_item_ops = { - .iop_size = xfs_rui_item_size, - .iop_format = xfs_rui_item_format, - .iop_unpin = xfs_rui_item_unpin, - .iop_release = xfs_rui_item_release, -}; - -/* - * Allocate and initialize an rui item with the given number of extents. - */ -struct xfs_rui_log_item * -xfs_rui_init( - struct xfs_mount *mp, - uint nextents) - -{ - struct xfs_rui_log_item *ruip; - - ASSERT(nextents > 0); - if (nextents > XFS_RUI_MAX_FAST_EXTENTS) - ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); - else - ruip = kmem_zone_zalloc(xfs_rui_zone, 0); - - xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); - ruip->rui_format.rui_nextents = nextents; - ruip->rui_format.rui_id = (uintptr_t)(void *)ruip; - atomic_set(&ruip->rui_next_extent, 0); - atomic_set(&ruip->rui_refcount, 2); - - return ruip; -} - /* * Copy an RUI format buffer from the given buf, and into the destination * RUI format structure. The RUI/RUD items were designed not to need any @@ -171,8 +138,10 @@ xfs_rui_copy_format( src_rui_fmt = buf->i_addr; len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents); - if (buf->i_len != len) + if (buf->i_len != len) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; + } memcpy(dst_rui_fmt, src_rui_fmt, len); return 0; @@ -350,41 +319,16 @@ xfs_rmap_update_diff_items( XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock); } -/* Get an RUI. */ -STATIC void * -xfs_rmap_update_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_rui_log_item *ruip; - - ASSERT(tp != NULL); - ASSERT(count > 0); - - ruip = xfs_rui_init(tp->t_mountp, count); - ASSERT(ruip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &ruip->rui_item); - return ruip; -} - /* Log rmap updates in the intent item. */ STATIC void xfs_rmap_update_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_rui_log_item *ruip, + struct xfs_rmap_intent *rmap) { - struct xfs_rui_log_item *ruip = intent; - struct xfs_rmap_intent *rmap; uint next_extent; struct xfs_map_extent *map; - rmap = container_of(item, struct xfs_rmap_intent, ri_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); @@ -404,14 +348,35 @@ xfs_rmap_update_log_item( rmap->ri_bmap.br_state); } +static struct xfs_log_item * +xfs_rmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); + struct xfs_rmap_intent *rmap; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &ruip->rui_item); + if (sort) + list_sort(mp, items, xfs_rmap_update_diff_items); + list_for_each_entry(rmap, items, ri_list) + xfs_rmap_update_log_item(tp, ruip, rmap); + return &ruip->rui_item; +} + /* Get an RUD so we can process all the deferred rmap updates. */ STATIC void * xfs_rmap_update_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_rud(tp, intent); + return xfs_trans_get_rud(tp, RUI_ITEM(intent)); } /* Process a deferred rmap update. */ @@ -453,9 +418,9 @@ xfs_rmap_update_finish_cleanup( /* Abort all pending RUIs. */ STATIC void xfs_rmap_update_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_rui_release(intent); + xfs_rui_release(RUI_ITEM(intent)); } /* Cancel a deferred rmap update. */ @@ -471,10 +436,8 @@ xfs_rmap_update_cancel_item( const struct xfs_defer_op_type xfs_rmap_update_defer_type = { .max_items = XFS_RUI_MAX_FAST_EXTENTS, - .diff_items = xfs_rmap_update_diff_items, .create_intent = xfs_rmap_update_create_intent, .abort_intent = xfs_rmap_update_abort_intent, - .log_item = xfs_rmap_update_log_item, .create_done = xfs_rmap_update_create_done, .finish_item = xfs_rmap_update_finish_item, .finish_cleanup = xfs_rmap_update_finish_cleanup, @@ -487,9 +450,10 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { */ int xfs_rui_recover( - struct xfs_mount *mp, - struct xfs_rui_log_item *ruip) + struct xfs_rui_log_item *ruip, + struct list_head *capture_list) { + struct xfs_mount *mp = ruip->rui_item.li_mountp; int i; int error = 0; struct xfs_map_extent *rmap; @@ -539,7 +503,7 @@ xfs_rui_recover( */ set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); xfs_rui_release(ruip); - return -EIO; + return -EFSCORRUPTED; } } @@ -581,6 +545,7 @@ xfs_rui_recover( type = XFS_RMAP_FREE; break; default: + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); error = -EFSCORRUPTED; goto abort_error; } @@ -595,11 +560,70 @@ xfs_rui_recover( xfs_rmap_finish_one_cleanup(tp, rcur, error); set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); - error = xfs_trans_commit(tp); - return error; + return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); abort_error: xfs_rmap_finish_one_cleanup(tp, rcur, error); xfs_trans_cancel(tp); return error; } + +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_rui_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_rud_log_item *rudp; + struct xfs_rui_log_item *ruip; + struct xfs_map_extent *extp; + unsigned int count; + + count = RUI_ITEM(intent)->rui_format.rui_nextents; + extp = RUI_ITEM(intent)->rui_format.rui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); + + ruip = xfs_rui_init(tp->t_mountp, count); + memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp)); + atomic_set(&ruip->rui_next_extent, count); + xfs_trans_add_item(tp, &ruip->rui_item); + set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); + return &ruip->rui_item; +} + +static const struct xfs_item_ops xfs_rui_item_ops = { + .iop_size = xfs_rui_item_size, + .iop_format = xfs_rui_item_format, + .iop_unpin = xfs_rui_item_unpin, + .iop_release = xfs_rui_item_release, + .iop_relog = xfs_rui_item_relog, +}; + +/* + * Allocate and initialize an rui item with the given number of extents. + */ +struct xfs_rui_log_item * +xfs_rui_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_rui_log_item *ruip; + + ASSERT(nextents > 0); + if (nextents > XFS_RUI_MAX_FAST_EXTENTS) + ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); + else + ruip = kmem_zone_zalloc(xfs_rui_zone, 0); + + xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); + ruip->rui_format.rui_nextents = nextents; + ruip->rui_format.rui_id = (uintptr_t)(void *)ruip; + atomic_set(&ruip->rui_next_extent, 0); + atomic_set(&ruip->rui_refcount, 2); + + return ruip; +} diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index 8708e4a5aa5c..5cf4acb0e915 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -82,6 +82,7 @@ int xfs_rui_copy_format(struct xfs_log_iovec *buf, struct xfs_rui_log_format *dst_rui_fmt); void xfs_rui_item_free(struct xfs_rui_log_item *); void xfs_rui_release(struct xfs_rui_log_item *); -int xfs_rui_recover(struct xfs_mount *mp, struct xfs_rui_log_item *ruip); +int xfs_rui_recover(struct xfs_rui_log_item *ruip, + struct list_head *capture_list); #endif /* __XFS_RMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 113883c4f202..20e0534a772c 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -23,6 +23,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) uint64_t xs_xstrat_bytes = 0; uint64_t xs_write_bytes = 0; uint64_t xs_read_bytes = 0; + uint64_t defer_relog = 0; static const struct xstats_entry { char *desc; @@ -57,24 +58,27 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) /* Loop over all stats groups */ for (i = j = 0; i < ARRAY_SIZE(xstats); i++) { - len += snprintf(buf + len, PATH_MAX - len, "%s", + len += scnprintf(buf + len, PATH_MAX - len, "%s", xstats[i].desc); /* inner loop does each group */ for (; j < xstats[i].endpoint; j++) - len += snprintf(buf + len, PATH_MAX - len, " %u", + len += scnprintf(buf + len, PATH_MAX - len, " %u", counter_val(stats, j)); - len += snprintf(buf + len, PATH_MAX - len, "\n"); + len += scnprintf(buf + len, PATH_MAX - len, "\n"); } /* extra precision counters */ for_each_possible_cpu(i) { xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes; xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes; xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes; + defer_relog += per_cpu_ptr(stats, i)->s.defer_relog; } - len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", + len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); - len += snprintf(buf + len, PATH_MAX-len, "debug %u\n", + len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n", + defer_relog); + len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n", #if defined(DEBUG) 1); #else diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 34d704f703d2..43ffba74f045 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -137,6 +137,7 @@ struct __xfsstats { uint64_t xs_xstrat_bytes; uint64_t xs_write_bytes; uint64_t xs_read_bytes; + uint64_t defer_relog; }; #define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t)) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8d1df9f8be07..e802cbc9daad 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -490,10 +490,12 @@ xfs_showargs( seq_printf(m, ",swidth=%d", (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); - if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD)) - seq_puts(m, ",usrquota"); - else if (mp->m_qflags & XFS_UQUOTA_ACCT) - seq_puts(m, ",uqnoenforce"); + if (mp->m_qflags & XFS_UQUOTA_ACCT) { + if (mp->m_qflags & XFS_UQUOTA_ENFD) + seq_puts(m, ",usrquota"); + else + seq_puts(m, ",uqnoenforce"); + } if (mp->m_qflags & XFS_PQUOTA_ACCT) { if (mp->m_qflags & XFS_PQUOTA_ENFD) @@ -512,32 +514,6 @@ xfs_showargs( seq_puts(m, ",noquota"); } -static uint64_t -xfs_max_file_offset( - unsigned int blockshift) -{ - unsigned int pagefactor = 1; - unsigned int bitshift = BITS_PER_LONG - 1; - - /* Figure out maximum filesize, on Linux this can depend on - * the filesystem blocksize (on 32 bit platforms). - * __block_write_begin does this in an [unsigned] long long... - * page->index << (PAGE_SHIFT - bbits) - * So, for page sized blocks (4K on 32 bit platforms), - * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is - * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1) - * but for smaller blocksizes it is less (bbits = log2 bsize). - */ - -#if BITS_PER_LONG == 32 - ASSERT(sizeof(sector_t) == 8); - pagefactor = PAGE_SIZE; - bitshift = BITS_PER_LONG; -#endif - - return (((uint64_t)pagefactor) << bitshift) - 1; -} - /* * Set parameters for inode allocation heuristics, taking into account * filesystem size and inode32/inode64 mount options; i.e. specifically @@ -866,6 +842,20 @@ xfs_destroy_mount_workqueues( destroy_workqueue(mp->m_buf_workqueue); } +static void +xfs_flush_inodes_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(work, struct xfs_mount, + m_flush_inodes_work); + struct super_block *sb = mp->m_super; + + if (down_read_trylock(&sb->s_umount)) { + sync_inodes_sb(sb); + up_read(&sb->s_umount); + } +} + /* * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting @@ -876,12 +866,15 @@ void xfs_flush_inodes( struct xfs_mount *mp) { - struct super_block *sb = mp->m_super; + /* + * If flush_work() returns true then that means we waited for a flush + * which was already in progress. Don't bother running another scan. + */ + if (flush_work(&mp->m_flush_inodes_work)) + return; - if (down_read_trylock(&sb->s_umount)) { - sync_inodes_sb(sb); - up_read(&sb->s_umount); - } + queue_work(mp->m_sync_workqueue, &mp->m_flush_inodes_work); + flush_work(&mp->m_flush_inodes_work); } /* Catch misguided souls that try to use this interface on XFS */ @@ -1237,6 +1230,10 @@ xfs_fs_remount( char *p; int error; + /* version 5 superblocks always support version counters. */ + if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) + *flags |= SB_I_VERSION; + /* First, check for complete junk; i.e. invalid options */ error = xfs_test_remount_options(sb, options); if (error) @@ -1558,6 +1555,7 @@ xfs_mount_alloc( spin_lock_init(&mp->m_perag_lock); mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); + INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); @@ -1650,6 +1648,26 @@ xfs_fs_fill_super( if (error) goto out_free_sb; + /* + * XFS block mappings use 54 bits to store the logical block offset. + * This should suffice to handle the maximum file size that the VFS + * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT + * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes + * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON + * to check this assertion. + * + * Avoid integer overflow by comparing the maximum bmbt offset to the + * maximum pagecache offset in units of fs blocks. + */ + if (XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE) > XFS_MAX_FILEOFF) { + xfs_warn(mp, +"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!", + XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE), + XFS_MAX_FILEOFF); + error = -EINVAL; + goto out_free_sb; + } + error = xfs_filestream_mount(mp); if (error) goto out_free_sb; @@ -1661,7 +1679,7 @@ xfs_fs_fill_super( sb->s_magic = XFS_SUPER_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; - sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; sb->s_time_min = S32_MIN; @@ -1898,13 +1916,13 @@ xfs_init_zones(void) if (!xfs_buf_item_zone) goto out_destroy_trans_zone; - xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + + xfs_efd_zone = kmem_zone_init((sizeof(struct xfs_efd_log_item) + ((XFS_EFD_MAX_FAST_EXTENTS - 1) * sizeof(xfs_extent_t))), "xfs_efd_item"); if (!xfs_efd_zone) goto out_destroy_buf_item_zone; - xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) + + xfs_efi_zone = kmem_zone_init((sizeof(struct xfs_efi_log_item) + ((XFS_EFI_MAX_FAST_EXTENTS - 1) * sizeof(xfs_extent_t))), "xfs_efi_item"); if (!xfs_efi_zone) @@ -1918,8 +1936,8 @@ xfs_init_zones(void) goto out_destroy_efi_zone; xfs_ili_zone = - kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", - KM_ZONE_SPREAD, NULL); + kmem_zone_init_flags(sizeof(struct xfs_inode_log_item), + "xfs_ili", KM_ZONE_SPREAD, NULL); if (!xfs_ili_zone) goto out_destroy_inode_zone; xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item), diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index ed66fd2de327..a2037e22ebda 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -191,9 +191,7 @@ xfs_symlink( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, - xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -203,7 +201,7 @@ xfs_symlink( * The symlink will fit into the inode data fork? * There can't be any attributes so we get the whole variable part. */ - if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) + if (pathlen <= XFS_LITINO(mp)) fs_blocks = 0; else fs_blocks = xfs_symlink_blocks(mp, pathlen); @@ -316,6 +314,7 @@ xfs_symlink( } ASSERT(pathlen == 0); } + i_size_write(VFS_I(ip), ip->i_d.di_size); /* * Create the directory entry for the symlink. diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index eaae275ed430..4b5818395406 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1011,6 +1011,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait); DECLARE_EVENT_CLASS(xfs_log_item_class, TP_PROTO(struct xfs_log_item *lip), @@ -2417,6 +2418,7 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent); DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); @@ -3077,8 +3079,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); -DEFINE_IMAP_EVENT(xfs_reflink_remap_imap); -TRACE_EVENT(xfs_reflink_remap_blocks_loop, +TRACE_EVENT(xfs_reflink_remap_blocks, TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, xfs_filblks_t len, struct xfs_inode *dest, xfs_fileoff_t doffset), @@ -3109,59 +3110,14 @@ TRACE_EVENT(xfs_reflink_remap_blocks_loop, __entry->dest_ino, __entry->dest_lblk) ); -TRACE_EVENT(xfs_reflink_punch_range, - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, - xfs_extlen_t len), - TP_ARGS(ip, lblk, len), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fileoff_t, lblk) - __field(xfs_extlen_t, len) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->lblk = lblk; - __entry->len = len; - ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->lblk, - __entry->len) -); -TRACE_EVENT(xfs_reflink_remap, - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, - xfs_extlen_t len, xfs_fsblock_t new_pblk), - TP_ARGS(ip, lblk, len, new_pblk), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fileoff_t, lblk) - __field(xfs_extlen_t, len) - __field(xfs_fsblock_t, new_pblk) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->lblk = lblk; - __entry->len = len; - __entry->new_pblk = new_pblk; - ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->lblk, - __entry->len, - __entry->new_pblk) -); DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); +DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src); +DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest); /* dedupe tracepoints */ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); @@ -3609,6 +3565,27 @@ DEFINE_KMEM_EVENT(kmem_alloc_large); DEFINE_KMEM_EVENT(kmem_realloc); DEFINE_KMEM_EVENT(kmem_zone_alloc); +TRACE_EVENT(xfs_check_new_dalign, + TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), + TP_ARGS(mp, new_dalign, calc_rootino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, new_dalign) + __field(xfs_ino_t, sb_rootino) + __field(xfs_ino_t, calc_rootino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->new_dalign = new_dalign; + __entry->sb_rootino = mp->m_sb.sb_rootino; + __entry->calc_rootino = calc_rootino; + ), + TP_printk("dev %d:%d new_dalign %d sb_rootino %llu calc_rootino %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->new_dalign, __entry->sb_rootino, + __entry->calc_rootino) +) + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index b32a66452d44..47acf4096022 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -107,7 +107,8 @@ xfs_trans_dup( ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE) | - (tp->t_flags & XFS_TRANS_NO_WRITECOUNT); + (tp->t_flags & XFS_TRANS_NO_WRITECOUNT) | + (tp->t_flags & XFS_TRANS_RES_FDBLKS); /* We gave our writer reference to the new transaction */ tp->t_flags |= XFS_TRANS_NO_WRITECOUNT; ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); @@ -273,6 +274,8 @@ xfs_trans_alloc( */ WARN_ON(resp->tr_logres > 0 && mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); + ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || + xfs_sb_version_haslazysbcount(&mp->m_sb)); atomic_inc(&mp->m_active_trans); tp->t_magic = XFS_TRANS_HEADER_MAGIC; @@ -368,6 +371,20 @@ xfs_trans_mod_sb( tp->t_blk_res_used += (uint)-delta; if (tp->t_blk_res_used > tp->t_blk_res) xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } else if (delta > 0 && (tp->t_flags & XFS_TRANS_RES_FDBLKS)) { + int64_t blkres_delta; + + /* + * Return freed blocks directly to the reservation + * instead of the global pool, being careful not to + * overflow the trans counter. This is used to preserve + * reservation across chains of transaction rolls that + * repeatedly free and allocate blocks. + */ + blkres_delta = min_t(int64_t, delta, + UINT_MAX - tp->t_blk_res); + tp->t_blk_res += blkres_delta; + delta -= blkres_delta; } tp->t_fdblocks_delta += delta; if (xfs_sb_version_haslazysbcount(&mp->m_sb)) @@ -532,57 +549,9 @@ xfs_trans_apply_sb_deltas( sizeof(sbp->sb_frextents) - 1); } -STATIC int -xfs_sb_mod8( - uint8_t *field, - int8_t delta) -{ - int8_t counter = *field; - - counter += delta; - if (counter < 0) { - ASSERT(0); - return -EINVAL; - } - *field = counter; - return 0; -} - -STATIC int -xfs_sb_mod32( - uint32_t *field, - int32_t delta) -{ - int32_t counter = *field; - - counter += delta; - if (counter < 0) { - ASSERT(0); - return -EINVAL; - } - *field = counter; - return 0; -} - -STATIC int -xfs_sb_mod64( - uint64_t *field, - int64_t delta) -{ - int64_t counter = *field; - - counter += delta; - if (counter < 0) { - ASSERT(0); - return -EINVAL; - } - *field = counter; - return 0; -} - /* - * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations - * and apply superblock counter changes to the in-core superblock. The + * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations and + * apply superblock counter changes to the in-core superblock. The * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT * applied to the in-core superblock. The idea is that that has already been * done. @@ -627,20 +596,17 @@ xfs_trans_unreserve_and_mod_sb( /* apply the per-cpu counters */ if (blkdelta) { error = xfs_mod_fdblocks(mp, blkdelta, rsvd); - if (error) - goto out; + ASSERT(!error); } if (idelta) { error = xfs_mod_icount(mp, idelta); - if (error) - goto out_undo_fdblocks; + ASSERT(!error); } if (ifreedelta) { error = xfs_mod_ifree(mp, ifreedelta); - if (error) - goto out_undo_icount; + ASSERT(!error); } if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) @@ -648,95 +614,23 @@ xfs_trans_unreserve_and_mod_sb( /* apply remaining deltas */ spin_lock(&mp->m_sb_lock); - if (rtxdelta) { - error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta); - if (error) - goto out_undo_ifree; - } - - if (tp->t_dblocks_delta != 0) { - error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta); - if (error) - goto out_undo_frextents; - } - if (tp->t_agcount_delta != 0) { - error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta); - if (error) - goto out_undo_dblocks; - } - if (tp->t_imaxpct_delta != 0) { - error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta); - if (error) - goto out_undo_agcount; - } - if (tp->t_rextsize_delta != 0) { - error = xfs_sb_mod32(&mp->m_sb.sb_rextsize, - tp->t_rextsize_delta); - if (error) - goto out_undo_imaxpct; - } - if (tp->t_rbmblocks_delta != 0) { - error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, - tp->t_rbmblocks_delta); - if (error) - goto out_undo_rextsize; - } - if (tp->t_rblocks_delta != 0) { - error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta); - if (error) - goto out_undo_rbmblocks; - } - if (tp->t_rextents_delta != 0) { - error = xfs_sb_mod64(&mp->m_sb.sb_rextents, - tp->t_rextents_delta); - if (error) - goto out_undo_rblocks; - } - if (tp->t_rextslog_delta != 0) { - error = xfs_sb_mod8(&mp->m_sb.sb_rextslog, - tp->t_rextslog_delta); - if (error) - goto out_undo_rextents; - } + mp->m_sb.sb_frextents += rtxdelta; + mp->m_sb.sb_dblocks += tp->t_dblocks_delta; + mp->m_sb.sb_agcount += tp->t_agcount_delta; + mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta; + mp->m_sb.sb_rextsize += tp->t_rextsize_delta; + mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta; + mp->m_sb.sb_rblocks += tp->t_rblocks_delta; + mp->m_sb.sb_rextents += tp->t_rextents_delta; + mp->m_sb.sb_rextslog += tp->t_rextslog_delta; spin_unlock(&mp->m_sb_lock); - return; -out_undo_rextents: - if (tp->t_rextents_delta) - xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta); -out_undo_rblocks: - if (tp->t_rblocks_delta) - xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta); -out_undo_rbmblocks: - if (tp->t_rbmblocks_delta) - xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta); -out_undo_rextsize: - if (tp->t_rextsize_delta) - xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta); -out_undo_imaxpct: - if (tp->t_rextsize_delta) - xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta); -out_undo_agcount: - if (tp->t_agcount_delta) - xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta); -out_undo_dblocks: - if (tp->t_dblocks_delta) - xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta); -out_undo_frextents: - if (rtxdelta) - xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta); -out_undo_ifree: - spin_unlock(&mp->m_sb_lock); - if (ifreedelta) - xfs_mod_ifree(mp, -ifreedelta); -out_undo_icount: - if (idelta) - xfs_mod_icount(mp, -idelta); -out_undo_fdblocks: - if (blkdelta) - xfs_mod_fdblocks(mp, -blkdelta, rsvd); -out: - ASSERT(error == 0); + /* + * Debug checks outside of the spinlock so they don't lock up the + * machine if they fail. + */ + ASSERT(mp->m_sb.sb_imax_pct >= 0); + ASSERT(mp->m_sb.sb_rextslog >= 0); return; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 64d7f171ebd3..941647027f00 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -77,6 +77,8 @@ struct xfs_item_ops { void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); void (*iop_error)(struct xfs_log_item *, xfs_buf_t *); + struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, + struct xfs_trans *tp); }; /* @@ -244,4 +246,12 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, extern kmem_zone_t *xfs_trans_zone; +static inline struct xfs_log_item * +xfs_trans_item_relog( + struct xfs_log_item *lip, + struct xfs_trans *tp) +{ + return lip->li_ops->iop_relog(lip, tp); +} + #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 812108f6cc89..a41ba155d3a3 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -108,17 +108,25 @@ xfs_ail_next( * We need the AIL lock in order to get a coherent read of the lsn of the last * item in the AIL. */ +static xfs_lsn_t +__xfs_ail_min_lsn( + struct xfs_ail *ailp) +{ + struct xfs_log_item *lip = xfs_ail_min(ailp); + + if (lip) + return lip->li_lsn; + return 0; +} + xfs_lsn_t xfs_ail_min_lsn( struct xfs_ail *ailp) { - xfs_lsn_t lsn = 0; - struct xfs_log_item *lip; + xfs_lsn_t lsn; spin_lock(&ailp->ail_lock); - lip = xfs_ail_min(ailp); - if (lip) - lsn = lip->li_lsn; + lsn = __xfs_ail_min_lsn(ailp); spin_unlock(&ailp->ail_lock); return lsn; @@ -394,16 +402,10 @@ xfsaild_push( target = ailp->ail_target; ailp->ail_target_prev = target; + /* we're done if the AIL is empty or our push has reached the end */ lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn); - if (!lip) { - /* - * If the AIL is empty or our push has reached the end we are - * done now. - */ - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); + if (!lip) goto out_done; - } XFS_STATS_INC(mp, xs_push_ail); @@ -485,6 +487,8 @@ xfsaild_push( break; lsn = lip->li_lsn; } + +out_done: xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->ail_lock); @@ -492,7 +496,6 @@ xfsaild_push( ailp->ail_log_flush++; if (!count || XFS_LSN_CMP(lsn, target) >= 0) { -out_done: /* * We reached the target or the AIL is empty, so wait a bit * longer for I/O to complete and remove pushed items from the @@ -584,7 +587,8 @@ xfsaild( */ smp_rmb(); if (!xfs_ail_min(ailp) && - ailp->ail_target == ailp->ail_target_prev) { + ailp->ail_target == ailp->ail_target_prev && + list_empty(&ailp->ail_buf_list)) { spin_unlock(&ailp->ail_lock); freezable_schedule(); tout = 0; @@ -680,6 +684,28 @@ xfs_ail_push_all_sync( finish_wait(&ailp->ail_empty, &wait); } +void +xfs_ail_update_finish( + struct xfs_ail *ailp, + xfs_lsn_t old_lsn) __releases(ailp->ail_lock) +{ + struct xfs_mount *mp = ailp->ail_mount; + + /* if the tail lsn hasn't changed, don't do updates or wakeups. */ + if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) { + spin_unlock(&ailp->ail_lock); + return; + } + + if (!XFS_FORCED_SHUTDOWN(mp)) + xlog_assign_tail_lsn_locked(mp); + + if (list_empty(&ailp->ail_head)) + wake_up_all(&ailp->ail_empty); + spin_unlock(&ailp->ail_lock); + xfs_log_space_wake(mp); +} + /* * xfs_trans_ail_update - bulk AIL insertion operation. * @@ -711,7 +737,7 @@ xfs_trans_ail_update_bulk( xfs_lsn_t lsn) __releases(ailp->ail_lock) { struct xfs_log_item *mlip; - int mlip_changed = 0; + xfs_lsn_t tail_lsn = 0; int i; LIST_HEAD(tmp); @@ -726,9 +752,10 @@ xfs_trans_ail_update_bulk( continue; trace_xfs_ail_move(lip, lip->li_lsn, lsn); + if (mlip == lip && !tail_lsn) + tail_lsn = lip->li_lsn; + xfs_ail_delete(ailp, lip); - if (mlip == lip) - mlip_changed = 1; } else { trace_xfs_ail_insert(lip, 0, lsn); } @@ -739,23 +766,23 @@ xfs_trans_ail_update_bulk( if (!list_empty(&tmp)) xfs_ail_splice(ailp, cur, &tmp, lsn); - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) - xlog_assign_tail_lsn_locked(ailp->ail_mount); - spin_unlock(&ailp->ail_lock); - - xfs_log_space_wake(ailp->ail_mount); - } else { - spin_unlock(&ailp->ail_lock); - } + xfs_ail_update_finish(ailp, tail_lsn); } -bool +/* + * Delete one log item from the AIL. + * + * If this item was at the tail of the AIL, return the LSN of the log item so + * that we can use it to check if the LSN of the tail of the log has moved + * when finishing up the AIL delete process in xfs_ail_update_finish(). + */ +xfs_lsn_t xfs_ail_delete_one( struct xfs_ail *ailp, struct xfs_log_item *lip) { struct xfs_log_item *mlip = xfs_ail_min(ailp); + xfs_lsn_t lsn = lip->li_lsn; trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); @@ -763,7 +790,9 @@ xfs_ail_delete_one( clear_bit(XFS_LI_IN_AIL, &lip->li_flags); lip->li_lsn = 0; - return mlip == lip; + if (mlip == lip) + return lsn; + return 0; } /** @@ -791,10 +820,10 @@ void xfs_trans_ail_delete( struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->ail_lock) + int shutdown_type) { struct xfs_mount *mp = ailp->ail_mount; - bool mlip_changed; + xfs_lsn_t tail_lsn; if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); @@ -807,17 +836,8 @@ xfs_trans_ail_delete( return; } - mlip_changed = xfs_ail_delete_one(ailp, lip); - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(mp)) - xlog_assign_tail_lsn_locked(mp); - if (list_empty(&ailp->ail_head)) - wake_up_all(&ailp->ail_empty); - } - - spin_unlock(&ailp->ail_lock); - if (mlip_changed) - xfs_log_space_wake(ailp->ail_mount); + tail_lsn = xfs_ail_delete_one(ailp, lip); + xfs_ail_update_finish(ailp, tail_lsn); } int diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 904780dd74aa..4e43d415161d 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -15,6 +15,7 @@ #include "xfs_trans_priv.h" #include "xfs_quota.h" #include "xfs_qm.h" +#include "xfs_error.h" STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); @@ -25,8 +26,8 @@ STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); */ void xfs_trans_dqjoin( - xfs_trans_t *tp, - xfs_dquot_t *dqp) + struct xfs_trans *tp, + struct xfs_dquot *dqp) { ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(dqp->q_logitem.qli_dquot == dqp); @@ -49,8 +50,8 @@ xfs_trans_dqjoin( */ void xfs_trans_log_dquot( - xfs_trans_t *tp, - xfs_dquot_t *dqp) + struct xfs_trans *tp, + struct xfs_dquot *dqp) { ASSERT(XFS_DQ_IS_LOCKED(dqp)); @@ -486,12 +487,12 @@ xfs_trans_apply_dquot_deltas( */ void xfs_trans_unreserve_and_mod_dquots( - xfs_trans_t *tp) + struct xfs_trans *tp) { int i, j; - xfs_dquot_t *dqp; + struct xfs_dquot *dqp; struct xfs_dqtrx *qtrx, *qa; - bool locked; + bool locked; if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) return; @@ -571,21 +572,21 @@ xfs_quota_warn( */ STATIC int xfs_trans_dqresv( - xfs_trans_t *tp, - xfs_mount_t *mp, - xfs_dquot_t *dqp, - int64_t nblks, - long ninos, - uint flags) + struct xfs_trans *tp, + struct xfs_mount *mp, + struct xfs_dquot *dqp, + int64_t nblks, + long ninos, + uint flags) { - xfs_qcnt_t hardlimit; - xfs_qcnt_t softlimit; - time_t timer; - xfs_qwarncnt_t warns; - xfs_qwarncnt_t warnlimit; - xfs_qcnt_t total_count; - xfs_qcnt_t *resbcountp; - xfs_quotainfo_t *q = mp->m_quotainfo; + xfs_qcnt_t hardlimit; + xfs_qcnt_t softlimit; + time_t timer; + xfs_qwarncnt_t warns; + xfs_qwarncnt_t warnlimit; + xfs_qcnt_t total_count; + xfs_qcnt_t *resbcountp; + xfs_quotainfo_t *q = mp->m_quotainfo; struct xfs_def_quota *defq; @@ -700,9 +701,14 @@ xfs_trans_dqresv( XFS_TRANS_DQ_RES_INOS, ninos); } - ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount)); - ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); - ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); + + if (XFS_IS_CORRUPT(mp, + dqp->q_res_bcount < be64_to_cpu(dqp->q_core.d_bcount)) || + XFS_IS_CORRUPT(mp, + dqp->q_res_rtbcount < be64_to_cpu(dqp->q_core.d_rtbcount)) || + XFS_IS_CORRUPT(mp, + dqp->q_res_icount < be64_to_cpu(dqp->q_core.d_icount))) + goto error_corrupt; xfs_dqunlock(dqp); return 0; @@ -712,6 +718,10 @@ error_return: if (flags & XFS_QMOPT_ENOSPC) return -ENOSPC; return -EDQUOT; +error_corrupt: + xfs_dqunlock(dqp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return -EFSCORRUPTED; } @@ -756,7 +766,8 @@ xfs_trans_reserve_quota_bydquots( } if (gdqp) { - error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags); + error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, + (flags & ~XFS_QMOPT_ENOSPC)); if (error) goto unwind_usr; } @@ -824,13 +835,13 @@ xfs_trans_reserve_quota_nblks( /* * This routine is called to allocate a quotaoff log item. */ -xfs_qoff_logitem_t * +struct xfs_qoff_logitem * xfs_trans_get_qoff_item( - xfs_trans_t *tp, - xfs_qoff_logitem_t *startqoff, + struct xfs_trans *tp, + struct xfs_qoff_logitem *startqoff, uint flags) { - xfs_qoff_logitem_t *q; + struct xfs_qoff_logitem *q; ASSERT(tp != NULL); @@ -852,8 +863,8 @@ xfs_trans_get_qoff_item( */ void xfs_trans_log_quotaoff_item( - xfs_trans_t *tp, - xfs_qoff_logitem_t *qlp) + struct xfs_trans *tp, + struct xfs_qoff_logitem *qlp) { tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags); diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 2e073c1c4614..35655eac01a6 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -91,9 +91,11 @@ xfs_trans_ail_update( xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } -bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn) + __releases(ailp->ail_lock); void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->ail_lock); + int shutdown_type); static inline void xfs_trans_ail_remove( |