aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ceph
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph')
-rw-r--r--fs/ceph/caps.c41
-rw-r--r--fs/ceph/debugfs.c13
-rw-r--r--fs/ceph/file.c10
-rw-r--r--fs/ceph/inode.c4
-rw-r--r--fs/ceph/mds_client.c11
-rw-r--r--fs/ceph/mds_client.h14
-rw-r--r--fs/ceph/snap.c48
-rw-r--r--fs/ceph/super.c14
-rw-r--r--fs/ceph/xattr.c10
9 files changed, 136 insertions, 29 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 06e9b26bf277..45b8f6741f8d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1558,6 +1558,7 @@ void ceph_flush_snaps(struct ceph_inode_info *ci,
struct inode *inode = &ci->vfs_inode;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_session *session = NULL;
+ bool need_put = false;
int mds;
dout("ceph_flush_snaps %p\n", inode);
@@ -1609,8 +1610,13 @@ out:
}
/* we flushed them all; remove this inode from the queue */
spin_lock(&mdsc->snap_flush_lock);
+ if (!list_empty(&ci->i_snap_flush_item))
+ need_put = true;
list_del_init(&ci->i_snap_flush_item);
spin_unlock(&mdsc->snap_flush_lock);
+
+ if (need_put)
+ iput(inode);
}
/*
@@ -2784,7 +2790,19 @@ int ceph_get_caps(struct file *filp, int need, int want,
if (ret == -EAGAIN)
continue;
if (!ret) {
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct cap_wait cw;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ cw.ino = inode->i_ino;
+ cw.tgid = current->tgid;
+ cw.need = need;
+ cw.want = want;
+
+ spin_lock(&mdsc->caps_list_lock);
+ list_add(&cw.list, &mdsc->cap_wait_list);
+ spin_unlock(&mdsc->caps_list_lock);
+
add_wait_queue(&ci->i_cap_wq, &wait);
flags |= NON_BLOCKING;
@@ -2798,6 +2816,11 @@ int ceph_get_caps(struct file *filp, int need, int want,
}
remove_wait_queue(&ci->i_cap_wq, &wait);
+
+ spin_lock(&mdsc->caps_list_lock);
+ list_del(&cw.list);
+ spin_unlock(&mdsc->caps_list_lock);
+
if (ret == -EAGAIN)
continue;
}
@@ -3334,6 +3357,15 @@ static void handle_cap_grant(struct inode *inode,
}
BUG_ON(cap->issued & ~cap->implemented);
+ /* don't let check_caps skip sending a response to MDS for revoke msgs */
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) {
+ cap->mds_wanted = 0;
+ if (cap == ci->i_auth_cap)
+ check_caps = 1; /* check auth cap only */
+ else
+ check_caps = 2; /* check all caps */
+ }
+
if (extra_info->inline_version > 0 &&
extra_info->inline_version >= ci->i_inline_version) {
ci->i_inline_version = extra_info->inline_version;
@@ -4272,12 +4304,14 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
struct inode *dir,
int mds, int drop, int unless)
{
- struct dentry *parent = NULL;
struct ceph_mds_request_release *rel = *p;
struct ceph_dentry_info *di = ceph_dentry(dentry);
int force = 0;
int ret;
+ /* This shouldn't happen */
+ BUG_ON(!dir);
+
/*
* force an record for the directory caps if we have a dentry lease.
* this is racy (can't take i_ceph_lock and d_lock together), but it
@@ -4287,14 +4321,9 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
spin_lock(&dentry->d_lock);
if (di->lease_session && di->lease_session->s_mds == mds)
force = 1;
- if (!dir) {
- parent = dget(dentry->d_parent);
- dir = d_inode(parent);
- }
spin_unlock(&dentry->d_lock);
ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
- dput(parent);
spin_lock(&dentry->d_lock);
if (ret && di->lease_session && di->lease_session->s_mds == mds) {
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index facb387c2735..c281f32b54f7 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -139,6 +139,7 @@ static int caps_show(struct seq_file *s, void *p)
struct ceph_fs_client *fsc = s->private;
struct ceph_mds_client *mdsc = fsc->mdsc;
int total, avail, used, reserved, min, i;
+ struct cap_wait *cw;
ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
seq_printf(s, "total\t\t%d\n"
@@ -166,6 +167,18 @@ static int caps_show(struct seq_file *s, void *p)
}
mutex_unlock(&mdsc->mutex);
+ seq_printf(s, "\n\nWaiters:\n--------\n");
+ seq_printf(s, "tgid ino need want\n");
+ seq_printf(s, "-----------------------------------------------------\n");
+
+ spin_lock(&mdsc->caps_list_lock);
+ list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
+ seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
+ ceph_cap_string(cw->need),
+ ceph_cap_string(cw->want));
+ }
+ spin_unlock(&mdsc->caps_list_lock);
+
return 0;
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index aa1eac6d89f2..83122fc5f813 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -452,6 +452,12 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (dentry->d_name.len > NAME_MAX)
return -ENAMETOOLONG;
+ /*
+ * Do not truncate the file, since atomic_open is called before the
+ * permission check. The caller will do the truncation afterward.
+ */
+ flags &= ~O_TRUNC;
+
if (flags & O_CREAT) {
if (ceph_quota_is_max_files_exceeded(dir))
return -EDQUOT;
@@ -490,9 +496,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
- err = ceph_mdsc_do_request(mdsc,
- (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
- req);
+ err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req);
err = ceph_handle_snapdir(req, dentry, err);
if (err)
goto out_req;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index af85a7237604..a08ddd4e26d9 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -619,9 +619,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
ci->i_truncate_seq = truncate_seq;
/* the MDS should have revoked these caps */
- WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL |
- CEPH_CAP_FILE_RD |
- CEPH_CAP_FILE_WR |
+ WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD |
CEPH_CAP_FILE_LAZYIO));
/*
* If we hold relevant caps, or in the case where we're
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 37fb71797b34..f7acf9680c9b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3151,6 +3151,12 @@ static void handle_session(struct ceph_mds_session *session,
break;
case CEPH_SESSION_FLUSHMSG:
+ /* flush cap releases */
+ spin_lock(&session->s_cap_lock);
+ if (session->s_num_cap_releases)
+ ceph_flush_cap_releases(mdsc, session);
+ spin_unlock(&session->s_cap_lock);
+
send_flushmsg_ack(mdsc, session, seq);
break;
@@ -4068,7 +4074,7 @@ static void delayed_work(struct work_struct *work)
dout("mdsc delayed_work\n");
- if (mdsc->stopping)
+ if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
return;
mutex_lock(&mdsc->mutex);
@@ -4168,6 +4174,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
mdsc->last_renew_caps = jiffies;
INIT_LIST_HEAD(&mdsc->cap_delay_list);
+ INIT_LIST_HEAD(&mdsc->cap_wait_list);
spin_lock_init(&mdsc->cap_delay_lock);
INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock);
@@ -4239,7 +4246,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
{
dout("pre_umount\n");
- mdsc->stopping = 1;
+ mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;
lock_unlock_sessions(mdsc);
ceph_flush_dirty_caps(mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 5cd131b41d84..4fbbc33023c9 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -340,6 +340,19 @@ struct ceph_quotarealm_inode {
struct inode *inode;
};
+struct cap_wait { /* one task currently blocked in ceph_get_caps(); listed by debugfs caps_show() */
+ struct list_head list; /* entry on mdsc->cap_wait_list, added/removed under mdsc->caps_list_lock */
+ unsigned long ino; /* inode->i_ino of the inode whose caps are being waited for */
+ pid_t tgid; /* current->tgid of the waiting task */
+ int need; /* 'need' cap mask passed to ceph_get_caps() */
+ int want; /* 'want' cap mask passed to ceph_get_caps() */
+};
+
+enum { /* stages stored in mdsc->stopping during unmount */
+ CEPH_MDSC_STOPPING_BEGIN = 1, /* set by ceph_mdsc_pre_umount() */
+ CEPH_MDSC_STOPPING_FLUSHED = 2, /* set by ceph_kill_sb() after sync_filesystem(); delayed_work() returns early from this stage on */
+};
+
/*
* mds client state
*/
@@ -416,6 +429,7 @@ struct ceph_mds_client {
spinlock_t caps_list_lock;
struct list_head caps_list; /* unused (reserved or
unreserved) */
+ struct list_head cap_wait_list;
int caps_total_count; /* total caps allocated */
int caps_use_count; /* in use */
int caps_use_max; /* max used caps */
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index e1b9b224fcb2..4df068bdba68 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -644,8 +644,10 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->size);
spin_lock(&mdsc->snap_flush_lock);
- if (list_empty(&ci->i_snap_flush_item))
+ if (list_empty(&ci->i_snap_flush_item)) {
+ ihold(inode);
list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+ }
spin_unlock(&mdsc->snap_flush_lock);
return 1; /* caller may want to ceph_flush_snaps */
}
@@ -694,9 +696,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
struct ceph_mds_snap_realm *ri; /* encoded */
__le64 *snaps; /* encoded */
__le64 *prior_parent_snaps; /* encoded */
- struct ceph_snap_realm *realm = NULL;
+ struct ceph_snap_realm *realm;
struct ceph_snap_realm *first_realm = NULL;
- int invalidate = 0;
+ struct ceph_snap_realm *realm_to_rebuild = NULL;
+ int rebuild_snapcs;
int err = -ENOMEM;
LIST_HEAD(dirty_realms);
@@ -704,6 +707,8 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
dout("update_snap_trace deletion=%d\n", deletion);
more:
+ realm = NULL;
+ rebuild_snapcs = 0;
ceph_decode_need(&p, e, sizeof(*ri), bad);
ri = p;
p += sizeof(*ri);
@@ -727,7 +732,7 @@ more:
err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
if (err < 0)
goto fail;
- invalidate += err;
+ rebuild_snapcs += err;
if (le64_to_cpu(ri->seq) > realm->seq) {
dout("update_snap_trace updating %llx %p %lld -> %lld\n",
@@ -752,22 +757,30 @@ more:
if (realm->seq > mdsc->last_snap_seq)
mdsc->last_snap_seq = realm->seq;
- invalidate = 1;
+ rebuild_snapcs = 1;
} else if (!realm->cached_context) {
dout("update_snap_trace %llx %p seq %lld new\n",
realm->ino, realm, realm->seq);
- invalidate = 1;
+ rebuild_snapcs = 1;
} else {
dout("update_snap_trace %llx %p seq %lld unchanged\n",
realm->ino, realm, realm->seq);
}
- dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
- realm, invalidate, p, e);
+ dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
+ realm, rebuild_snapcs, p, e);
+
+ /*
+ * this will always track the topmost parent realm from which
+ * we need to rebuild the snapshot contexts _downward_ in the
+ * hierarchy.
+ */
+ if (rebuild_snapcs)
+ realm_to_rebuild = realm;
- /* invalidate when we reach the _end_ (root) of the trace */
- if (invalidate && p >= e)
- rebuild_snap_realms(realm, &dirty_realms);
+ /* rebuild snap contexts when we reach the _end_ (root) of the trace */
+ if (realm_to_rebuild && p >= e)
+ rebuild_snap_realms(realm_to_rebuild, &dirty_realms);
if (!first_realm)
first_realm = realm;
@@ -994,6 +1007,19 @@ skip_inode:
continue;
adjust_snap_realm_parent(mdsc, child, realm->ino);
}
+ } else {
+ /*
+ * In the non-split case both 'num_split_inos' and
+ * 'num_split_realms' should be 0, making this a no-op.
+ * However the MDS happens to populate 'split_realms' list
+ * in one of the UPDATE op cases by mistake.
+ *
+ * Skip both lists just in case to ensure that 'p' is
+ * positioned at the start of realm info, as expected by
+ * ceph_update_snap_trace().
+ */
+ p += sizeof(u64) * num_split_inos;
+ p += sizeof(u64) * num_split_realms;
}
/*
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d40658d5e808..0e38678d5add 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1174,14 +1174,23 @@ out_final:
static void ceph_kill_sb(struct super_block *s)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(s);
- dev_t dev = s->s_dev;
dout("kill_sb %p\n", s);
ceph_mdsc_pre_umount(fsc->mdsc);
flush_fs_workqueues(fsc);
- generic_shutdown_super(s);
+ /*
+ * Though kill_anon_super() will eventually trigger
+ * sync_filesystem() anyway, we still need to do it here and
+ * then bump the shutdown stage so that the work queue is
+ * stopped as early as possible.
+ */
+ sync_filesystem(s);
+
+ fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
+
+ kill_anon_super(s);
fsc->client->extra_mon_dispatch = NULL;
ceph_fs_debugfs_cleanup(fsc);
@@ -1189,7 +1198,6 @@ static void ceph_kill_sb(struct super_block *s)
ceph_fscache_unregister_fs(fsc);
destroy_fs_client(fsc);
- free_anon_bdev(dev);
}
static struct file_system_type ceph_fs_type = {
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index cb18ee637cb7..4bcf0226818d 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -316,6 +316,14 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
}
#define XATTR_RSTAT_FIELD(_type, _name) \
XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT)
+#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .exists_cb = NULL, \
+ .flags = VXATTR_FLAG_RSTAT, \
+ }
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
.name = CEPH_XATTR_NAME2(_type, _name, _field), \
@@ -353,7 +361,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_RSTAT_FIELD(dir, rfiles),
XATTR_RSTAT_FIELD(dir, rsubdirs),
XATTR_RSTAT_FIELD(dir, rbytes),
- XATTR_RSTAT_FIELD(dir, rctime),
+ XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime),
{
.name = "ceph.dir.pin",
.name_size = sizeof("ceph.dir.pin"),