ceph: factor out libceph from Ceph file system

This factors out protocol and low-level storage parts of ceph into a
separate libceph module living in net/ceph and include/linux/ceph.  This
is mostly a matter of moving files around.  However, a few key pieces
of the interface change as well:

 - ceph_client becomes ceph_fs_client and ceph_client, where the latter
   captures the mon and osd clients, and the fs_client gets the mds client
   and file system specific pieces.
 - Mount option parsing and debugfs setup are correspondingly broken into
   two pieces.
 - The mon client gets a generic handler callback for otherwise unknown
   messages (mds map, in this case).
 - The basic supported/required feature bits can be expanded (and are by
   ceph_fs_client).

No functional change, aside from some subtle error handling cases that got
cleaned up in the refactoring process.

Signed-off-by: Sage Weil <sage@newdream.net>
This commit is contained in:
Yehuda Sadeh 2010-04-06 15:14:15 -07:00 committed by Sage Weil
parent ae1533b62b
commit 3d14c5d2b6
73 changed files with 2590 additions and 1862 deletions

92
include/linux/ceph/auth.h Normal file
View file

@ -0,0 +1,92 @@
#ifndef _FS_CEPH_AUTH_H
#define _FS_CEPH_AUTH_H
#include <linux/ceph/types.h>
#include <linux/ceph/buffer.h>
/*
* Abstract interface for communicating with the authenticate module.
* There is some handshake that takes place between us and the monitor
* to acquire the necessary keys. These are used to generate an
* 'authorizer' that we use when connecting to a service (mds, osd).
*/
struct ceph_auth_client;
struct ceph_authorizer;
/*
* Table of operations supplied by one authentication protocol
* implementation. A struct ceph_auth_client refers to one of these
* tables through its ->ops pointer.
*/
struct ceph_auth_client_ops {
const char *name;
/*
* true if we are authenticated and can connect to
* services.
*/
int (*is_authenticated)(struct ceph_auth_client *ac);
/*
* true if we should (re)authenticate, e.g., when our tickets
* are getting old and crusty.
*/
int (*should_authenticate)(struct ceph_auth_client *ac);
/*
* build requests and process replies during monitor
* handshake. if handle_reply returns -EAGAIN, we build
* another request.
*/
int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
int (*handle_reply)(struct ceph_auth_client *ac, int result,
void *buf, void *end);
/*
* Create authorizer for connecting to a service, and verify
* the response to authenticate the service.
*/
int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
struct ceph_authorizer **a,
void **buf, size_t *len,
void **reply_buf, size_t *reply_len);
int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
struct ceph_authorizer *a, size_t len);
void (*destroy_authorizer)(struct ceph_auth_client *ac,
struct ceph_authorizer *a);
/*
* NOTE(review): presumably drops any cached authorizer state for
* the given peer type so a fresh one is built -- confirm against
* the protocol implementations.
*/
void (*invalidate_authorizer)(struct ceph_auth_client *ac,
int peer_type);
/* reset when we (re)connect to a monitor */
void (*reset)(struct ceph_auth_client *ac);
/* NOTE(review): presumably frees the protocol's ->private state -- confirm */
void (*destroy)(struct ceph_auth_client *ac);
};
/*
* State for one authentication client: the selected protocol, its
* operations table and private data, plus our identity (entity name,
* global id, secret).
*/
struct ceph_auth_client {
u32 protocol; /* CEPH_AUTH_* */
void *private; /* for use by protocol implementation */
const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
bool negotiating; /* true if negotiating protocol */
const char *name; /* entity name */
u64 global_id; /* our unique id in system */
const char *secret; /* our secret key */
unsigned want_keys; /* which services we want */
};
extern struct ceph_auth_client *ceph_auth_init(const char *name,
const char *secret);
extern void ceph_auth_destroy(struct ceph_auth_client *ac);
extern void ceph_auth_reset(struct ceph_auth_client *ac);
extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
void *buf, size_t len);
extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
void *buf, size_t len,
void *reply_buf, size_t reply_len);
extern int ceph_entity_name_encode(const char *name, void **p, void *end);
extern int ceph_build_auth(struct ceph_auth_client *ac,
void *msg_buf, size_t msg_len);
extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
#endif

View file

@ -0,0 +1,39 @@
#ifndef __FS_CEPH_BUFFER_H
#define __FS_CEPH_BUFFER_H
#include <linux/kref.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/types.h>
#include <linux/uio.h>
/*
* a simple reference counted buffer.
*
* use kmalloc for small sizes (<= one page), vmalloc for larger
* sizes.
*/
struct ceph_buffer {
struct kref kref; /* reference count; taken/dropped by ceph_buffer_get/put */
struct kvec vec; /* NOTE(review): presumably base pointer + length of the data -- confirm */
size_t alloc_len; /* NOTE(review): presumably the allocated size -- confirm */
bool is_vmalloc; /* NOTE(review): presumably records vmalloc vs kmalloc for freeing -- confirm */
};
extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
extern void ceph_buffer_release(struct kref *kref);
/*
 * Take an additional reference on @b and hand the same buffer back so
 * the call can be used inline in an assignment.
 */
static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
{
	struct kref *ref = &b->kref;

	kref_get(ref);
	return b;
}
/*
 * Drop one reference on @b; ceph_buffer_release() is the kref release
 * callback invoked when the count reaches zero.
 */
static inline void ceph_buffer_put(struct ceph_buffer *b)
{
	struct kref *ref = &b->kref;

	kref_put(ref, ceph_buffer_release);
}
extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
#endif

View file

@ -0,0 +1,38 @@
#ifndef _FS_CEPH_DEBUG_H
#define _FS_CEPH_DEBUG_H
/* prefix every pr_*() message from this module with the module name */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG
/*
* wrap pr_debug to include a filename:lineno prefix on each line.
* this incurs some overhead (kernel size and execution time) due to
* the extra function call at each call site.
*/
# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
extern const char *ceph_file_part(const char *s, int len);
/*
* The leading "%.*s" prints a padding string whose length is derived
* from the module name length; the file name is then printed in a
* fixed 12-character column followed by the line number.
*/
# define dout(fmt, ...) \
pr_debug("%.*s %12.12s:%-4d : " fmt, \
8 - (int)sizeof(KBUILD_MODNAME), " ", \
ceph_file_part(__FILE__, sizeof(__FILE__)), \
__LINE__, ##__VA_ARGS__)
# else
/* faux printk call just to see any compiler warnings. */
# define dout(fmt, ...) do { \
if (0) \
printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
} while (0)
# endif
#else
/*
* or, just wrap pr_debug
*/
# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
#endif
#endif

View file

@ -0,0 +1,109 @@
#ifndef FS_CEPH_FRAG_H
#define FS_CEPH_FRAG_H
/*
* "Frags" are a way to describe a subset of a 32-bit number space,
* using a mask and a value to match against that mask. Any given frag
* (subset of the number space) can be partitioned into 2^n sub-frags.
*
* Frags are encoded into a 32-bit word:
* 8 upper bits = "bits"
* 24 lower bits = "value"
* (We could go to 5+27 bits, but who cares.)
*
* We use the _most_ significant bits of the 24 bit value. This makes
* values logically sort.
*
* Unfortunately, because the "bits" field is still in the high bits, we
* can't sort encoded frags numerically. However, it does allow you
* to feed encoded frags as values into frag_contains_value.
*/
/*
 * Build an encoded frag from a bit count @b and a value @v. Only the
 * top @b bits of the 24-bit value field are kept.
 */
static inline __u32 ceph_frag_make(__u32 b, __u32 v)
{
	__u32 keep = (0xffffffu << (24-b)) & 0xffffffu;

	return (b << 24) | (v & keep);
}
/* Extract the "bits" field (upper 8 bits) of an encoded frag. */
static inline __u32 ceph_frag_bits(__u32 f)
{
	__u32 nbits = f >> 24;

	return nbits;
}
/* Extract the "value" field (lower 24 bits) of an encoded frag. */
static inline __u32 ceph_frag_value(__u32 f)
{
	const __u32 value_field = 0xffffffu;

	return f & value_field;
}
/*
 * Mask covering the significant (most significant) value bits of @f,
 * confined to the 24-bit value field.
 */
static inline __u32 ceph_frag_mask(__u32 f)
{
	__u32 shift = 24 - ceph_frag_bits(f);

	return (0xffffffu << shift) & 0xffffffu;
}
/* Number of low value bits that are NOT significant for @f. */
static inline __u32 ceph_frag_mask_shift(__u32 f)
{
	__u32 nbits = ceph_frag_bits(f);

	return 24 - nbits;
}
/* Nonzero if @v, reduced to @f's significant bits, equals @f's value. */
static inline int ceph_frag_contains_value(__u32 f, __u32 v)
{
	__u32 masked = v & ceph_frag_mask(f);

	return masked == ceph_frag_value(f);
}
/*
 * Nonzero if @sub is at least as specific as @f and falls inside the
 * part of the number space described by @f.
 */
static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
{
	/* a less specific frag can never be contained by us */
	if (ceph_frag_bits(sub) < ceph_frag_bits(f))
		return 0;

	return (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
}
/*
 * Frag one level up: one fewer significant bit, with the value masked
 * by the correspondingly widened mask.
 */
static inline __u32 ceph_frag_parent(__u32 f)
{
	__u32 parent_bits = ceph_frag_bits(f) - 1;
	__u32 parent_value = ceph_frag_value(f) & (ceph_frag_mask(f) << 1);

	return ceph_frag_make(parent_bits, parent_value);
}
/*
 * Nonzero if @f has a parent (bits > 0) and its lowest significant
 * value bit is clear.
 */
static inline int ceph_frag_is_left_child(__u32 f)
{
	__u32 nbits = ceph_frag_bits(f);

	if (nbits == 0)
		return 0;

	return (ceph_frag_value(f) & (0x1000000 >> nbits)) == 0;
}
/*
 * Nonzero if @f has a parent (bits > 0) and its lowest significant
 * value bit is set.
 *
 * The masked result is either 0 or the bit itself (0x1000000 >> bits),
 * so it must be tested against zero; comparing it with 1 only matched
 * when bits == 24.
 */
static inline int ceph_frag_is_right_child(__u32 f)
{
	return ceph_frag_bits(f) > 0 &&
		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
}
/* Same bit count as @f with the lowest significant value bit flipped. */
static inline __u32 ceph_frag_sibling(__u32 f)
{
	__u32 nbits = ceph_frag_bits(f);
	__u32 flipped = ceph_frag_value(f) ^ (0x1000000 >> nbits);

	return ceph_frag_make(nbits, flipped);
}
/* One more significant bit than @f, new bit left clear. */
static inline __u32 ceph_frag_left_child(__u32 f)
{
	__u32 child_bits = ceph_frag_bits(f)+1;

	return ceph_frag_make(child_bits, ceph_frag_value(f));
}
/* One more significant bit than @f, new bit set. */
static inline __u32 ceph_frag_right_child(__u32 f)
{
	__u32 child_bits = ceph_frag_bits(f)+1;
	__u32 child_value = ceph_frag_value(f) |
			    (0x1000000 >> (1+ceph_frag_bits(f)));

	return ceph_frag_make(child_bits, child_value);
}
/*
 * Child of @f that is @by bits more specific; @i is placed in the
 * newly significant bits.
 */
static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
{
	int child_bits = ceph_frag_bits(f) + by;
	__u32 child_value = ceph_frag_value(f) | (i << (24 - child_bits));

	return ceph_frag_make(child_bits, child_value);
}
/* Nonzero if the value field of @f is zero. */
static inline int ceph_frag_is_leftmost(__u32 f)
{
	return !ceph_frag_value(f);
}
/* Nonzero if every significant value bit of @f is set. */
static inline int ceph_frag_is_rightmost(__u32 f)
{
	__u32 value = ceph_frag_value(f);
	__u32 all_set = ceph_frag_mask(f);

	return value == all_set;
}
/*
 * Frag with the same bit count whose value is @f's value advanced by
 * one unit of the lowest significant bit.
 */
static inline __u32 ceph_frag_next(__u32 f)
{
	__u32 nbits = ceph_frag_bits(f);
	__u32 advanced = ceph_frag_value(f) + (0x1000000 >> nbits);

	return ceph_frag_make(nbits, advanced);
}
/*
* comparator to sort frags logically, as when traversing the
* number space in ascending order...
*/
int ceph_frag_compare(__u32 a, __u32 b);
#endif

View file

@ -0,0 +1,728 @@
/*
* ceph_fs.h - Ceph constants and data types to share between kernel and
* user space.
*
* Most types in this file are defined as little-endian, and are
* primarily intended to describe data structures that pass over the
* wire or that are stored on disk.
*
* LGPL2
*/
#ifndef CEPH_FS_H
#define CEPH_FS_H
#include "msgr.h"
#include "rados.h"
/*
* subprotocol versions. when specific message types or high-level
* protocols change, bump the affected components. we rev the
* internal cluster protocols separately from the public,
* client-facing protocol.
*/
#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
#define CEPH_MON_PROTOCOL 5 /* cluster internal */
#define CEPH_OSDC_PROTOCOL 24 /* server/client */
#define CEPH_MDSC_PROTOCOL 32 /* server/client */
#define CEPH_MONC_PROTOCOL 15 /* server/client */
#define CEPH_INO_ROOT 1
#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
#define CEPH_MAX_MON 31
/*
* feature bits
*/
#define CEPH_FEATURE_UID (1<<0)
#define CEPH_FEATURE_NOSRCADDR (1<<1)
#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
#define CEPH_FEATURE_FLOCK (1<<3)
/*
* ceph_file_layout - describe data layout for a file/inode
*/
struct ceph_file_layout {
/* file -> object mapping */
__le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
of page size. */
__le32 fl_stripe_count; /* over this many objects */
__le32 fl_object_size; /* until objects are this big, then move to
new objects */
__le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
/* pg -> disk layout */
__le32 fl_object_stripe_unit; /* for per-object parity, if any */
/* object -> pg layout */
__le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
__le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
} __attribute__ ((packed));
#define CEPH_MIN_STRIPE_UNIT 65536
int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
/* crypto algorithms */
#define CEPH_CRYPTO_NONE 0x0
#define CEPH_CRYPTO_AES 0x1
#define CEPH_AES_IV "cephsageyudagreg"
/* security/authentication protocols */
#define CEPH_AUTH_UNKNOWN 0x0
#define CEPH_AUTH_NONE 0x1
#define CEPH_AUTH_CEPHX 0x2
#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
/*********************************************
* message layer
*/
/*
* message types
*/
/* misc */
#define CEPH_MSG_SHUTDOWN 1
#define CEPH_MSG_PING 2
/* client <-> monitor */
#define CEPH_MSG_MON_MAP 4
#define CEPH_MSG_MON_GET_MAP 5
#define CEPH_MSG_STATFS 13
#define CEPH_MSG_STATFS_REPLY 14
#define CEPH_MSG_MON_SUBSCRIBE 15
#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
#define CEPH_MSG_AUTH 17
#define CEPH_MSG_AUTH_REPLY 18
/* client <-> mds */
#define CEPH_MSG_MDS_MAP 21
#define CEPH_MSG_CLIENT_SESSION 22
#define CEPH_MSG_CLIENT_RECONNECT 23
#define CEPH_MSG_CLIENT_REQUEST 24
#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
#define CEPH_MSG_CLIENT_REPLY 26
#define CEPH_MSG_CLIENT_CAPS 0x310
#define CEPH_MSG_CLIENT_LEASE 0x311
#define CEPH_MSG_CLIENT_SNAP 0x312
#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
/* pool ops */
#define CEPH_MSG_POOLOP_REPLY 48
#define CEPH_MSG_POOLOP 49
/* osd */
#define CEPH_MSG_OSD_MAP 41
#define CEPH_MSG_OSD_OP 42
#define CEPH_MSG_OSD_OPREPLY 43
/* pool operations */
enum {
POOL_OP_CREATE = 0x01,
POOL_OP_DELETE = 0x02,
POOL_OP_AUID_CHANGE = 0x03,
POOL_OP_CREATE_SNAP = 0x11,
POOL_OP_DELETE_SNAP = 0x12,
POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
};
struct ceph_mon_request_header {
__le64 have_version;
__le16 session_mon;
__le64 session_mon_tid;
} __attribute__ ((packed));
struct ceph_mon_statfs {
struct ceph_mon_request_header monhdr;
struct ceph_fsid fsid;
} __attribute__ ((packed));
struct ceph_statfs {
__le64 kb, kb_used, kb_avail;
__le64 num_objects;
} __attribute__ ((packed));
struct ceph_mon_statfs_reply {
struct ceph_fsid fsid;
__le64 version;
struct ceph_statfs st;
} __attribute__ ((packed));
const char *ceph_pool_op_name(int op);
struct ceph_mon_poolop {
struct ceph_mon_request_header monhdr;
struct ceph_fsid fsid;
__le32 pool;
__le32 op;
__le64 auid;
__le64 snapid;
__le32 name_len;
} __attribute__ ((packed));
/*
 * Reply to a pool operation. The fixed-size header is followed by a
 * variable-length payload in data[]; has_data is the flag field that
 * precedes it.
 *
 * data is a C99 flexible array member (not a zero-length array), in
 * line with the other trailing arrays in this file.
 */
struct ceph_mon_poolop_reply {
	struct ceph_mon_request_header monhdr;
	struct ceph_fsid fsid;
	__le32 reply_code;
	__le32 epoch;
	char has_data;
	char data[];
} __attribute__ ((packed));
struct ceph_mon_unmanaged_snap {
__le64 snapid;
} __attribute__ ((packed));
struct ceph_osd_getmap {
struct ceph_mon_request_header monhdr;
struct ceph_fsid fsid;
__le32 start;
} __attribute__ ((packed));
struct ceph_mds_getmap {
struct ceph_mon_request_header monhdr;
struct ceph_fsid fsid;
} __attribute__ ((packed));
struct ceph_client_mount {
struct ceph_mon_request_header monhdr;
} __attribute__ ((packed));
struct ceph_mon_subscribe_item {
__le64 have_version; __le64 have;
__u8 onetime;
} __attribute__ ((packed));
struct ceph_mon_subscribe_ack {
__le32 duration; /* seconds */
struct ceph_fsid fsid;
} __attribute__ ((packed));
/*
* mds states
* > 0 -> in
* <= 0 -> out
*/
#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
empty log. */
#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
operations (import, rename, etc.) */
#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
extern const char *ceph_mds_state_name(int s);
/*
* metadata lock types.
* - these are bitmasks.. we can compose them
* - they also define the lock ordering by the MDS
* - a few of these are internal to the mds
*/
#define CEPH_LOCK_DVERSION 1
#define CEPH_LOCK_DN 2
#define CEPH_LOCK_ISNAP 16
#define CEPH_LOCK_IVERSION 32 /* mds internal */
#define CEPH_LOCK_IFILE 64
#define CEPH_LOCK_IAUTH 128
#define CEPH_LOCK_ILINK 256
#define CEPH_LOCK_IDFT 512 /* dir frag tree */
#define CEPH_LOCK_INEST 1024 /* mds internal */
#define CEPH_LOCK_IXATTR 2048
#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
/* client_session ops */
enum {
CEPH_SESSION_REQUEST_OPEN,
CEPH_SESSION_OPEN,
CEPH_SESSION_REQUEST_CLOSE,
CEPH_SESSION_CLOSE,
CEPH_SESSION_REQUEST_RENEWCAPS,
CEPH_SESSION_RENEWCAPS,
CEPH_SESSION_STALE,
CEPH_SESSION_RECALL_STATE,
};
extern const char *ceph_session_op_name(int op);
struct ceph_mds_session_head {
__le32 op;
__le64 seq;
struct ceph_timespec stamp;
__le32 max_caps, max_leases;
} __attribute__ ((packed));
/* client_request */
/*
* metadata ops.
* & 0x001000 -> write op
* & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
* & 0x100000 -> use weird ino/path trace
*/
/* flag bit set in the op codes below that modify metadata */
#define CEPH_MDS_OP_WRITE 0x001000
/*
* MDS request op codes. Values of the form 0x01xxx carry the
* CEPH_MDS_OP_WRITE bit; values of the form 0x00xxx do not.
*/
enum {
CEPH_MDS_OP_LOOKUP = 0x00100,
CEPH_MDS_OP_GETATTR = 0x00101,
CEPH_MDS_OP_LOOKUPHASH = 0x00102,
CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
CEPH_MDS_OP_SETXATTR = 0x01105,
CEPH_MDS_OP_RMXATTR = 0x01106,
CEPH_MDS_OP_SETLAYOUT = 0x01107,
CEPH_MDS_OP_SETATTR = 0x01108,
CEPH_MDS_OP_SETFILELOCK= 0x01109,
CEPH_MDS_OP_GETFILELOCK= 0x00110,
CEPH_MDS_OP_MKNOD = 0x01201,
CEPH_MDS_OP_LINK = 0x01202,
CEPH_MDS_OP_UNLINK = 0x01203,
CEPH_MDS_OP_RENAME = 0x01204,
CEPH_MDS_OP_MKDIR = 0x01220,
CEPH_MDS_OP_RMDIR = 0x01221,
CEPH_MDS_OP_SYMLINK = 0x01222,
CEPH_MDS_OP_CREATE = 0x01301,
CEPH_MDS_OP_OPEN = 0x00302,
CEPH_MDS_OP_READDIR = 0x00305,
CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
CEPH_MDS_OP_MKSNAP = 0x01400,
CEPH_MDS_OP_RMSNAP = 0x01401,
CEPH_MDS_OP_LSSNAP = 0x00402,
};
extern const char *ceph_mds_op_name(int op);
#define CEPH_SETATTR_MODE 1
#define CEPH_SETATTR_UID 2
#define CEPH_SETATTR_GID 4
#define CEPH_SETATTR_MTIME 8
#define CEPH_SETATTR_ATIME 16
#define CEPH_SETATTR_SIZE 32
#define CEPH_SETATTR_CTIME 64
union ceph_mds_request_args {
struct {
__le32 mask; /* CEPH_CAP_* */
} __attribute__ ((packed)) getattr;
struct {
__le32 mode;
__le32 uid;
__le32 gid;
struct ceph_timespec mtime;
struct ceph_timespec atime;
__le64 size, old_size; /* old_size needed by truncate */
__le32 mask; /* CEPH_SETATTR_* */
} __attribute__ ((packed)) setattr;
struct {
__le32 frag; /* which dir fragment */
__le32 max_entries; /* how many dentries to grab */
__le32 max_bytes;
} __attribute__ ((packed)) readdir;
struct {
__le32 mode;
__le32 rdev;
} __attribute__ ((packed)) mknod;
struct {
__le32 mode;
} __attribute__ ((packed)) mkdir;
struct {
__le32 flags;
__le32 mode;
__le32 stripe_unit; /* layout for newly created file */
__le32 stripe_count; /* ... */
__le32 object_size;
__le32 file_replication;
__le32 preferred;
} __attribute__ ((packed)) open;
struct {
__le32 flags;
} __attribute__ ((packed)) setxattr;
struct {
struct ceph_file_layout layout;
} __attribute__ ((packed)) setlayout;
struct {
__u8 rule; /* currently fcntl or flock */
__u8 type; /* shared, exclusive, remove*/
__le64 pid; /* process id requesting the lock */
__le64 pid_namespace;
__le64 start; /* initial location to lock */
__le64 length; /* num bytes to lock from start */
__u8 wait; /* will caller wait for lock to become available? */
} __attribute__ ((packed)) filelock_change;
} __attribute__ ((packed));
#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
struct ceph_mds_request_head {
__le64 oldest_client_tid;
__le32 mdsmap_epoch; /* on client */
__le32 flags; /* CEPH_MDS_FLAG_* */
__u8 num_retry, num_fwd; /* count retry, fwd attempts */
__le16 num_releases; /* # include cap/lease release records */
__le32 op; /* mds op code */
__le32 caller_uid, caller_gid;
__le64 ino; /* use this ino for openc, mkdir, mknod,
etc. (if replaying) */
union ceph_mds_request_args args;
} __attribute__ ((packed));
/* cap/lease release record */
struct ceph_mds_request_release {
__le64 ino, cap_id; /* ino and unique cap id */
__le32 caps, wanted; /* new issued, wanted */
__le32 seq, issue_seq, mseq;
__le32 dname_seq; /* if releasing a dentry lease, a */
__le32 dname_len; /* string follows. */
} __attribute__ ((packed));
/* client reply */
struct ceph_mds_reply_head {
__le32 op;
__le32 result;
__le32 mdsmap_epoch;
__u8 safe; /* true if committed to disk */
__u8 is_dentry, is_target; /* true if dentry, target inode records
are included with reply */
} __attribute__ ((packed));
/* one for each node split */
struct ceph_frag_tree_split {
__le32 frag; /* this frag splits... */
__le32 by; /* ...by this many bits */
} __attribute__ ((packed));
struct ceph_frag_tree_head {
__le32 nsplits; /* num ceph_frag_tree_split records */
struct ceph_frag_tree_split splits[];
} __attribute__ ((packed));
/* capability issue, for bundling with mds reply */
struct ceph_mds_reply_cap {
__le32 caps, wanted; /* caps issued, wanted */
__le64 cap_id;
__le32 seq, mseq;
__le64 realm; /* snap realm */
__u8 flags; /* CEPH_CAP_FLAG_* */
} __attribute__ ((packed));
#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
/* inode record, for bundling with mds reply */
struct ceph_mds_reply_inode {
__le64 ino;
__le64 snapid;
__le32 rdev;
__le64 version; /* inode version */
__le64 xattr_version; /* version for xattr blob */
struct ceph_mds_reply_cap cap; /* caps issued for this inode */
struct ceph_file_layout layout;
struct ceph_timespec ctime, mtime, atime;
__le32 time_warp_seq;
__le64 size, max_size, truncate_size;
__le32 truncate_seq;
__le32 mode, uid, gid;
__le32 nlink;
__le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
struct ceph_timespec rctime;
struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
} __attribute__ ((packed));
/* followed by frag array, then symlink string, then xattr blob */
/* reply_lease follows dname, and reply_inode */
struct ceph_mds_reply_lease {
__le16 mask; /* lease type(s) */
__le32 duration_ms; /* lease duration */
__le32 seq;
} __attribute__ ((packed));
struct ceph_mds_reply_dirfrag {
__le32 frag; /* fragment */
__le32 auth; /* auth mds, if this is a delegation point */
__le32 ndist; /* number of mds' this is replicated on */
__le32 dist[];
} __attribute__ ((packed));
#define CEPH_LOCK_FCNTL 1
#define CEPH_LOCK_FLOCK 2
#define CEPH_LOCK_SHARED 1
#define CEPH_LOCK_EXCL 2
#define CEPH_LOCK_UNLOCK 4
struct ceph_filelock {
__le64 start;/* file offset to start lock at */
__le64 length; /* num bytes to lock; 0 for all following start */
__le64 client; /* which client holds the lock */
__le64 pid; /* process id holding the lock on the client */
__le64 pid_namespace;
__u8 type; /* shared lock, exclusive lock, or unlock */
} __attribute__ ((packed));
/* file access modes */
#define CEPH_FILE_MODE_PIN 0
#define CEPH_FILE_MODE_RD 1
#define CEPH_FILE_MODE_WR 2
#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
int ceph_flags_to_mode(int flags);
/*
 * capability bits
 *
 * A capability word is composed from the generic CEPH_CAP_G* bits,
 * shifted left by a per-lock CEPH_CAP_S* amount (see the composed
 * values below).
 */
#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
/* generic cap bits */
#define CEPH_CAP_GSHARED 1 /* client can read */
#define CEPH_CAP_GEXCL 2 /* client can read and update */
#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
#define CEPH_CAP_GRD 8 /* (file) client can read */
#define CEPH_CAP_GWR 16 /* (file) client can write */
#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
/* per-lock shift */
#define CEPH_CAP_SAUTH 2
#define CEPH_CAP_SLINK 4
#define CEPH_CAP_SXATTR 6
#define CEPH_CAP_SFILE 8
#define CEPH_CAP_SFLOCK 20
#define CEPH_CAP_BITS 22
/* composed values */
#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
/* argument is parenthesized so CEPH_CAP_FILE(a | b) shifts the whole mask */
#define CEPH_CAP_FILE(x) ((x) << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK)
#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK)
/* cap masks (for getattr) */
#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
CEPH_CAP_AUTH_SHARED | \
CEPH_CAP_LINK_SHARED | \
CEPH_CAP_FILE_SHARED | \
CEPH_CAP_XATTR_SHARED)
#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
CEPH_CAP_LINK_SHARED | \
CEPH_CAP_XATTR_SHARED | \
CEPH_CAP_FILE_SHARED)
#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
CEPH_CAP_FILE_CACHE)
#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
CEPH_CAP_LINK_EXCL | \
CEPH_CAP_XATTR_EXCL | \
CEPH_CAP_FILE_EXCL)
#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
CEPH_CAP_FILE_EXCL)
#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
CEPH_CAP_PIN)
#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
CEPH_LOCK_IXATTR)
int ceph_caps_for_mode(int mode);
enum {
CEPH_CAP_OP_GRANT, /* mds->client grant */
CEPH_CAP_OP_REVOKE, /* mds->client revoke */
CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
CEPH_CAP_OP_UPDATE, /* client->mds update */
CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
CEPH_CAP_OP_RENEW, /* client->mds renewal request */
};
extern const char *ceph_cap_op_name(int op);
/*
* caps message, used for capability callbacks, acks, requests, etc.
*/
struct ceph_mds_caps {
__le32 op; /* CEPH_CAP_OP_* */
__le64 ino, realm;
__le64 cap_id;
__le32 seq, issue_seq;
__le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
__le32 migrate_seq;
__le64 snap_follows;
__le32 snap_trace_len;
/* authlock */
__le32 uid, gid, mode;
/* linklock */
__le32 nlink;
/* xattrlock */
__le32 xattr_len;
__le64 xattr_version;
/* filelock */
__le64 size, max_size, truncate_size;
__le32 truncate_seq;
struct ceph_timespec mtime, atime, ctime;
struct ceph_file_layout layout;
__le32 time_warp_seq;
} __attribute__ ((packed));
/* cap release msg head */
struct ceph_mds_cap_release {
__le32 num; /* number of cap_items that follow */
} __attribute__ ((packed));
struct ceph_mds_cap_item {
__le64 ino;
__le64 cap_id;
__le32 migrate_seq, seq;
} __attribute__ ((packed));
#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
extern const char *ceph_lease_op_name(int o);
/* lease msg header */
struct ceph_mds_lease {
__u8 action; /* CEPH_MDS_LEASE_* */
__le16 mask; /* which lease */
__le64 ino;
__le64 first, last; /* snap range */
__le32 seq;
__le32 duration_ms; /* duration of renewal */
} __attribute__ ((packed));
/* followed by a __le32+string for dname */
/* client reconnect */
struct ceph_mds_cap_reconnect {
__le64 cap_id;
__le32 wanted;
__le32 issued;
__le64 snaprealm;
__le64 pathbase; /* base ino for our path to this ino */
__le32 flock_len; /* size of flock state blob, if any */
} __attribute__ ((packed));
/* followed by flock blob */
struct ceph_mds_cap_reconnect_v1 {
__le64 cap_id;
__le32 wanted;
__le32 issued;
__le64 size;
struct ceph_timespec mtime, atime;
__le64 snaprealm;
__le64 pathbase; /* base ino for our path to this ino */
} __attribute__ ((packed));
struct ceph_mds_snaprealm_reconnect {
__le64 ino; /* snap realm base */
__le64 seq; /* snap seq for this snap realm */
__le64 parent; /* parent realm */
} __attribute__ ((packed));
/*
* snaps
*/
enum {
CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
CEPH_SNAP_OP_CREATE,
CEPH_SNAP_OP_DESTROY,
CEPH_SNAP_OP_SPLIT,
};
extern const char *ceph_snap_op_name(int o);
/* snap msg header */
struct ceph_mds_snap_head {
__le32 op; /* CEPH_SNAP_OP_* */
__le64 split; /* ino to split off, if any */
__le32 num_split_inos; /* # inos belonging to new child realm */
__le32 num_split_realms; /* # child realms under new child realm */
__le32 trace_len; /* size of snap trace blob */
} __attribute__ ((packed));
/* followed by split ino list, then split realms, then the trace blob */
/*
* encode info about a snaprealm, as viewed by a client
*/
struct ceph_mds_snap_realm {
__le64 ino; /* ino */
__le64 created; /* snap: when created */
__le64 parent; /* ino: parent realm */
__le64 parent_since; /* snap: same parent since */
__le64 seq; /* snap: version */
__le32 num_snaps;
__le32 num_prior_parent_snaps;
} __attribute__ ((packed));
/* followed by my snap list, then prior parent snap list */
#endif

View file

@ -0,0 +1,13 @@
#ifndef FS_CEPH_HASH_H
#define FS_CEPH_HASH_H
#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
extern const char *ceph_str_hash_name(int type);
#endif

View file

@ -0,0 +1,33 @@
#ifndef _FS_CEPH_DEBUGFS_H
#define _FS_CEPH_DEBUGFS_H
#include "ceph_debug.h"
#include "types.h"
/*
 * Define name##_open() and name##_fops for a single-record seq_file
 * debugfs entry whose show function is 'name'.
 *
 * inode->i_private is handed to single_open() as the data pointer, so
 * it becomes seq_file->private on success. When single_open() fails
 * its error is returned directly; file->private_data is never
 * dereferenced in that case.
 */
#define CEPH_DEFINE_SHOW_FUNC(name)					\
static int name##_open(struct inode *inode, struct file *file)		\
{									\
	return single_open(file, name, inode->i_private);		\
}									\
									\
static const struct file_operations name##_fops = {			\
	.open		= name##_open,					\
	.read		= seq_read,					\
	.llseek		= seq_lseek,					\
	.release	= single_release,				\
};
/* debugfs.c */
extern int ceph_debugfs_init(void);
extern void ceph_debugfs_cleanup(void);
extern int ceph_debugfs_client_init(struct ceph_client *client);
extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
#endif

201
include/linux/ceph/decode.h Normal file
View file

@ -0,0 +1,201 @@
#ifndef __CEPH_DECODE_H
#define __CEPH_DECODE_H
#include <asm/unaligned.h>
#include <linux/time.h>
#include "types.h"
/*
* in all cases,
* void **p pointer to position pointer
* void *end pointer to end of buffer (last byte + 1)
*/
/* Read a little-endian u64 at *p (any alignment) and advance *p past it. */
static inline u64 ceph_decode_64(void **p)
{
	void *cur = *p;
	u64 val = get_unaligned_le64(cur);

	*p = cur + sizeof(val);
	return val;
}
/* Read a little-endian u32 at *p (any alignment) and advance *p past it. */
static inline u32 ceph_decode_32(void **p)
{
	void *cur = *p;
	u32 val = get_unaligned_le32(cur);

	*p = cur + sizeof(val);
	return val;
}
/* Read a little-endian u16 at *p (any alignment) and advance *p past it. */
static inline u16 ceph_decode_16(void **p)
{
	void *cur = *p;
	u16 val = get_unaligned_le16(cur);

	*p = cur + sizeof(val);
	return val;
}
/* Read one byte at *p and advance *p past it. */
static inline u8 ceph_decode_8(void **p)
{
	u8 *cur = *p;
	u8 val = *cur;

	*p = cur + 1;
	return val;
}
/* Copy @n raw bytes from *p into @pv and advance *p by @n. */
static inline void ceph_decode_copy(void **p, void *pv, size_t n)
{
	void *src = *p;

	memcpy(pv, src, n);
	*p = (char *)src + n;
}
/*
 * bounds check input.
 *
 * ceph_has_room() reports whether @n more bytes are available between
 * *p and @end. The remaining space is computed by subtraction rather
 * than by forming *p + n, so a huge (or negative, once converted to
 * size_t) length cannot wrap the pointer and slip past the check.
 */
static inline int ceph_has_room(void **p, void *end, size_t n)
{
	return end >= *p && n <= (size_t)((char *)end - (char *)*p);
}
/* jump to label 'bad' unless at least n bytes remain before end */
#define ceph_decode_need(p, end, n, bad)			\
	do {							\
		if (unlikely(!ceph_has_room(p, end, n)))	\
			goto bad;				\
	} while (0)
/*
* bounds-checked decoders: each first runs ceph_decode_need() for the
* size about to be consumed (jumping to label 'bad' on failure), then
* decodes into v and advances *p.
*/
#define ceph_decode_64_safe(p, end, v, bad) \
do { \
ceph_decode_need(p, end, sizeof(u64), bad); \
v = ceph_decode_64(p); \
} while (0)
#define ceph_decode_32_safe(p, end, v, bad) \
do { \
ceph_decode_need(p, end, sizeof(u32), bad); \
v = ceph_decode_32(p); \
} while (0)
#define ceph_decode_16_safe(p, end, v, bad) \
do { \
ceph_decode_need(p, end, sizeof(u16), bad); \
v = ceph_decode_16(p); \
} while (0)
#define ceph_decode_8_safe(p, end, v, bad) \
do { \
ceph_decode_need(p, end, sizeof(u8), bad); \
v = ceph_decode_8(p); \
} while (0)
/* bounds-checked raw copy of n bytes into pv */
#define ceph_decode_copy_safe(p, end, pv, n, bad) \
do { \
ceph_decode_need(p, end, n, bad); \
ceph_decode_copy(p, pv, n); \
} while (0)
/*
* struct ceph_timespec <-> struct timespec
*/
static inline void ceph_decode_timespec(struct timespec *ts,
const struct ceph_timespec *tv)
{
ts->tv_sec = le32_to_cpu(tv->tv_sec);
ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
}
static inline void ceph_encode_timespec(struct ceph_timespec *tv,
const struct timespec *ts)
{
tv->tv_sec = cpu_to_le32(ts->tv_sec);
tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
}
/*
* sockaddr_storage <-> ceph_sockaddr
*/
/*
 * Convert the address family of @a, in place, from host byte order to
 * big-endian.  Only ss_family is touched.
 */
static inline void ceph_encode_addr(struct ceph_entity_addr *a)
{
	__be16 wire_family;

	wire_family = htons(a->in_addr.ss_family);
	a->in_addr.ss_family = *(__u16 *)&wire_family;
}
/*
 * Convert the address family of @a, in place, from big-endian back to
 * host byte order.  Only ss_family is touched.
 */
static inline void ceph_decode_addr(struct ceph_entity_addr *a)
{
	__be16 wire_family = *(__be16 *)&a->in_addr.ss_family;

	a->in_addr.ss_family = ntohs(wire_family);
	/*
	 * 512 is 0x0200, i.e. the value 2 with its two bytes swapped;
	 * presumably this catches an address whose family ended up in the
	 * wrong byte order -- confirm.
	 */
	WARN_ON(a->in_addr.ss_family == 512);
}
/*
* encoders
*/
/* Store @v at *p as unaligned little-endian 64 bits and step *p past it. */
static inline void ceph_encode_64(void **p, u64 v)
{
	__le64 *dst = *p;

	put_unaligned_le64(v, dst);
	*p = dst + 1;
}
/* Store @v at *p as unaligned little-endian 32 bits and step *p past it. */
static inline void ceph_encode_32(void **p, u32 v)
{
	__le32 *dst = *p;

	put_unaligned_le32(v, dst);
	*p = dst + 1;
}
/* Store @v at *p as unaligned little-endian 16 bits and step *p past it. */
static inline void ceph_encode_16(void **p, u16 v)
{
	__le16 *dst = *p;

	put_unaligned_le16(v, dst);
	*p = dst + 1;
}
/* Store the byte @v at *p and step *p past it. */
static inline void ceph_encode_8(void **p, u8 v)
{
	u8 *dst = *p;

	*dst = v;
	*p = dst + 1;
}
/*
 * Copy @len raw bytes from @s to the current position, then advance *p
 * by @len.  No bounds checking is done here.
 */
static inline void ceph_encode_copy(void **p, const void *s, int len)
{
	void *dst = *p;

	memcpy(dst, s, len);
	*p += len;
}
/*
* filepath, string encoders
*/
/*
 * Encode a filepath: a leading byte with value 1 (presumably an
 * encoding version -- confirm), the 64-bit @ino, and @path as a 32-bit
 * length followed by the path bytes without a terminating NUL.  A NULL
 * @path is encoded as a zero-length string.
 *
 * The caller is responsible for sizing the buffer; not having room
 * before @end for everything written here is a BUG.
 */
static inline void ceph_encode_filepath(void **p, void *end,
					u64 ino, const char *path)
{
	u32 len = path ? strlen(path) : 0;

	/* leading byte + ino + length prefix + path bytes */
	BUG_ON(*p + 1 + sizeof(ino) + sizeof(len) + len > end);
	ceph_encode_8(p, 1);
	ceph_encode_64(p, ino);
	ceph_encode_32(p, len);
	if (len)
		memcpy(*p, path, len);
	*p += len;
}
/*
 * Encode @len bytes at @s as a 32-bit length followed by the bytes.
 * Not having room before @end for the prefix plus the bytes is a BUG.
 */
static inline void ceph_encode_string(void **p, void *end,
				      const char *s, u32 len)
{
	void *limit = *p + sizeof(len) + len;

	BUG_ON(limit > end);
	ceph_encode_32(p, len);
	if (len != 0) {
		memcpy(*p, s, len);
		*p += len;
	}
}
/*
 * Bounds-checked encoders: jump to label @bad, without writing
 * anything, when the bytes about to be written do not fit before @end.
 *
 * ceph_encode_need() compares @n against the space that is left rather
 * than forming *@p + @n, so an oversized @n cannot wrap the pointer and
 * pass the check.
 */
#define ceph_encode_need(p, end, n, bad)				\
	do {								\
		const char *__need_pos = *(p);				\
		const char *__need_end = (end);				\
									\
		if (unlikely(__need_end < __need_pos ||			\
			     (size_t)(__need_end - __need_pos) < (n)))	\
			goto bad;					\
	} while (0)

#define ceph_encode_64_safe(p, end, v, bad)			\
	do {							\
		ceph_encode_need(p, end, sizeof(u64), bad);	\
		ceph_encode_64(p, v);				\
	} while (0)
#define ceph_encode_32_safe(p, end, v, bad)			\
	do {							\
		ceph_encode_need(p, end, sizeof(u32), bad);	\
		ceph_encode_32(p, v);				\
	} while (0)
#define ceph_encode_16_safe(p, end, v, bad)			\
	do {							\
		ceph_encode_need(p, end, sizeof(u16), bad);	\
		ceph_encode_16(p, v);				\
	} while (0)

#define ceph_encode_copy_safe(p, end, pv, n, bad)		\
	do {							\
		ceph_encode_need(p, end, n, bad);		\
		ceph_encode_copy(p, pv, n);			\
	} while (0)
/*
 * ceph_encode_string() writes a 32-bit length prefix in front of the
 * @n string bytes, so both have to fit; checking for @n alone would let
 * a too-small buffer reach the BUG_ON() inside ceph_encode_string().
 */
#define ceph_encode_string_safe(p, end, s, n, bad)			\
	do {								\
		ceph_encode_need(p, end, sizeof(u32) + (n), bad);	\
		ceph_encode_string(p, end, s, n);			\
	} while (0)
#endif

View file

@ -0,0 +1,249 @@
#ifndef _FS_CEPH_LIBCEPH_H
#define _FS_CEPH_LIBCEPH_H
#include "ceph_debug.h"
#include <asm/unaligned.h>
#include <linux/backing-dev.h>
#include <linux/completion.h>
#include <linux/exportfs.h>
#include <linux/fs.h>
#include <linux/mempool.h>
#include <linux/pagemap.h>
#include <linux/wait.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include "types.h"
#include "messenger.h"
#include "msgpool.h"
#include "mon_client.h"
#include "osd_client.h"
#include "ceph_fs.h"
/*
 * Supported features
 *
 * Default feature bits: both the supported set and the required set
 * consist of CEPH_FEATURE_NOSRCADDR.
 */
#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR
#define CEPH_FEATURE_REQUIRED_DEFAULT CEPH_FEATURE_NOSRCADDR
/*
 * mount options
 *
 * Bit flags kept in ceph_options.flags.
 */
#define CEPH_OPT_FSID      (1<<0)
#define CEPH_OPT_NOSHARE   (1<<1) /* don't share client with other sbs */
#define CEPH_OPT_MYIP      (1<<2) /* specified my ip */
#define CEPH_OPT_NOCRC     (1<<3) /* no data crc on writes */

#define CEPH_OPT_DEFAULT   (0)

/*
 * Set / test one CEPH_OPT_* flag (named without the CEPH_OPT_ prefix)
 * on a client's options.  Both expand to a single parenthesized
 * expression with no trailing semicolon, so they can be used like a
 * function call, including as the body of an if/else.
 */
#define ceph_set_opt(client, opt) \
	((client)->options->flags |= CEPH_OPT_##opt)
#define ceph_test_opt(client, opt) \
	(!!((client)->options->flags & CEPH_OPT_##opt))
/*
 * Options for a ceph_client (see ceph_parse_options() and
 * ceph_create_client()).  The field order matters: see the comment in
 * the middle of the struct.
 */
struct ceph_options {
/* CEPH_OPT_* bits, see ceph_set_opt()/ceph_test_opt() */
int flags;
struct ceph_fsid fsid;
struct ceph_entity_addr my_addr;
/* NOTE(review): units not stated here; the OSD timeout default is documented as seconds */
int mount_timeout;
int osd_idle_ttl;
int osd_timeout;
int osd_keepalive_timeout;
/*
* any type that can't be simply compared or doesn't need
* to be compared should go beyond this point,
* ceph_compare_options() should be updated accordingly
*/
struct ceph_entity_addr *mon_addr; /* should be the first
pointer type of args */
/* presumably the number of entries at mon_addr -- confirm */
int num_mon;
char *name;
char *secret;
};
/*
 * defaults
 *
 * NOTE(review): only CEPH_OSD_TIMEOUT_DEFAULT states its unit; the other
 * timeouts are presumably seconds as well -- confirm.
 */
#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
#define CEPH_OSD_KEEPALIVE_DEFAULT 5
#define CEPH_OSD_IDLE_TTL_DEFAULT 60
#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
/* 16 MB each */
#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
#define CEPH_AUTH_NAME_DEFAULT "guest"
/*
 * Delay telling the MDS we no longer want caps, in case we reopen
 * the file.  Delay a minimum amount of time, even if we send a cap
 * message for some other reason.  Otherwise, take the opportunity to
 * update the mds to avoid sending another message later.
 */
#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
/* CEPH_CAPS_PER_RELEASE is defined outside this header */
#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
/*
 * mount state
 *
 * Values are assigned implicitly, starting at 0 for CEPH_MOUNT_MOUNTING
 * and increasing in the order listed.
 */
enum {
CEPH_MOUNT_MOUNTING,
CEPH_MOUNT_MOUNTED,
CEPH_MOUNT_UNMOUNTING,
CEPH_MOUNT_UNMOUNTED,
CEPH_MOUNT_SHUTDOWN,
};
/*
 * subtract jiffies
 *
 * Return how far @a lies after @b.  @b being later than @a (in
 * time_after() terms) is treated as a bug.
 *
 * The subtraction is done on the unsigned values: unsigned arithmetic
 * wraps, so the distance is correct even if the counter rolled over
 * between @b and @a.  Subtracting the two values as signed longs
 * instead overflows when they straddle LONG_MAX.
 */
static inline unsigned long time_sub(unsigned long a, unsigned long b)
{
	BUG_ON(time_after(b, a));
	return a - b;
}
struct ceph_mds_client;
/*
 * per client state
 *
 * possibly shared by multiple mount points, if they are
 * mounting the same ceph filesystem/cluster.
 *
 * Embeds the monitor client and the OSD client; file system specific
 * state lives outside this struct (reachable through ->private,
 * presumably the pointer passed to ceph_create_client() -- confirm).
 */
struct ceph_client {
struct ceph_fsid fsid;
bool have_fsid;
void *private;
struct ceph_options *options;
struct mutex mount_mutex; /* serialize mount attempts */
wait_queue_head_t auth_wq;
int auth_err;
/*
 * Optional hook taking the client and a message; per the commit
 * description it is the monitor client's handler for message types it
 * does not know itself (e.g. the mds map).
 */
int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *);
u32 supported_features;
u32 required_features;
struct ceph_messenger *msgr; /* messenger instance */
struct ceph_mon_client monc;
struct ceph_osd_client osdc;
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_dir;
struct dentry *debugfs_monmap;
struct dentry *debugfs_osdmap;
#endif
};
/*
* snapshots
*/
/*
 * A "snap context" is the set of existing snapshots when we
 * write data.  It is used by the OSD to guide its COW behavior.
 *
 * The ceph_snap_context is refcounted, and attached to each dirty
 * page, indicating which context the dirty data belonged when it was
 * dirtied.
 *
 * References are taken and dropped with ceph_get_snap_context() and
 * ceph_put_snap_context(); the latter kfree()s the context when nref
 * drops to zero.
 */
struct ceph_snap_context {
atomic_t nref;
u64 seq;
/* presumably the number of entries in snaps[] -- confirm */
int num_snaps;
/* flexible array member: storage is allocated together with the struct */
u64 snaps[];
};
static inline struct ceph_snap_context *
ceph_get_snap_context(struct ceph_snap_context *sc)
{
/*
printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
atomic_read(&sc->nref)+1);
*/
if (sc)
atomic_inc(&sc->nref);
return sc;
}
/*
 * Drop a reference on @sc and free it with kfree() once the last
 * reference is gone.  A NULL @sc is ignored.
 */
static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
{
	if (sc && atomic_dec_and_test(&sc->nref))
		kfree(sc);
}
/*
 * Number of pages touched by the byte range [off, off + len) when the
 * data keeps its alignment within the page: index of the first page
 * past the range minus index of the page holding @off.
 */
static inline int calc_pages_for(u64 off, u64 len)
{
	u64 first_page = off >> PAGE_CACHE_SHIFT;
	u64 end_page = (off + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	return end_page - first_page;
}
/* ceph_common.c */
extern const char *ceph_msg_type_name(int type);
extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
/* slab caches; defined outside this header */
extern struct kmem_cache *ceph_inode_cachep;
extern struct kmem_cache *ceph_cap_cachep;
extern struct kmem_cache *ceph_dentry_cachep;
extern struct kmem_cache *ceph_file_cachep;
/*
 * Option handling.  ceph_parse_options() hands back a ceph_options
 * through @popt; @parse_extra_token and @private let the caller supply
 * its own token parser (presumably invoked for tokens the common parser
 * does not recognise -- confirm in ceph_common.c).
 */
extern int ceph_parse_options(struct ceph_options **popt, char *options,
const char *dev_name, const char *dev_name_end,
int (*parse_extra_token)(char *c, void *private),
void *private);
extern void ceph_destroy_options(struct ceph_options *opt);
extern int ceph_compare_options(struct ceph_options *new_opt,
struct ceph_client *client);
/* client lifetime and session setup */
extern struct ceph_client *ceph_create_client(struct ceph_options *opt,
void *private);
extern u64 ceph_client_id(struct ceph_client *client);
extern void ceph_destroy_client(struct ceph_client *client);
extern int __ceph_open_session(struct ceph_client *client,
unsigned long started);
extern int ceph_open_session(struct ceph_client *client);
/*
 * pagevec.c
 *
 * Helpers operating on arrays of struct page pointers ("page vectors").
 */

/* page vector built from a user buffer (@data, @off, @len) and its put */
extern struct page **ceph_get_direct_page_vector(const char __user *data,
						 int num_pages,
						 loff_t off, size_t len);
extern void ceph_put_page_vector(struct page **pages, int num_pages);

/* allocate / release a page vector */
extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
extern void ceph_release_page_vector(struct page **pages, int num_pages);

/* copy data into or out of a page vector, or zero a range of it */
extern int ceph_copy_user_to_page_vector(struct page **pages,
					 const char __user *data,
					 loff_t off, size_t len);
extern int ceph_copy_to_page_vector(struct page **pages,
				    const char *data,
				    loff_t off, size_t len);
extern int ceph_copy_from_page_vector(struct page **pages,
				      char *data,
				      loff_t off, size_t len);
extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data,
					 loff_t off, size_t len);
extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
#endif /* _FS_CEPH_LIBCEPH_H */

View file

@ -0,0 +1,62 @@
#ifndef _FS_CEPH_MDSMAP_H
#define _FS_CEPH_MDSMAP_H
#include "types.h"
/*
 * mds map - describe servers in the mds cluster.
 *
 * we limit fields to those the client actually cares about
 */
/* per-mds entry; ceph_mdsmap.m_info is indexed by mds number */
struct ceph_mds_info {
u64 global_id;
struct ceph_entity_addr addr;
/* ceph_mdsmap_get_state() reports CEPH_MDS_STATE_DNE for out-of-range mds numbers */
s32 state;
/* presumably the number of entries at export_targets -- confirm */
int num_export_targets;
bool laggy;
u32 *export_targets;
};
/*
 * Decoded mds map (see ceph_mdsmap_decode() / ceph_mdsmap_destroy()).
 */
struct ceph_mdsmap {
u32 m_epoch, m_client_epoch, m_last_failure;
u32 m_root;
u32 m_session_timeout; /* seconds */
u32 m_session_autoclose; /* seconds */
u64 m_max_file_size;
u32 m_max_mds; /* bound used by the accessors when indexing m_info */
struct ceph_mds_info *m_info;
/* which object pools file data can be stored in */
int m_num_data_pg_pools;
u32 *m_data_pg_pools;
u32 m_cas_pg_pool;
};
/*
 * Address of mds @w, or NULL when @w is not below m_max_mds.  @w is
 * compared against the unsigned m_max_mds, so a negative @w converts to
 * a large unsigned value and also yields NULL.
 */
static inline struct ceph_entity_addr *
ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
{
	return w >= m->m_max_mds ? NULL : &m->m_info[w].addr;
}
/*
 * State of mds @w, or CEPH_MDS_STATE_DNE when @w is not below
 * m_max_mds.  A negative @w is a BUG.
 */
static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
{
	BUG_ON(w < 0);
	return w < m->m_max_mds ? m->m_info[w].state : CEPH_MDS_STATE_DNE;
}
/*
 * Laggy flag of mds @w; false for any @w outside [0, m_max_mds).
 */
static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
{
	if (w < 0 || w >= m->m_max_mds)
		return false;

	return m->m_info[w].laggy;
}
/*
 * Implemented outside this header.  ceph_mdsmap_decode() takes the
 * same (position pointer, end of buffer) pair as the decode helpers.
 */
extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
#endif

View file

@ -0,0 +1,261 @@
#ifndef __FS_CEPH_MESSENGER_H
#define __FS_CEPH_MESSENGER_H
#include <linux/kref.h>
#include <linux/mutex.h>
#include <linux/net.h>
#include <linux/radix-tree.h>
#include <linux/uio.h>
#include <linux/version.h>
#include <linux/workqueue.h>
#include "types.h"
#include "buffer.h"
/* forward declarations: used by pointer below before they are defined */
struct ceph_msg;
struct ceph_connection;
extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
/*
 * Ceph defines these callbacks for handling connection events.
 *
 * A ceph_connection points at one of these tables through its ->ops
 * member.
 */
struct ceph_connection_operations {
/* presumably take / drop a reference on whatever owns the connection -- confirm */
struct ceph_connection *(*get)(struct ceph_connection *);
void (*put)(struct ceph_connection *);
/* handle an incoming message. */
void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
/* authorize an outgoing connection */
int (*get_authorizer) (struct ceph_connection *con,
void **buf, int *len, int *proto,
void **reply_buf, int *reply_len, int force_new);
int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
int (*invalidate_authorizer)(struct ceph_connection *con);
/* protocol version mismatch */
void (*bad_proto) (struct ceph_connection *con);
/* there was some error on the socket (disconnect, whatever) */
void (*fault) (struct ceph_connection *con);
/* a remote host has terminated a message exchange session, and messages
* we sent (or they tried to send us) may be lost. */
void (*peer_reset) (struct ceph_connection *con);
/*
 * Provide the message to receive into for the incoming header @hdr.
 * NOTE(review): *skip presumably asks the messenger to discard the
 * incoming message instead -- confirm in messenger.c.
 */
struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
struct ceph_msg_header *hdr,
int *skip);
};
/*
 * Expands to TWO printf-style arguments: the entity type name string
 * and the entity number.
 *
 * use format string %s%d
 * NOTE(review): the number comes from le64_to_cpu() and is therefore 64
 * bits wide, so a 64-bit conversion (e.g. %llu) looks like the proper
 * match rather than %d -- confirm against the callers.
 */
#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
/*
 * Messenger instance: our own identity plus settings and counters that
 * connections refer to through their ->msgr pointer.
 */
struct ceph_messenger {
struct ceph_entity_inst inst; /* my name+address */
/* presumably inst.addr in its encoded (wire) form -- confirm */
struct ceph_entity_addr my_enc_addr;
struct page *zero_page; /* used in certain error cases */
bool nocrc;
/*
* the global_seq counts connections i (attempt to) initiate
* in order to disambiguate certain connect race conditions.
*/
u32 global_seq;
/* presumably protects global_seq -- confirm */
spinlock_t global_seq_lock;
u32 supported_features;
u32 required_features;
};
/*
 * a single message.  it contains a header (src, dest, message type, etc.),
 * footer (crc values, mainly), a "front" message body, and possibly a
 * data payload (stored in some number of pages).
 *
 * Reference counted through ->kref; see ceph_msg_get()/ceph_msg_put().
 */
struct ceph_msg {
struct ceph_msg_header hdr; /* header */
struct ceph_msg_footer footer; /* footer */
struct kvec front; /* unaligned blobs of message */
struct ceph_buffer *middle;
struct page **pages; /* data payload. NOT OWNER. */
unsigned nr_pages; /* size of page array */
struct ceph_pagelist *pagelist; /* instead of pages */
struct list_head list_head;
struct kref kref;
struct bio *bio; /* instead of pages/pagelist */
struct bio *bio_iter; /* bio iterator */
int bio_seg; /* current bio segment */
struct ceph_pagelist *trail; /* the trailing part of the data */
/* presumably records that front.iov_base came from vmalloc -- confirm */
bool front_is_vmalloc;
bool more_to_follow;
bool needs_out_seq;
int front_max;
/* presumably the msgpool this message was taken from, if any -- confirm */
struct ceph_msgpool *pool;
};
/*
 * Progress cursor through a message's data payload; a ceph_connection
 * keeps one for the outgoing and one for the incoming message.
 */
struct ceph_msg_pos {
int page, page_pos; /* which page; offset in page */
int data_pos; /* offset in data payload */
int did_page_crc; /* true if we've calculated crc for current page */
};
/*
 * ceph connection fault delay defaults, for exponential backoff
 *
 * Expressed in jiffies: half a second to start with, five minutes at
 * most.
 */
#define BASE_DELAY_INTERVAL (HZ/2)
#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
/*
 * ceph_connection state bit flags
 *
 * QUEUED and BUSY are used together to ensure that only a single
 * thread is currently opening, reading or writing data to the socket.
 *
 * NOTE(review): the values are small consecutive numbers rather than
 * masks, so they are presumably bit positions within
 * ceph_connection.state -- confirm in messenger.c.  The values 7, 9 and
 * 12 are not used.
 */
#define LOSSYTX 0 /* we can close channel or drop messages on errors */
#define CONNECTING 1
#define NEGOTIATING 2
#define KEEPALIVE_PENDING 3
#define WRITE_PENDING 4 /* we have data ready to send */
#define QUEUED 5 /* there is work queued on this connection */
#define BUSY 6 /* work is being done */
#define STANDBY 8 /* no outgoing messages, socket closed. we keep
* the ceph_connection around to maintain shared
* state with the peer. */
#define CLOSED 10 /* we've closed the connection */
#define SOCK_CLOSED 11 /* socket state changed to closed */
#define OPENING 13 /* open connection w/ (possibly new) peer */
#define DEAD 14 /* dead, about to kfree */
/*
 * A single connection with another host.
 *
 * We maintain a queue of outgoing messages, and some session state to
 * ensure that we can preserve the lossless, ordered delivery of
 * messages in the case of a TCP disconnect.
 *
 * The struct is meant to be embedded in its owner (struct ceph_osd does
 * so with o_con); ->private and ->ops tie it back to that owner.
 */
struct ceph_connection {
void *private;
atomic_t nref;
const struct ceph_connection_operations *ops;
struct ceph_messenger *msgr;
struct socket *sock;
unsigned long state; /* connection state (see flags above) */
const char *error_msg; /* error message, if any */
struct ceph_entity_addr peer_addr; /* peer address */
struct ceph_entity_name peer_name; /* peer name */
struct ceph_entity_addr peer_addr_for_me;
unsigned peer_features;
u32 connect_seq; /* identify the most recent connection
attempt for this connection, client */
u32 peer_global_seq; /* peer's global seq for this connection */
int auth_retry; /* true if we need a newer authorizer */
void *auth_reply_buf; /* where to put the authorizer reply */
int auth_reply_buf_len;
struct mutex mutex;
/* out queue */
struct list_head out_queue;
struct list_head out_sent; /* sending or sent but unacked */
u64 out_seq; /* last message queued for send */
bool out_keepalive_pending;
u64 in_seq, in_seq_acked; /* last message received, acked */
/* connection negotiation temps */
char in_banner[CEPH_BANNER_MAX_LEN];
/* the two anonymous structs share storage: only one direction is negotiated at a time */
union {
struct { /* outgoing connection */
struct ceph_msg_connect out_connect;
struct ceph_msg_connect_reply in_reply;
};
struct { /* incoming */
struct ceph_msg_connect in_connect;
struct ceph_msg_connect_reply out_reply;
};
};
struct ceph_entity_addr actual_peer_addr;
/* message out temps */
struct ceph_msg *out_msg; /* sending message (== tail of
out_sent) */
bool out_msg_done;
struct ceph_msg_pos out_msg_pos;
struct kvec out_kvec[8], /* sending header/footer data */
*out_kvec_cur;
int out_kvec_left; /* kvec's left in out_kvec */
int out_skip; /* skip this many bytes */
int out_kvec_bytes; /* total bytes left */
bool out_kvec_is_msg; /* kvec refers to out_msg */
int out_more; /* there is more data after the kvecs */
__le64 out_temp_ack; /* for writing an ack */
/* message in temps */
struct ceph_msg_header in_hdr;
struct ceph_msg *in_msg;
struct ceph_msg_pos in_msg_pos;
u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
char in_tag; /* protocol control byte */
int in_base_pos; /* bytes read */
__le64 in_temp_ack; /* for reading an ack */
struct delayed_work work; /* send|recv work */
unsigned long delay; /* current delay interval */
};
/* address helpers */
extern const char *ceph_pr_addr(const struct sockaddr_storage *ss);
extern int ceph_parse_ips(const char *c, const char *end,
struct ceph_entity_addr *addr,
int max_count, int *count);
/* messenger-wide setup and teardown */
extern int ceph_msgr_init(void);
extern void ceph_msgr_exit(void);
extern void ceph_msgr_flush(void);
/* @features / @required: supported and required feature bits */
extern struct ceph_messenger *ceph_messenger_create(
struct ceph_entity_addr *myaddr,
u32 features, u32 required);
extern void ceph_messenger_destroy(struct ceph_messenger *);
/* per-connection operations */
extern void ceph_con_init(struct ceph_messenger *msgr,
struct ceph_connection *con);
extern void ceph_con_open(struct ceph_connection *con,
struct ceph_entity_addr *addr);
extern bool ceph_con_opened(struct ceph_connection *con);
extern void ceph_con_close(struct ceph_connection *con);
extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
extern void ceph_con_revoke_message(struct ceph_connection *con,
struct ceph_msg *msg);
extern void ceph_con_keepalive(struct ceph_connection *con);
extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
extern void ceph_con_put(struct ceph_connection *con);
/* message allocation; @front_len is the size of the front section */
extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
extern void ceph_msg_kfree(struct ceph_msg *m);
/* Take an additional reference on @msg and return @msg. */
static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
{
	struct kref *ref = &msg->kref;

	kref_get(ref);
	return msg;
}
/* kref release callback for struct ceph_msg */
extern void ceph_msg_last_put(struct kref *kref);

/*
 * Drop one reference on @msg; ceph_msg_last_put() is the release
 * function handed to kref_put().
 */
static inline void ceph_msg_put(struct ceph_msg *msg)
{
	struct kref *ref = &msg->kref;

	kref_put(ref, ceph_msg_last_put);
}
/* debugging aid; output format and destination are defined in messenger.c */
extern void ceph_msg_dump(struct ceph_msg *msg);
#endif

View file

@ -0,0 +1,122 @@
#ifndef _FS_CEPH_MON_CLIENT_H
#define _FS_CEPH_MON_CLIENT_H
#include <linux/completion.h>
#include <linux/kref.h>
#include <linux/rbtree.h>
#include "messenger.h"
struct ceph_client;
struct ceph_mount_args;
struct ceph_auth_client;
/*
 * The monitor map enumerates the set of all monitors.
 *
 * mon_inst is a flexible array member: the entries (presumably num_mon
 * of them -- confirm against the decoder) are allocated together with
 * the struct itself.
 */
struct ceph_monmap {
	struct ceph_fsid fsid;
	u32 epoch;
	u32 num_mon;
	struct ceph_entity_inst mon_inst[];
};
struct ceph_mon_client;
struct ceph_mon_generic_request;
/*
 * Generic mechanism for resending monitor requests.
 *
 * Callback type: receives the monitor client and a @newmon flag
 * (presumably non-zero when the request goes to a newly chosen
 * monitor -- confirm in mon_client.c).
 */
typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
int newmon);
/* a pending monitor request */
struct ceph_mon_request {
struct ceph_mon_client *monc;
struct delayed_work delayed_work;
unsigned long delay;
ceph_monc_request_func_t do_request;
};
/*
 * ceph_mon_generic_request is being used for the statfs and poolop requests
 * which are being done a bit differently because we need to get data back
 * to the caller
 */
struct ceph_mon_generic_request {
struct kref kref;
u64 tid;
/* presumably links the request into ceph_mon_client.generic_request_tree -- confirm */
struct rb_node node;
int result;
void *buf;
int buf_len;
struct completion completion;
struct ceph_msg *request; /* original request */
struct ceph_msg *reply; /* and reply */
};
/*
 * Monitor client state; embedded in struct ceph_client as ->monc, with
 * ->client pointing back at the owner.
 */
struct ceph_mon_client {
struct ceph_client *client;
struct ceph_monmap *monmap;
struct mutex mutex;
struct delayed_work delayed_work;
struct ceph_auth_client *auth;
/* preallocated messages, judging by the names: auth, auth reply, subscribe, subscribe ack */
struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
int pending_auth;
bool hunting;
int cur_mon; /* last monitor i contacted */
unsigned long sub_sent, sub_renew_after;
struct ceph_connection *con;
bool have_fsid;
/* pending generic requests */
struct rb_root generic_request_tree;
int num_generic_requests;
u64 last_tid;
/* mds/osd map */
int want_mdsmap;
int want_next_osdmap; /* 1 = want, 2 = want+asked */
u32 have_osdmap, have_mdsmap;
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_file;
#endif
};
/* monmap helpers; note that ceph_monmap_decode() takes @p by value */
extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
extern int ceph_monmap_contains(struct ceph_monmap *m,
struct ceph_entity_addr *addr);
/* monitor client setup / teardown */
extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
extern void ceph_monc_stop(struct ceph_mon_client *monc);
/*
* The model here is to indicate that we need a new map of at least
* epoch @want, and also call in when we receive a map. We will
* periodically rerequest the map from the monitor cluster until we
* get what we want.
*/
extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
/* statfs: the result is returned through @buf */
extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
struct ceph_statfs *buf);
extern int ceph_monc_open_session(struct ceph_mon_client *monc);
extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
/* pool snapshot ids: create returns the new id through @snapid */
extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
u32 pool, u64 *snapid);
extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
u32 pool, u64 snapid);
#endif

View file

@ -0,0 +1,25 @@
#ifndef _FS_CEPH_MSGPOOL
#define _FS_CEPH_MSGPOOL
#include <linux/mempool.h>
#include "messenger.h"
/*
 * we use memory pools for preallocating messages we may receive, to
 * avoid unexpected OOM conditions.
 *
 * A pool wraps a mempool_t and remembers the front length its messages
 * were preallocated with.
 */
struct ceph_msgpool {
const char *name;
mempool_t *pool;
int front_len; /* preallocated payload size */
};
/*
 * Pool setup/teardown and message get/put.
 * NOTE(review): @size is presumably the number of preallocated
 * messages; the meaning of @blocking is not visible here -- confirm in
 * msgpool.c.
 */
extern int ceph_msgpool_init(struct ceph_msgpool *pool,
int front_len, int size, bool blocking,
const char *name);
extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
int front_len);
extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
#endif

175
include/linux/ceph/msgr.h Normal file
View file

@ -0,0 +1,175 @@
#ifndef CEPH_MSGR_H
#define CEPH_MSGR_H
/*
* Data types for message passing layer used by Ceph.
*/
#define CEPH_MON_PORT 6789 /* default monitor port */
/*
* client-side processes will try to bind to ports in this
* range, simply for the benefit of tools like nmap or wireshark
* that would like to identify the protocol.
*/
#define CEPH_PORT_FIRST 6789 /* same value as CEPH_MON_PORT */
#define CEPH_PORT_START 6800 /* non-monitors start here */
#define CEPH_PORT_LAST 6900
/*
* tcp connection banner. include a protocol version. and adjust
* whenever the wire protocol changes. try to keep this string length
* constant.
*/
#define CEPH_BANNER "ceph v027"
/* size of the buffer a connection receives the peer's banner into */
#define CEPH_BANNER_MAX_LEN 30
/*
 * Rollover-safe type and comparator for 32-bit sequence numbers.
 * Comparator returns a negative value, zero, or a positive value when
 * a is before, equal to, or after b (it is the signed distance from b
 * to a, not just -1/0/1).
 */
typedef __u32 ceph_seq_t;

static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
{
	/*
	 * Subtract as unsigned, where wraparound is well defined, and only
	 * then reinterpret the difference as signed.  Subtracting two
	 * values that were each cast to __s32 first is a signed overflow
	 * (undefined behaviour) whenever the true difference does not fit
	 * in 32 signed bits.
	 */
	return (__s32)(a - b);
}
/*
 * entity_name -- logical name for a process participating in the
 * network, e.g. 'mds0' or 'osd3'.
 *
 * Packed: a one-byte type immediately followed by a little-endian
 * 64-bit number, 9 bytes in total.
 */
struct ceph_entity_name {
__u8 type; /* CEPH_ENTITY_TYPE_* */
__le64 num;
} __attribute__ ((packed));
/* each type is a distinct bit; CEPH_ENTITY_TYPE_ANY has all eight bits set */
#define CEPH_ENTITY_TYPE_MON 0x01
#define CEPH_ENTITY_TYPE_MDS 0x02
#define CEPH_ENTITY_TYPE_OSD 0x04
#define CEPH_ENTITY_TYPE_CLIENT 0x08
#define CEPH_ENTITY_TYPE_AUTH 0x20
#define CEPH_ENTITY_TYPE_ANY 0xFF
/* printable name for a CEPH_ENTITY_TYPE_* value (see ENTITY_NAME()) */
extern const char *ceph_entity_type_name(int type);
/*
 * entity_addr -- network address
 *
 * Packed, with little-endian integer fields.  The address family inside
 * in_addr is byte-swapped separately by ceph_encode_addr() and
 * ceph_decode_addr().
 */
struct ceph_entity_addr {
__le32 type;
__le32 nonce; /* unique id for process (e.g. pid) */
struct sockaddr_storage in_addr;
} __attribute__ ((packed));
/* an entity instance: a name together with the address it is reachable at */
struct ceph_entity_inst {
struct ceph_entity_name name;
struct ceph_entity_addr addr;
} __attribute__ ((packed));
/*
 * used by message exchange protocol
 *
 * Tag values 1..12; struct ceph_msg_connect_reply carries one in its
 * tag byte.
 */
#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
incoming connection */
#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
with higher cseq */
#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
with higher gseq */
#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
#define CEPH_MSGR_TAG_MSG 7 /* message */
#define CEPH_MSGR_TAG_ACK 8 /* message ack */
#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
/*
 * connection negotiation
 *
 * Both structs are packed with little-endian fields, i.e. they are laid
 * out exactly as listed; do not reorder or resize members.
 */
struct ceph_msg_connect {
__le64 features; /* supported feature bits */
__le32 host_type; /* CEPH_ENTITY_TYPE_* */
__le32 global_seq; /* count connections initiated by this host */
__le32 connect_seq; /* count connections initiated in this session */
__le32 protocol_version;
__le32 authorizer_protocol;
__le32 authorizer_len;
__u8 flags; /* CEPH_MSG_CONNECT_* */
} __attribute__ ((packed));
struct ceph_msg_connect_reply {
__u8 tag; /* presumably one of CEPH_MSGR_TAG_* -- confirm */
__le64 features; /* feature bits for this session */
__le32 global_seq;
__le32 connect_seq;
__le32 protocol_version;
__le32 authorizer_len;
__u8 flags;
} __attribute__ ((packed));
#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
/*
 * message header
 *
 * Two packed layouts that differ only in the source fields: the _old
 * variant carries two full ceph_entity_inst (src, orig_src), the
 * current one a single ceph_entity_name (src).
 */
struct ceph_msg_header_old {
__le64 seq; /* message seq# for this session */
__le64 tid; /* transaction id */
__le16 type; /* message type */
__le16 priority; /* priority. higher value == higher priority */
__le16 version; /* version of message encoding */
__le32 front_len; /* bytes in main payload */
__le32 middle_len;/* bytes in middle payload */
__le32 data_len; /* bytes of data payload */
__le16 data_off; /* sender: include full offset;
receiver: mask against ~PAGE_MASK */
struct ceph_entity_inst src, orig_src;
__le32 reserved;
__le32 crc; /* header crc32c */
} __attribute__ ((packed));
struct ceph_msg_header {
__le64 seq; /* message seq# for this session */
__le64 tid; /* transaction id */
__le16 type; /* message type */
__le16 priority; /* priority. higher value == higher priority */
__le16 version; /* version of message encoding */
__le32 front_len; /* bytes in main payload */
__le32 middle_len;/* bytes in middle payload */
__le32 data_len; /* bytes of data payload */
__le16 data_off; /* sender: include full offset;
receiver: mask against ~PAGE_MASK */
struct ceph_entity_name src;
__le32 reserved;
__le32 crc; /* header crc32c */
} __attribute__ ((packed));
/* values for the header's priority field; higher value == higher priority */
#define CEPH_MSG_PRIO_LOW 64
#define CEPH_MSG_PRIO_DEFAULT 127
#define CEPH_MSG_PRIO_HIGH 196
#define CEPH_MSG_PRIO_HIGHEST 255
/*
 * follows data payload
 *
 * Packed: three little-endian crc values (front, middle, data sections)
 * and one byte of CEPH_MSG_FOOTER_* flags.
 */
struct ceph_msg_footer {
__le32 front_crc, middle_crc, data_crc;
__u8 flags;
} __attribute__ ((packed));
#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
#endif

View file

@ -0,0 +1,234 @@
#ifndef _FS_CEPH_OSD_CLIENT_H
#define _FS_CEPH_OSD_CLIENT_H
#include <linux/completion.h>
#include <linux/kref.h>
#include <linux/mempool.h>
#include <linux/rbtree.h>
#include "types.h"
#include "osdmap.h"
#include "messenger.h"
struct ceph_msg;
struct ceph_snap_context;
struct ceph_osd_request;
struct ceph_osd_client;
struct ceph_authorizer;
struct ceph_pagelist;
/*
 * completion callback for async writepages
 *
 * Receives the request and a message (presumably the reply -- confirm
 * in osd_client.c).  Used for both r_callback and r_safe_callback.
 */
typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
struct ceph_msg *);
/*
 * a given osd we're communicating with
 *
 * Embeds the connection to that osd (o_con) and the list heads/nodes
 * that tie it into the ceph_osd_client.
 */
struct ceph_osd {
atomic_t o_ref;
struct ceph_osd_client *o_osdc;
int o_osd;
int o_incarnation;
/* presumably the node in ceph_osd_client.osds -- confirm */
struct rb_node o_node;
struct ceph_connection o_con;
struct list_head o_requests;
/* presumably the entry on ceph_osd_client.osd_lru -- confirm */
struct list_head o_osd_lru;
struct ceph_authorizer *o_authorizer;
void *o_authorizer_buf, *o_authorizer_reply_buf;
size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
unsigned long lru_ttl;
int o_marked_for_keepalive;
struct list_head o_keepalive_item;
};
/*
 * an in-flight request
 *
 * Reference counted through r_kref; see ceph_osdc_get_request() and
 * ceph_osdc_put_request().
 */
struct ceph_osd_request {
u64 r_tid; /* unique for this client */
struct rb_node r_node;
struct list_head r_req_lru_item;
struct list_head r_osd_item;
struct ceph_osd *r_osd;
struct ceph_pg r_pgid;
int r_pg_osds[CEPH_PG_MAX_SIZE];
int r_num_pg_osds;
struct ceph_connection *r_con_filling_msg;
struct ceph_msg *r_request, *r_reply;
int r_result;
int r_flags; /* any additional flags for the osd */
u32 r_sent; /* >0 if r_request is sending/sent */
int r_got_reply;
struct ceph_osd_client *r_osdc;
struct kref r_kref;
bool r_mempool;
struct completion r_completion, r_safe_completion;
ceph_osdc_callback_t r_callback, r_safe_callback;
struct ceph_eversion r_reassert_version;
struct list_head r_unsafe_item;
struct inode *r_inode; /* for use by callbacks */
void *r_priv; /* ditto */
char r_oid[40]; /* object name */
int r_oid_len;
unsigned long r_stamp; /* send OR check time */
bool r_resend; /* msg send failed, needs retry */
struct ceph_file_layout r_file_layout;
struct ceph_snap_context *r_snapc; /* snap context for writes */
unsigned r_num_pages; /* size of page array (follows) */
struct page **r_pages; /* pages for data payload */
int r_pages_from_pool;
int r_own_pages; /* if true, i own page list */
#ifdef CONFIG_BLOCK
struct bio *r_bio; /* instead of pages */
#endif
struct ceph_pagelist *r_trail; /* trailing part of the data */
};
/*
 * OSD client state; embedded in struct ceph_client as ->osdc, with
 * ->client pointing back at the owner.
 */
struct ceph_osd_client {
struct ceph_client *client;
struct ceph_osdmap *osdmap; /* current map */
struct rw_semaphore map_sem;
struct completion map_waiters;
u64 last_requested_map;
struct mutex request_mutex;
struct rb_root osds; /* osds */
struct list_head osd_lru; /* idle osds */
u64 timeout_tid; /* tid of timeout triggering rq */
u64 last_tid; /* tid of last request */
struct rb_root requests; /* pending requests */
struct list_head req_lru; /* pending requests lru */
int num_requests;
struct delayed_work timeout_work;
struct delayed_work osds_timeout_work;
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_file;
#endif
mempool_t *req_mempool;
/* message pools, by their names for op requests and op replies */
struct ceph_msgpool msgpool_op;
struct ceph_msgpool msgpool_op_reply;
};
/*
 * One operation of an OSD request.  The anonymous union holds the
 * arguments for the different kinds of op (extent, xattr, class method
 * call, pg listing, snapshot); which member is valid presumably follows
 * from ->op -- confirm in osd_client.c.
 */
struct ceph_osd_req_op {
u16 op; /* CEPH_OSD_OP_* */
u32 flags; /* CEPH_OSD_FLAG_* */
union {
struct {
u64 offset, length;
u64 truncate_size;
u32 truncate_seq;
} extent;
struct {
const char *name;
u32 name_len;
const char *val;
u32 value_len;
__u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
__u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
} xattr;
struct {
const char *class_name;
__u8 class_len;
const char *method_name;
__u8 method_len;
__u8 argc;
const char *indata;
u32 indata_len;
} cls;
struct {
u64 cookie, count;
} pgls;
struct {
u64 snapid;
} snap;
};
u32 payload_len;
};
extern int ceph_osdc_init(struct ceph_osd_client *osdc,
struct ceph_client *client);
extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
struct ceph_msg *msg);
extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
struct ceph_msg *msg);
/*
 * Lower-level request construction helpers.
 *
 * NOTE(review): the comments below are inferred from the signatures only;
 * confirm against the implementations.
 */
/* @plen and @bno are in/out pointers; @op is a single host-endian op */
extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
struct ceph_file_layout *layout,
u64 snapid,
u64 off, u64 *plen, u64 *bno,
struct ceph_osd_request *req,
struct ceph_osd_req_op *op);
/* returns a request pointer; data may be supplied as @pages or @bio */
extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
int flags,
struct ceph_snap_context *snapc,
struct ceph_osd_req_op *ops,
bool use_mempool,
gfp_t gfp_flags,
struct page **pages,
struct bio *bio);
/* @oid/@oid_len give the object name as pointer + length */
extern void ceph_osdc_build_request(struct ceph_osd_request *req,
u64 off, u64 *plen,
struct ceph_osd_req_op *src_ops,
struct ceph_snap_context *snapc,
struct timespec *mtime,
const char *oid,
int oid_len);
/*
 * Create a request for a single op on the file extent described by
 * @layout, @vino, @offset and *@len.
 *
 * @len is passed by pointer.
 * NOTE(review): presumably it is shortened to the part of the extent that
 * fits in one object -- confirm against the implementation.
 *
 * The first parameter is named @osdc to match the other prototypes in
 * this header.
 */
extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
				      struct ceph_file_layout *layout,
				      struct ceph_vino vino,
				      u64 offset, u64 *len, int op, int flags,
				      struct ceph_snap_context *snapc,
				      int do_sync, u32 truncate_seq,
				      u64 truncate_size,
				      struct timespec *mtime,
				      bool use_mempool, int num_reply);
/* Take one more reference on @req by bumping its embedded kref. */
static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
{
	struct kref *ref = &req->r_kref;

	kref_get(ref);
}
/* kref release callback passed to kref_put() by ceph_osdc_put_request() */
extern void ceph_osdc_release_request(struct kref *kref);

/*
 * Drop one reference on @req; ceph_osdc_release_request() is the
 * release function handed to kref_put().
 */
static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
{
	struct kref *ref = &req->r_kref;

	kref_put(ref, ceph_osdc_release_request);
}
/*
 * Request submission/completion and page-based read/write entry points.
 * Declarations only; see the osd client implementation for semantics.
 */
extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
struct ceph_osd_request *req,
bool nofail);
extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
struct ceph_osd_request *req);
extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
/* @plen is an in/out pointer; @pages holds @nr_pages page pointers */
extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
struct ceph_vino vino,
struct ceph_file_layout *layout,
u64 off, u64 *plen,
u32 truncate_seq, u64 truncate_size,
struct page **pages, int nr_pages);
/* unlike readpages, @len is passed by value and a snap context @sc is given */
extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
struct ceph_vino vino,
struct ceph_file_layout *layout,
struct ceph_snap_context *sc,
u64 off, u64 len,
u32 truncate_seq, u64 truncate_size,
struct timespec *mtime,
struct page **pages, int nr_pages,
int flags, int do_sync, bool nofail);
#endif

130
include/linux/ceph/osdmap.h Normal file
View file

@ -0,0 +1,130 @@
#ifndef _FS_CEPH_OSDMAP_H
#define _FS_CEPH_OSDMAP_H
#include <linux/rbtree.h>
#include "types.h"
#include "ceph_fs.h"
#include <linux/crush/crush.h>
/*
* The osd map describes the current membership of the osd cluster and
* specifies the mapping of objects to placement groups and placement
* groups to (sets of) osds. That is, it completely specifies the
* (desired) distribution of all data objects in the system at some
* point in time.
*
* Each map version is identified by an epoch, which increases monotonically.
*
* The map can be updated either via an incremental map (diff) describing
* the change between two successive epochs, or as a fully encoded map.
*/
/*
 * Per-pool record: the wire-format pool description (v) plus an id,
 * a name, and precomputed masks.  The rb_node suggests these live in
 * an rbtree (presumably ceph_osdmap.pg_pools -- confirm).
 */
struct ceph_pg_pool_info {
struct rb_node node;
int id;
struct ceph_pg_pool v;
/* NOTE(review): presumably the bmask inputs for ceph_stable_mod() -- confirm */
int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
char *name;
};
/*
 * Explicit osd list for one placement group.  osds[] is a flexible
 * array member; len is presumably its element count (confirm).
 * NOTE(review): the rb_node suggests membership in ceph_osdmap.pg_temp.
 */
struct ceph_pg_mapping {
struct rb_node node;
struct ceph_pg pgid;
int len;
int osds[];
};
/*
 * Decoded osd map for one epoch: per-osd state/weight/address arrays
 * indexed by osd id, rbtrees for pg_temp and pg_pools, and the CRUSH map.
 */
struct ceph_osdmap {
struct ceph_fsid fsid;
u32 epoch;
u32 mkfs_epoch;
struct ceph_timespec created, modified;
u32 flags; /* CEPH_OSDMAP_* */
u32 max_osd; /* size of osd_state, _weight, _addr arrays */
u8 *osd_state; /* CEPH_OSD_* */
u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
struct ceph_entity_addr *osd_addr;
struct rb_root pg_temp;
struct rb_root pg_pools;
u32 pool_max;
/* the CRUSH map specifies the mapping of placement groups to
* the list of osds that store+replicate them. */
struct crush_map *crush;
};
/*
* file layout helpers
*
* Each accessor converts one little-endian field of a ceph_file_layout
* to CPU byte order and casts it to __s32.  The argument is a struct
* value (fields are accessed with '.'), not a pointer.
*/
#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
#define ceph_file_layout_stripe_count(l) \
((__s32)le32_to_cpu((l).fl_stripe_count))
#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
#define ceph_file_layout_object_su(l) \
((__s32)le32_to_cpu((l).fl_object_stripe_unit))
#define ceph_file_layout_pg_preferred(l) \
((__s32)le32_to_cpu((l).fl_pg_preferred))
#define ceph_file_layout_pg_pool(l) \
((__s32)le32_to_cpu((l).fl_pg_pool))
/* Stripe width in bytes: stripe unit multiplied by stripe count. */
static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
{
	unsigned unit = le32_to_cpu(l->fl_stripe_unit);
	unsigned count = le32_to_cpu(l->fl_stripe_count);

	return unit * count;
}
/*
 * "period": number of bytes written before the layout moves on to a new
 * set of objects, i.e. object size multiplied by stripe count.
 */
static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
{
	unsigned object_size = le32_to_cpu(l->fl_object_size);
	unsigned count = le32_to_cpu(l->fl_stripe_count);

	return object_size * count;
}
/*
 * Report whether @osd is marked up in @map.
 *
 * Returns 1 when @osd is below map->max_osd and its state byte has
 * CEPH_OSD_UP set, 0 otherwise.  max_osd is a u32, so the range check
 * is an unsigned comparison and a negative @osd is rejected as well.
 */
static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
{
	if (osd >= map->max_osd)
		return 0;
	return (map->osd_state[osd] & CEPH_OSD_UP) != 0;
}
static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
{
return map && (map->flags & flag);
}
extern char *ceph_osdmap_state_str(char *str, int len, int state);

/*
 * Return a pointer to the address entry for @osd, or NULL when @osd is
 * not below map->max_osd.  The comparison is unsigned (max_osd is u32),
 * so a negative @osd also yields NULL.
 */
static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
						     int osd)
{
	return osd < map->max_osd ? &map->osd_addr[osd] : NULL;
}
/*
 * Map decoding.  Both decoders take a cursor (*p) and an end pointer
 * bounding the encoded buffer.
 * NOTE(review): error-return convention (NULL vs ERR_PTR) is not visible
 * here -- confirm before checking results.
 */
extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
struct ceph_osdmap *map,
struct ceph_messenger *msgr);
extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
/* calculate mapping of a file extent to an object */
extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
u64 off, u64 *plen,
u64 *bno, u64 *oxoff, u64 *oxlen);
/* calculate mapping of object to a placement group */
extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
const char *oid,
struct ceph_file_layout *fl,
struct ceph_osdmap *osdmap);
/* @acting is an output array; NOTE(review): presumably sized CEPH_PG_MAX_SIZE */
extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
int *acting);
extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
struct ceph_pg pgid);
/* look up a pool id from its name */
extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
#endif

View file

@ -0,0 +1,54 @@
#ifndef __FS_CEPH_PAGELIST_H
#define __FS_CEPH_PAGELIST_H
#include <linux/list.h>
/*
 * Append-only buffer built from a list of pages.  ceph_pagelist_init()
 * sets up an empty list with no mapped tail, zero length and zero room.
 */
struct ceph_pagelist {
struct list_head head;
void *mapped_tail; /* NOTE(review): presumably mapping of the last page -- confirm */
size_t length; /* NOTE(review): presumably total bytes appended -- confirm */
size_t room; /* NOTE(review): presumably bytes left in the tail page -- confirm */
};
/* Put @pl into its empty state: no pages, no mapped tail, zero counters. */
static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
{
	pl->length = 0;
	pl->room = 0;
	pl->mapped_tail = NULL;
	INIT_LIST_HEAD(&pl->head);
}
extern int ceph_pagelist_release(struct ceph_pagelist *pl);
/* append @l bytes from @d; the encode helpers below return its result */
extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l);
/* Append @v as a little-endian 64-bit value; returns the append result. */
static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
{
	__le64 wire = cpu_to_le64(v);

	return ceph_pagelist_append(pl, &wire, sizeof(wire));
}
/* Append @v as a little-endian 32-bit value; returns the append result. */
static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
{
	__le32 wire = cpu_to_le32(v);

	return ceph_pagelist_append(pl, &wire, sizeof(wire));
}
/* Append @v as a little-endian 16-bit value; returns the append result. */
static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
{
	__le16 wire = cpu_to_le16(v);

	return ceph_pagelist_append(pl, &wire, sizeof(wire));
}
/* Append the single byte @v; returns the append result. */
static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
{
	return ceph_pagelist_append(pl, &v, sizeof(v));
}
/*
 * Append a length-prefixed string: a little-endian 32-bit length followed
 * by @len bytes of @s (no terminator is added).
 *
 * @s is only read, so it is const-qualified; callers passing a plain
 * char * are unaffected.  The length prefix goes through
 * ceph_pagelist_encode_32(), so @len is converted to u32 for the prefix.
 *
 * Returns 0 on success, or the first non-zero result from the underlying
 * append calls.  When @len is 0 only the prefix is written.
 */
static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
					      const char *s, size_t len)
{
	int ret = ceph_pagelist_encode_32(pl, len);

	if (ret)
		return ret;
	if (len)
		return ceph_pagelist_append(pl, s, len);
	return 0;
}
#endif

405
include/linux/ceph/rados.h Normal file
View file

@ -0,0 +1,405 @@
#ifndef CEPH_RADOS_H
#define CEPH_RADOS_H
/*
* Data types for the Ceph distributed object storage layer RADOS
* (Reliable Autonomic Distributed Object Store).
*/
#include "msgr.h"
/*
* osdmap encoding versions
*
* NOTE(review): the _EXT variants currently equal the base values; what
* distinguishes them is not visible in this header.
*/
#define CEPH_OSDMAP_INC_VERSION 5
#define CEPH_OSDMAP_INC_VERSION_EXT 5
#define CEPH_OSDMAP_VERSION 5
#define CEPH_OSDMAP_VERSION_EXT 5
/*
* fs id
*/
struct ceph_fsid {
	unsigned char fsid[16];
};

/*
 * Byte-wise comparison of two fsids with memcmp() semantics: 0 when the
 * 16 id bytes are equal, otherwise negative/positive according to the
 * first differing byte.
 */
static inline int ceph_fsid_compare(const struct ceph_fsid *a,
				    const struct ceph_fsid *b)
{
	/* the struct consists solely of the fsid array */
	return memcmp(a->fsid, b->fsid, sizeof(a->fsid));
}
/*
* ino, object, etc.
*
* The special snapid values below sit at the top of the 64-bit range.
*/
typedef __le64 ceph_snapid_t;
#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
/* Timestamp as two packed little-endian 32-bit fields (seconds, nanoseconds). */
struct ceph_timespec {
__le32 tv_sec;
__le32 tv_nsec;
} __attribute__ ((packed));
/*
* object layout - how objects are mapped into PGs
* (object layout ids start at 1)
*/
#define CEPH_OBJECT_LAYOUT_HASH 1
#define CEPH_OBJECT_LAYOUT_LINEAR 2
#define CEPH_OBJECT_LAYOUT_HASHINO 3
/*
* pg layout -- how PGs are mapped onto (sets of) OSDs
* (pg layout ids start at 0)
*/
#define CEPH_PG_LAYOUT_CRUSH 0
#define CEPH_PG_LAYOUT_HASH 1
#define CEPH_PG_LAYOUT_LINEAR 2
#define CEPH_PG_LAYOUT_HYBRID 3
#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
/*
* placement group.
* we encode this into one __le64: the packed fields below total
* 2 + 2 + 4 = 8 bytes.
*/
struct ceph_pg {
__le16 preferred; /* preferred primary osd */
__le16 ps; /* placement seed */
__le32 pool; /* object pool */
} __attribute__ ((packed));
/*
* pg_pool is a set of pgs storing a pool of objects
*
* pg_num -- base number of pseudorandomly placed pgs
*
* pgp_num -- effective number when calculating pg placement. this
* is used for pg_num increases. new pgs result in data being "split"
* into new pgs. for this to proceed smoothly, new pgs are initially
* colocated with their parents; that is, pgp_num doesn't increase
* until the new pgs have successfully split. only _then_ are the new
* pgs placed independently.
*
* lpg_num -- localized pg count (per device). replicas are randomly
* selected.
*
* lpgp_num -- as above.
*/
#define CEPH_PG_TYPE_REP 1
#define CEPH_PG_TYPE_RAID4 2
#define CEPH_PG_POOL_VERSION 2
/*
 * Packed, little-endian pool description.
 * NOTE(review): num_snaps and num_removed_snap_intervals look like counts
 * of variable-length data that follows the struct in the encoding -- confirm.
 */
struct ceph_pg_pool {
__u8 type; /* CEPH_PG_TYPE_* */
__u8 size; /* number of osds in each pg */
__u8 crush_ruleset; /* crush placement rule */
__u8 object_hash; /* hash mapping object name to ps */
__le32 pg_num, pgp_num; /* number of pg's */
__le32 lpg_num, lpgp_num; /* number of localized pg's */
__le32 last_change; /* most recent epoch changed */
__le64 snap_seq; /* seq for per-pool snapshot */
__le32 snap_epoch; /* epoch of last snap */
__le32 num_snaps;
__le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
__le64 auid; /* who owns the pg */
} __attribute__ ((packed));
/*
 * Stable modulo, used to control the number of placement groups.
 *
 * Behaves like x % b, but keeps the mapping stable as the bin count b
 * grows over time.  bmask is the containing power of two minus one:
 *
 *   b <= bmask and bmask = (2**n) - 1
 *   e.g., b=12 -> bmask=15, b=123 -> bmask=127
 *
 * x is first masked with bmask; if that lands outside [0, b) it is
 * masked with the next smaller mask (bmask >> 1) instead.
 */
static inline int ceph_stable_mod(int x, int b, int bmask)
{
	int bin = x & bmask;

	if (bin >= b)
		bin = x & (bmask >> 1);
	return bin;
}
/*
* object layout - how a given object should be stored.
* Packed wire struct: an 8-byte ceph_pg followed by a 32-bit stripe unit.
*/
struct ceph_object_layout {
struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
__le32 ol_stripe_unit; /* for per-object parity, if any */
} __attribute__ ((packed));
/*
* compound epoch+version, used by storage layer to serialize mutations
* (packed: 32-bit epoch immediately followed by 64-bit version)
*/
struct ceph_eversion {
__le32 epoch;
__le64 version;
} __attribute__ ((packed));
/*
* osd map bits
*/
/* status bits */
#define CEPH_OSD_EXISTS 1
#define CEPH_OSD_UP 2
/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
#define CEPH_OSD_IN 0x10000
#define CEPH_OSD_OUT 0
/*
* osd map flag bits
*/
#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
/*
* osd ops
*
* A CEPH_OSD_OP_* code is composed as MODE | TYPE | index: the MODE mask
* covers the top four bits and the TYPE mask the next four.  Note that
* MODE_RMW (0x3000) is numerically MODE_RD | MODE_WR.
*/
#define CEPH_OSD_OP_MODE 0xf000
#define CEPH_OSD_OP_MODE_RD 0x1000
#define CEPH_OSD_OP_MODE_WR 0x2000
#define CEPH_OSD_OP_MODE_RMW 0x3000
#define CEPH_OSD_OP_MODE_SUB 0x4000
#define CEPH_OSD_OP_TYPE 0x0f00
#define CEPH_OSD_OP_TYPE_LOCK 0x0100
#define CEPH_OSD_OP_TYPE_DATA 0x0200
#define CEPH_OSD_OP_TYPE_ATTR 0x0300
#define CEPH_OSD_OP_TYPE_EXEC 0x0400
#define CEPH_OSD_OP_TYPE_PG 0x0500
/*
 * osd op codes.  Each value is CEPH_OSD_OP_MODE_* | CEPH_OSD_OP_TYPE_* |
 * a small per-group index (subops carry no TYPE bits).
 */
enum {
/** data **/
/* read */
CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
/* fancy read */
CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
/* write */
CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
/* fancy write */
CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
/* the only op in this enum using MODE_RMW */
CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
/** attrs **/
/* read */
CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
/* write */
CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
/** subop **/
CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
/** lock **/
/* NOTE(review): all lock ops, including RDLOCK/RDUNLOCK, are MODE_WR */
CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
/** exec **/
CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
/** pg **/
CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
};
/* Non-zero when the TYPE bits of @op equal CEPH_OSD_OP_TYPE_LOCK. */
static inline int ceph_osd_op_type_lock(int op)
{
	const int type = op & CEPH_OSD_OP_TYPE;

	return type == CEPH_OSD_OP_TYPE_LOCK;
}
/* Non-zero when the TYPE bits of @op equal CEPH_OSD_OP_TYPE_DATA. */
static inline int ceph_osd_op_type_data(int op)
{
	const int type = op & CEPH_OSD_OP_TYPE;

	return type == CEPH_OSD_OP_TYPE_DATA;
}
/* Non-zero when the TYPE bits of @op equal CEPH_OSD_OP_TYPE_ATTR. */
static inline int ceph_osd_op_type_attr(int op)
{
	const int type = op & CEPH_OSD_OP_TYPE;

	return type == CEPH_OSD_OP_TYPE_ATTR;
}
/* Non-zero when the TYPE bits of @op equal CEPH_OSD_OP_TYPE_EXEC. */
static inline int ceph_osd_op_type_exec(int op)
{
	const int type = op & CEPH_OSD_OP_TYPE;

	return type == CEPH_OSD_OP_TYPE_EXEC;
}
/* Non-zero when the TYPE bits of @op equal CEPH_OSD_OP_TYPE_PG. */
static inline int ceph_osd_op_type_pg(int op)
{
	const int type = op & CEPH_OSD_OP_TYPE;

	return type == CEPH_OSD_OP_TYPE_PG;
}
/* Non-zero when the MODE bits of @op equal CEPH_OSD_OP_MODE_SUB. */
static inline int ceph_osd_op_mode_subop(int op)
{
	const int mode = op & CEPH_OSD_OP_MODE;

	return mode == CEPH_OSD_OP_MODE_SUB;
}
/*
 * Non-zero when the MODE bits of @op are exactly CEPH_OSD_OP_MODE_RD.
 *
 * NOTE(review): this is an equality test, so read-modify-write ops
 * (MODE_RMW == MODE_RD | MODE_WR, e.g. CEPH_OSD_OP_TMAPUP) do not count
 * as reads here -- confirm that callers expect this.
 */
static inline int ceph_osd_op_mode_read(int op)
{
	const int mode = op & CEPH_OSD_OP_MODE;

	return mode == CEPH_OSD_OP_MODE_RD;
}
/*
 * Non-zero when the MODE bits of @op are exactly CEPH_OSD_OP_MODE_WR.
 *
 * NOTE(review): as with the read predicate, MODE_RMW ops are not matched
 * by this equality test -- confirm that callers expect this.
 */
static inline int ceph_osd_op_mode_modify(int op)
{
	const int mode = op & CEPH_OSD_OP_MODE;

	return mode == CEPH_OSD_OP_MODE_WR;
}
/*
* note that the following tmap stuff is also defined in the ceph librados.h
* any modification here needs to be updated there
*
* The commands are single ASCII characters.
*/
#define CEPH_OSD_TMAP_HDR 'h'
#define CEPH_OSD_TMAP_SET 's'
#define CEPH_OSD_TMAP_RM 'r'
/* maps a CEPH_OSD_OP_* code to a string; defined outside this header */
extern const char *ceph_osd_op_name(int op);
/*
* osd op flags
*
* An op may be READ, WRITE, or READ|WRITE.
*
* Every value is a distinct single bit, so flags can be OR-ed together.
*/
enum {
CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
CEPH_OSD_FLAG_READ = 16, /* op may read */
CEPH_OSD_FLAG_WRITE = 32, /* op may write */
CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
CEPH_OSD_FLAG_BALANCE_READS = 256,
CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
};
/* per-op flags (distinct from the per-request CEPH_OSD_FLAG_* bits) */
enum {
CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
};
/* ceph-specific meanings layered on existing errno values */
#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
#define EBLACKLISTED ESHUTDOWN /* blacklisted */
/* xattr comparison */
/* comparison operator, stored in the xattr op's cmp_op byte */
enum {
CEPH_OSD_CMPXATTR_OP_NOP = 0,
CEPH_OSD_CMPXATTR_OP_EQ = 1,
CEPH_OSD_CMPXATTR_OP_NE = 2,
CEPH_OSD_CMPXATTR_OP_GT = 3,
CEPH_OSD_CMPXATTR_OP_GTE = 4,
CEPH_OSD_CMPXATTR_OP_LT = 5,
CEPH_OSD_CMPXATTR_OP_LTE = 6
};
/* operand interpretation, stored in the xattr op's cmp_mode byte */
enum {
CEPH_OSD_CMPXATTR_MODE_STRING = 1,
CEPH_OSD_CMPXATTR_MODE_U64 = 2
};
/*
* an individual object operation. each may be accompanied by some data
* payload
*
* Wire format: the struct and every union arm are packed and all
* multi-byte fields are little-endian.  The arms overlay the same bytes.
* NOTE(review): which arm is valid presumably follows from op -- confirm.
*/
struct ceph_osd_op {
__le16 op; /* CEPH_OSD_OP_* */
__le32 flags; /* CEPH_OSD_FLAG_* */
union {
struct {
__le64 offset, length;
__le64 truncate_size;
__le32 truncate_seq;
} __attribute__ ((packed)) extent;
struct {
__le32 name_len;
__le32 value_len;
__u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
__u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
} __attribute__ ((packed)) xattr;
struct {
__u8 class_len;
__u8 method_len;
__u8 argc;
__le32 indata_len;
} __attribute__ ((packed)) cls;
struct {
__le64 cookie, count;
} __attribute__ ((packed)) pgls;
struct {
__le64 snapid;
} __attribute__ ((packed)) snap;
};
__le32 payload_len; /* NOTE(review): presumably length of this op's data payload */
} __attribute__ ((packed));
/*
* osd request message header. each request may include multiple
* ceph_osd_op object operations.
*
* Packed, little-endian.  ops[] is a flexible array member holding
* num_ops entries; per the trailing comment, the object name, ticket and
* snaps follow the ops in the message.
*/
struct ceph_osd_request_head {
__le32 client_inc; /* client incarnation */
struct ceph_object_layout layout; /* pgid */
__le32 osdmap_epoch; /* client's osdmap epoch */
__le32 flags;
struct ceph_timespec mtime; /* for mutations only */
struct ceph_eversion reassert_version; /* if we are replaying op */
__le32 object_len; /* length of object name */
__le64 snapid; /* snapid to read */
__le64 snap_seq; /* writer's snap context */
__le32 num_snaps;
__le16 num_ops;
struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
} __attribute__ ((packed));
/*
 * osd reply message header.  Packed, little-endian.
 *
 * ops is a C99 flexible array member, matching the declaration in
 * struct ceph_osd_request_head (the previous GNU zero-length form
 * "ops[0]" has the same layout and sizeof).  Per the trailing comment,
 * the ops are followed by the object name.
 *
 * Note that num_ops is 32 bits wide here, while the request head uses
 * a 16-bit count.
 */
struct ceph_osd_reply_head {
	__le32 client_inc;                /* client incarnation */
	__le32 flags;
	struct ceph_object_layout layout;
	__le32 osdmap_epoch;
	struct ceph_eversion reassert_version; /* for replaying uncommitted */
	__le32 result;                    /* result code */
	__le32 object_len;                /* length of object name */
	__le32 num_ops;
	struct ceph_osd_op ops[];         /* ops[], object */
} __attribute__ ((packed));
#endif

View file

@ -0,0 +1,29 @@
#ifndef _FS_CEPH_TYPES_H
#define _FS_CEPH_TYPES_H
/* needed before including ceph_fs.h */
#include <linux/in.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/string.h>
#include "ceph_fs.h"
#include "ceph_frag.h"
#include "ceph_hash.h"
/*
* Identify inodes by both their ino AND snapshot id (a u64).
* Both members are host-endian u64 values.
*/
struct ceph_vino {
u64 ino;
u64 snap;
};
/* context for the caps reservation mechanism */
struct ceph_cap_reservation {
int count; /* NOTE(review): presumably number of caps held in reserve -- confirm */
};
#endif