From: Philipp Reisner Date: Fri, 9 Nov 2012 13:18:43 +0000 (+0100) Subject: Merge branch 'drbd-8.4_ed6' into for-3.8-drivers-drbd-8.4_ed6 X-Git-Tag: v3.8-rc1~78^2~18 X-Git-Url: http://pileus.org/git/?p=~andy%2Flinux;a=commitdiff_plain;h=986836503e49ccf7e84b813715d344964ec93566 Merge branch 'drbd-8.4_ed6' into for-3.8-drivers-drbd-8.4_ed6 --- 986836503e49ccf7e84b813715d344964ec93566 diff --cc drivers/block/drbd/drbd_bitmap.c index 8d806975804,e30ff720894..1ab205a4bf6 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@@ -376,12 -391,12 +391,12 @@@ static struct page **bm_realloc_pages(s * GFP_NOIO, as this is called while drbd IO is "suspended", * and during resize or attach on diskless Primary, * we must not block on IO to ourselves. - * Context is receiver thread or cqueue thread/dmsetup. */ + * Context is receiver thread or dmsetup. */ bytes = sizeof(struct page *)*want; - new_pages = kmalloc(bytes, GFP_NOIO); + new_pages = kzalloc(bytes, GFP_NOIO); if (!new_pages) { new_pages = __vmalloc(bytes, - GFP_NOIO | __GFP_HIGHMEM, + GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); if (!new_pages) return NULL; @@@ -1425,13 -1478,21 +1477,21 @@@ static inline void bm_set_full_words_wi { int i; int bits; + int changed = 0; - unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1); + unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); for (i = first_word; i < last_word; i++) { bits = hweight_long(paddr[i]); paddr[i] = ~0UL; - b->bm_set += BITS_PER_LONG - bits; + changed += BITS_PER_LONG - bits; } - kunmap_atomic(paddr, KM_IRQ1); + kunmap_atomic(paddr); + if (changed) { + /* We only need lazy writeout, the information is still in the + * remote bitmap as well, and is reconstructed during the next + * bitmap exchange, if lost locally due to a crash. */ + bm_set_page_lazy_writeout(b->bm_pages[page_nr]); + b->bm_set += changed; + } } /* Same thing as drbd_bm_set_bits, diff --cc drivers/block/drbd/drbd_int.h index 277c69c9465,784f4eb2ed6..ef72a72814c --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@@ -59,9 -64,8 +63,8 @@@ /* module parameter, defined in drbd_main.c */ extern unsigned int minor_count; -extern int disable_sendpage; -extern int allow_oos; +extern bool disable_sendpage; +extern bool allow_oos; - extern unsigned int cn_idx; #ifdef CONFIG_DRBD_FAULT_INJECTION extern int enable_faults; @@@ -1138,46 -1028,16 +1027,16 @@@ struct drbd_conf int rs_last_events; /* counter of read or write "events" (unit sectors) * on the lower level device when we last looked. 
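For context, the bm_realloc_pages() hunk above switches to zero-initialised memory while keeping the "try kmalloc, fall back to vmalloc" strategy under GFP_NOIO. A minimal sketch of that allocation pattern, assuming the 3-argument __vmalloc() of this kernel generation; the helper names are made up for illustration and are not part of the patch:

    #include <linux/slab.h>
    #include <linux/vmalloc.h>
    #include <linux/mm.h>

    /* Allocate a zeroed pointer array without triggering filesystem/IO
     * recursion (GFP_NOIO), falling back to vmalloc for large requests. */
    static void **alloc_ptr_array_noio(unsigned long want)
    {
            size_t bytes = sizeof(void *) * want;
            void **p = kzalloc(bytes, GFP_NOIO);

            if (!p)
                    p = __vmalloc(bytes, GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO,
                                  PAGE_KERNEL);
            return p;
    }

    /* Matching free: vfree() for vmalloc'ed memory, kfree() otherwise. */
    static void free_ptr_array(void **p)
    {
            if (is_vmalloc_addr(p))
                    vfree(p);
            else
                    kfree(p);
    }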
*/ int c_sync_rate; /* current resync rate after syncer throttle magic */ - struct fifo_buffer rs_plan_s; /* correction values of resync planer */ + struct fifo_buffer *rs_plan_s; /* correction values of resync planer (RCU, tconn->conn_update) */ int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ - int rs_planed; /* resync sectors already planned */ atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ - int peer_max_bio_size; - int local_max_bio_size; + unsigned int peer_max_bio_size; + unsigned int local_max_bio_size; }; - static inline void drbd_set_flag(struct drbd_conf *mdev, enum drbd_flag f) - { - set_bit(f, &mdev->drbd_flags[0]); - } - - static inline void drbd_clear_flag(struct drbd_conf *mdev, enum drbd_flag f) - { - clear_bit(f, &mdev->drbd_flags[0]); - } - - static inline int drbd_test_flag(struct drbd_conf *mdev, enum drbd_flag f) - { - return test_bit(f, &mdev->drbd_flags[0]); - } - - static inline int drbd_test_and_set_flag(struct drbd_conf *mdev, enum drbd_flag f) - { - return test_and_set_bit(f, &mdev->drbd_flags[0]); - } - - static inline int drbd_test_and_clear_flag(struct drbd_conf *mdev, enum drbd_flag f) - { - return test_and_clear_bit(f, &mdev->drbd_flags[0]); - } - static inline struct drbd_conf *minor_to_mdev(unsigned int minor) { - struct drbd_conf *mdev; - - mdev = minor < minor_count ? minor_table[minor] : NULL; - - return mdev; + return (struct drbd_conf *)idr_find(&minors, minor); } static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) @@@ -1468,17 -1300,20 +1299,20 @@@ struct bm_extent #endif #endif - /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables. - * With a value of 8 all IO in one 128K block make it to the same slot of the - * hash table. */ - #define HT_SHIFT 8 - #define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT)) + /* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE, + * so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte. + * Since we may live in a mixed-platform cluster, + * we limit us to a platform agnostic constant here for now. + * A followup commit may allow even bigger BIO sizes, + * once we thought that through. 
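As a side note on the minor_to_mdev() change above: device minors are now resolved through an idr rather than a fixed minor_table[] array. A minimal sketch of the lookup side only (the idr name mirrors the patch; insertion and locking are omitted):

    #include <linux/idr.h>

    static DEFINE_IDR(minors);    /* maps minor number -> struct drbd_conf * */

    static struct drbd_conf *lookup_minor(unsigned int minor)
    {
            return idr_find(&minors, minor);   /* NULL if nothing registered */
    }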
*/ -#define DRBD_MAX_BIO_SIZE (1 << 20) ++#define DRBD_MAX_BIO_SIZE (1U << 20) + #if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE + #error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE + #endif -#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12) /* Works always = 4k */ +#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ - #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */ - - /* Number of elements in the app_reads_hash */ - #define APP_R_HSIZE 15 -#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* Header 80 only allows packets up to 32KiB data */ -#define DRBD_MAX_BIO_SIZE_P95 (1 << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ ++#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ ++#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ extern int drbd_bm_init(struct drbd_conf *mdev); extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits); @@@ -1575,7 -1419,8 +1418,8 @@@ extern void conn_free_crypto(struct drb extern int proc_details; /* drbd_req */ + extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long); -extern int drbd_make_request(struct request_queue *q, struct bio *bio); +extern void drbd_make_request(struct request_queue *q, struct bio *bio); extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); extern int is_valid_ar_handle(struct drbd_request *, sector_t); diff --cc drivers/block/drbd/drbd_interval.c index 00000000000,0e53f102e68..89c497c630b mode 000000,100644..100644 --- a/drivers/block/drbd/drbd_interval.c +++ b/drivers/block/drbd/drbd_interval.c @@@ -1,0 -1,177 +1,207 @@@ ++#include ++#include + #include "drbd_interval.h" + + /** + * interval_end - return end of @node + */ + static inline + sector_t interval_end(struct rb_node *node) + { + struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb); + return this->end; + } + + /** - * update_interval_end - recompute end of @node ++ * compute_subtree_last - compute end of @node + * + * The end of an interval is the highest (start + (size >> 9)) value of this + * node and of its children. Called for @node and its parents whenever the end + * may have changed. 
+ */ -static void -update_interval_end(struct rb_node *node, void *__unused) ++static inline sector_t ++compute_subtree_last(struct drbd_interval *node) + { - struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb); - sector_t end; ++ sector_t max = node->sector + (node->size >> 9); + - end = this->sector + (this->size >> 9); - if (node->rb_left) { - sector_t left = interval_end(node->rb_left); - if (left > end) - end = left; ++ if (node->rb.rb_left) { ++ sector_t left = interval_end(node->rb.rb_left); ++ if (left > max) ++ max = left; ++ } ++ if (node->rb.rb_right) { ++ sector_t right = interval_end(node->rb.rb_right); ++ if (right > max) ++ max = right; + } - if (node->rb_right) { - sector_t right = interval_end(node->rb_right); - if (right > end) - end = right; ++ return max; ++} ++ ++static void augment_propagate(struct rb_node *rb, struct rb_node *stop) ++{ ++ while (rb != stop) { ++ struct drbd_interval *node = rb_entry(rb, struct drbd_interval, rb); ++ sector_t subtree_last = compute_subtree_last(node); ++ if (node->end == subtree_last) ++ break; ++ node->end = subtree_last; ++ rb = rb_parent(&node->rb); + } - this->end = end; + } + ++static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) ++{ ++ struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb); ++ struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb); ++ ++ new->end = old->end; ++} ++ ++static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) ++{ ++ struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb); ++ struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb); ++ ++ new->end = old->end; ++ old->end = compute_subtree_last(old); ++} ++ ++static const struct rb_augment_callbacks augment_callbacks = { ++ augment_propagate, ++ augment_copy, ++ augment_rotate, ++}; ++ + /** + * drbd_insert_interval - insert a new interval into a tree + */ + bool + drbd_insert_interval(struct rb_root *root, struct drbd_interval *this) + { + struct rb_node **new = &root->rb_node, *parent = NULL; + + BUG_ON(!IS_ALIGNED(this->size, 512)); + + while (*new) { + struct drbd_interval *here = + rb_entry(*new, struct drbd_interval, rb); + + parent = *new; + if (this->sector < here->sector) + new = &(*new)->rb_left; + else if (this->sector > here->sector) + new = &(*new)->rb_right; + else if (this < here) + new = &(*new)->rb_left; + else if (this > here) + new = &(*new)->rb_right; + else + return false; + } + + rb_link_node(&this->rb, parent, new); - rb_insert_color(&this->rb, root); - rb_augment_insert(&this->rb, update_interval_end, NULL); ++ rb_insert_augmented(&this->rb, root, &augment_callbacks); + return true; + } + + /** + * drbd_contains_interval - check if a tree contains a given interval + * @sector: start sector of @interval + * @interval: may not be a valid pointer + * + * Returns if the tree contains the node @interval with start sector @start. + * Does not dereference @interval until @interval is known to be a valid object + * in @tree. Returns %false if @interval is in the tree but with a different + * sector number. 
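To illustrate the interval-tree API introduced in this file: callers embed a struct drbd_interval into their request, insert it keyed by start sector, and later walk all intervals overlapping a [sector, sector + (size >> 9)) range. A hypothetical usage sketch; the tree root and the literal sector/size values are assumptions, not code from this patch:

    #include <linux/rbtree.h>
    #include "drbd_interval.h"

    static void interval_tree_example(void)
    {
            struct rb_root tree = RB_ROOT;  /* e.g. a per-device write_requests tree */
            struct drbd_interval node = {
                    .sector = 2048,         /* start sector */
                    .size   = 4096,         /* bytes, must be a multiple of 512 */
            };
            struct drbd_interval *i;

            drbd_insert_interval(&tree, &node);

            /* visit every interval overlapping [2048, 2048 + (4096 >> 9)) */
            for (i = drbd_find_overlap(&tree, 2048, 4096); i;
                 i = drbd_next_overlap(i, 2048, 4096))
                    ;   /* each i is one conflicting interval */

            drbd_remove_interval(&tree, &node);
    }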
+ */ + bool + drbd_contains_interval(struct rb_root *root, sector_t sector, + struct drbd_interval *interval) + { + struct rb_node *node = root->rb_node; + + while (node) { + struct drbd_interval *here = + rb_entry(node, struct drbd_interval, rb); + + if (sector < here->sector) + node = node->rb_left; + else if (sector > here->sector) + node = node->rb_right; + else if (interval < here) + node = node->rb_left; + else if (interval > here) + node = node->rb_right; + else + return true; + } + return false; + } + + /** + * drbd_remove_interval - remove an interval from a tree + */ + void + drbd_remove_interval(struct rb_root *root, struct drbd_interval *this) + { - struct rb_node *deepest; - - deepest = rb_augment_erase_begin(&this->rb); - rb_erase(&this->rb, root); - rb_augment_erase_end(deepest, update_interval_end, NULL); ++ rb_erase_augmented(&this->rb, root, &augment_callbacks); + } + + /** + * drbd_find_overlap - search for an interval overlapping with [sector, sector + size) + * @sector: start sector + * @size: size, aligned to 512 bytes + * + * Returns an interval overlapping with [sector, sector + size), or NULL if + * there is none. When there is more than one overlapping interval in the + * tree, the interval with the lowest start sector is returned, and all other + * overlapping intervals will be on the right side of the tree, reachable with + * rb_next(). + */ + struct drbd_interval * + drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size) + { + struct rb_node *node = root->rb_node; + struct drbd_interval *overlap = NULL; + sector_t end = sector + (size >> 9); + + BUG_ON(!IS_ALIGNED(size, 512)); + + while (node) { + struct drbd_interval *here = + rb_entry(node, struct drbd_interval, rb); + + if (node->rb_left && + sector < interval_end(node->rb_left)) { + /* Overlap if any must be on left side */ + node = node->rb_left; + } else if (here->sector < end && + sector < here->sector + (here->size >> 9)) { + overlap = here; + break; + } else if (sector >= here->sector) { + /* Overlap if any must be on right side */ + node = node->rb_right; + } else + break; + } + return overlap; + } + + struct drbd_interval * + drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size) + { + sector_t end = sector + (size >> 9); + struct rb_node *node; + + for (;;) { + node = rb_next(&i->rb); + if (!node) + return NULL; + i = rb_entry(node, struct drbd_interval, rb); + if (i->sector >= end) + return NULL; + if (sector < i->sector + (i->size >> 9)) + return i; + } + } diff --cc drivers/block/drbd/drbd_main.c index 9b833e0fb44,be4f5827712..52de26daa1f --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@@ -118,9 -105,8 +105,8 @@@ module_param(fault_devs, int, 0644) /* module parameter, defined */ unsigned int minor_count = DRBD_MINOR_COUNT_DEF; -int disable_sendpage; -int allow_oos; +bool disable_sendpage; +bool allow_oos; - unsigned int cn_idx = CN_IDX_DRBD; int proc_details; /* Detail level in proc drbd*/ /* Module parameter for setting the user mode helper program @@@ -158,1609 -145,184 +145,184 @@@ DEFINE_RATELIMIT_STATE(drbd_ratelimit_s static const struct block_device_operations drbd_ops = { .owner = THIS_MODULE, - .open = drbd_open, - .release = drbd_release, - }; - - struct bio *bio_alloc_drbd(gfp_t gfp_mask) - { - if (!drbd_md_io_bio_set) - return bio_alloc(gfp_mask, 1); - - return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); - } - - #ifdef __CHECKER__ - /* When checking with sparse, and this is an inline function, sparse will 
- give tons of false positives. When this is a real functions sparse works. - */ - int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) - { - int io_allowed; - - atomic_inc(&mdev->local_cnt); - io_allowed = (mdev->state.disk >= mins); - if (!io_allowed) { - if (atomic_dec_and_test(&mdev->local_cnt)) - wake_up(&mdev->misc_wait); - } - return io_allowed; - } - - #endif - - /** - * DOC: The transfer log - * - * The transfer log is a single linked list of &struct drbd_tl_epoch objects. - * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail - * of the list. There is always at least one &struct drbd_tl_epoch object. - * - * Each &struct drbd_tl_epoch has a circular double linked list of requests - * attached. - */ - static int tl_init(struct drbd_conf *mdev) - { - struct drbd_tl_epoch *b; - - /* during device minor initialization, we may well use GFP_KERNEL */ - b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL); - if (!b) - return 0; - INIT_LIST_HEAD(&b->requests); - INIT_LIST_HEAD(&b->w.list); - b->next = NULL; - b->br_number = 4711; - b->n_writes = 0; - b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ - - mdev->oldest_tle = b; - mdev->newest_tle = b; - INIT_LIST_HEAD(&mdev->out_of_sequence_requests); - INIT_LIST_HEAD(&mdev->barrier_acked_requests); - - mdev->tl_hash = NULL; - mdev->tl_hash_s = 0; - - return 1; - } - - static void tl_cleanup(struct drbd_conf *mdev) - { - D_ASSERT(mdev->oldest_tle == mdev->newest_tle); - D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); - kfree(mdev->oldest_tle); - mdev->oldest_tle = NULL; - kfree(mdev->unused_spare_tle); - mdev->unused_spare_tle = NULL; - kfree(mdev->tl_hash); - mdev->tl_hash = NULL; - mdev->tl_hash_s = 0; - } - - /** - * _tl_add_barrier() - Adds a barrier to the transfer log - * @mdev: DRBD device. - * @new: Barrier to be added before the current head of the TL. - * - * The caller must hold the req_lock. - */ - void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) - { - struct drbd_tl_epoch *newest_before; - - INIT_LIST_HEAD(&new->requests); - INIT_LIST_HEAD(&new->w.list); - new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ - new->next = NULL; - new->n_writes = 0; - - newest_before = mdev->newest_tle; - new->br_number = newest_before->br_number+1; - if (mdev->newest_tle != new) { - mdev->newest_tle->next = new; - mdev->newest_tle = new; - } - } - - /** - * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL - * @mdev: DRBD device. - * @barrier_nr: Expected identifier of the DRBD write barrier packet. - * @set_size: Expected number of requests before that barrier. - * - * In case the passed barrier_nr or set_size does not match the oldest - * &struct drbd_tl_epoch objects this function will cause a termination - * of the connection. - */ - void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, - unsigned int set_size) - { - struct drbd_tl_epoch *b, *nob; /* next old barrier */ - struct list_head *le, *tle; - struct drbd_request *r; - - spin_lock_irq(&mdev->req_lock); - - b = mdev->oldest_tle; - - /* first some paranoia code */ - if (b == NULL) { - dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", - barrier_nr); - goto bail; - } - if (b->br_number != barrier_nr) { - dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n", - barrier_nr, b->br_number); - goto bail; - } - if (b->n_writes != set_size) { - dev_err(DEV, "BAD! 
BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", - barrier_nr, set_size, b->n_writes); - goto bail; - } - - /* Clean up list of requests processed during current epoch */ - list_for_each_safe(le, tle, &b->requests) { - r = list_entry(le, struct drbd_request, tl_requests); - _req_mod(r, barrier_acked); - } - /* There could be requests on the list waiting for completion - of the write to the local disk. To avoid corruptions of - slab's data structures we have to remove the lists head. - - Also there could have been a barrier ack out of sequence, overtaking - the write acks - which would be a bug and violating write ordering. - To not deadlock in case we lose connection while such requests are - still pending, we need some way to find them for the - _req_mode(connection_lost_while_pending). - - These have been list_move'd to the out_of_sequence_requests list in - _req_mod(, barrier_acked) above. - */ - list_splice_init(&b->requests, &mdev->barrier_acked_requests); - - nob = b->next; - if (drbd_test_and_clear_flag(mdev, CREATE_BARRIER)) { - _tl_add_barrier(mdev, b); - if (nob) - mdev->oldest_tle = nob; - /* if nob == NULL b was the only barrier, and becomes the new - barrier. Therefore mdev->oldest_tle points already to b */ - } else { - D_ASSERT(nob != NULL); - mdev->oldest_tle = nob; - kfree(b); - } - - spin_unlock_irq(&mdev->req_lock); - dec_ap_pending(mdev); - - return; - - bail: - spin_unlock_irq(&mdev->req_lock); - drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); - } - - - /** - * _tl_restart() - Walks the transfer log, and applies an action to all requests - * @mdev: DRBD device. - * @what: The action/event to perform with all request objects - * - * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io, - * restart_frozen_disk_io. - */ - static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) - { - struct drbd_tl_epoch *b, *tmp, **pn; - struct list_head *le, *tle, carry_reads; - struct drbd_request *req; - int rv, n_writes, n_reads; - - b = mdev->oldest_tle; - pn = &mdev->oldest_tle; - while (b) { - n_writes = 0; - n_reads = 0; - INIT_LIST_HEAD(&carry_reads); - list_for_each_safe(le, tle, &b->requests) { - req = list_entry(le, struct drbd_request, tl_requests); - rv = _req_mod(req, what); - - n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT; - n_reads += (rv & MR_READ) >> MR_READ_SHIFT; - } - tmp = b->next; - - if (n_writes) { - if (what == resend) { - b->n_writes = n_writes; - if (b->w.cb == NULL) { - b->w.cb = w_send_barrier; - inc_ap_pending(mdev); - drbd_set_flag(mdev, CREATE_BARRIER); - } - - drbd_queue_work(&mdev->data.work, &b->w); - } - pn = &b->next; - } else { - if (n_reads) - list_add(&carry_reads, &b->requests); - /* there could still be requests on that ring list, - * in case local io is still pending */ - list_del(&b->requests); - - /* dec_ap_pending corresponding to queue_barrier. - * the newest barrier may not have been queued yet, - * in which case w.cb is still NULL. */ - if (b->w.cb != NULL) - dec_ap_pending(mdev); - - if (b == mdev->newest_tle) { - /* recycle, but reinit! */ - D_ASSERT(tmp == NULL); - INIT_LIST_HEAD(&b->requests); - list_splice(&carry_reads, &b->requests); - INIT_LIST_HEAD(&b->w.list); - b->w.cb = NULL; - b->br_number = net_random(); - b->n_writes = 0; - - *pn = b; - break; - } - *pn = tmp; - kfree(b); - } - b = tmp; - list_splice(&carry_reads, &b->requests); - } - - /* Actions operating on the disk state, also want to work on - requests that got barrier acked. 
*/ - - list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { - req = list_entry(le, struct drbd_request, tl_requests); - _req_mod(req, what); - } - } - - - /** - * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL - * @mdev: DRBD device. - * - * This is called after the connection to the peer was lost. The storage covered - * by the requests on the transfer gets marked as our of sync. Called from the - * receiver thread and the worker thread. - */ - void tl_clear(struct drbd_conf *mdev) - { - spin_lock_irq(&mdev->req_lock); - _tl_clear(mdev); - spin_unlock_irq(&mdev->req_lock); - } - - static void _tl_clear(struct drbd_conf *mdev) - { - struct list_head *le, *tle; - struct drbd_request *r; - - _tl_restart(mdev, connection_lost_while_pending); - - /* we expect this list to be empty. */ - D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); - - /* but just in case, clean it up anyways! */ - list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) { - r = list_entry(le, struct drbd_request, tl_requests); - /* It would be nice to complete outside of spinlock. - * But this is easier for now. */ - _req_mod(r, connection_lost_while_pending); - } - - /* ensure bit indicating barrier is required is clear */ - drbd_clear_flag(mdev, CREATE_BARRIER); - - memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); - - } - - void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) - { - spin_lock_irq(&mdev->req_lock); - _tl_restart(mdev, what); - spin_unlock_irq(&mdev->req_lock); - } - - /** - * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL - * @mdev: DRBD device. - */ - void tl_abort_disk_io(struct drbd_conf *mdev) - { - struct drbd_tl_epoch *b; - struct list_head *le, *tle; - struct drbd_request *req; - - spin_lock_irq(&mdev->req_lock); - b = mdev->oldest_tle; - while (b) { - list_for_each_safe(le, tle, &b->requests) { - req = list_entry(le, struct drbd_request, tl_requests); - if (!(req->rq_state & RQ_LOCAL_PENDING)) - continue; - _req_mod(req, abort_disk_io); - } - b = b->next; - } - - list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { - req = list_entry(le, struct drbd_request, tl_requests); - if (!(req->rq_state & RQ_LOCAL_PENDING)) - continue; - _req_mod(req, abort_disk_io); - } - - spin_unlock_irq(&mdev->req_lock); - } - - /** - * cl_wide_st_chg() - true if the state change is a cluster wide one - * @mdev: DRBD device. - * @os: old (current) state. - * @ns: new (wanted) state. 
- */ - static int cl_wide_st_chg(struct drbd_conf *mdev, - union drbd_state os, union drbd_state ns) - { - return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && - ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || - (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || - (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || - (os.disk != D_FAILED && ns.disk == D_FAILED))) || - (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || - (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); - } - - enum drbd_state_rv - drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, - union drbd_state mask, union drbd_state val) - { - unsigned long flags; - union drbd_state os, ns; - enum drbd_state_rv rv; - - spin_lock_irqsave(&mdev->req_lock, flags); - os = mdev->state; - ns.i = (os.i & ~mask.i) | val.i; - rv = _drbd_set_state(mdev, ns, f, NULL); - ns = mdev->state; - spin_unlock_irqrestore(&mdev->req_lock, flags); - - return rv; - } - - /** - * drbd_force_state() - Impose a change which happens outside our control on our state - * @mdev: DRBD device. - * @mask: mask of state bits to change. - * @val: value of new state bits. - */ - void drbd_force_state(struct drbd_conf *mdev, - union drbd_state mask, union drbd_state val) - { - drbd_change_state(mdev, CS_HARD, mask, val); - } - - static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); - static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *, - union drbd_state, - union drbd_state); - enum sanitize_state_warnings { - NO_WARNING, - ABORTED_ONLINE_VERIFY, - ABORTED_RESYNC, - CONNECTION_LOST_NEGOTIATING, - IMPLICITLY_UPGRADED_DISK, - IMPLICITLY_UPGRADED_PDSK, - }; - static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, enum sanitize_state_warnings *warn); - int drbd_send_state_req(struct drbd_conf *, - union drbd_state, union drbd_state); - - static enum drbd_state_rv - _req_st_cond(struct drbd_conf *mdev, union drbd_state mask, - union drbd_state val) - { - union drbd_state os, ns; - unsigned long flags; - enum drbd_state_rv rv; - - if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_SUCCESS)) - return SS_CW_SUCCESS; - - if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_FAIL)) - return SS_CW_FAILED_BY_PEER; - - rv = 0; - spin_lock_irqsave(&mdev->req_lock, flags); - os = mdev->state; - ns.i = (os.i & ~mask.i) | val.i; - ns = sanitize_state(mdev, os, ns, NULL); - - if (!cl_wide_st_chg(mdev, os, ns)) - rv = SS_CW_NO_NEED; - if (!rv) { - rv = is_valid_state(mdev, ns); - if (rv == SS_SUCCESS) { - rv = is_valid_state_transition(mdev, ns, os); - if (rv == SS_SUCCESS) - rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ - } - } - spin_unlock_irqrestore(&mdev->req_lock, flags); - - return rv; - } - - /** - * drbd_req_state() - Perform an eventually cluster wide state change - * @mdev: DRBD device. - * @mask: mask of state bits to change. - * @val: value of new state bits. - * @f: flags - * - * Should not be called directly, use drbd_request_state() or - * _drbd_request_state(). 
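For reference, the mask/val idiom used by drbd_change_state() above and drbd_req_state() below composes the requested change into the current state word: only the fields selected by mask are replaced, everything else carries over. A minimal illustration with plain integers (made-up values, not the real union drbd_state layout):

    unsigned int os   = 0x00012345;          /* current state word */
    unsigned int mask = 0x0000ff00;          /* fields the caller wants to change */
    unsigned int val  = 0x00006700;          /* new values for exactly those fields */
    unsigned int ns   = (os & ~mask) | val;  /* == 0x00016745 */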
- */ - static enum drbd_state_rv - drbd_req_state(struct drbd_conf *mdev, union drbd_state mask, - union drbd_state val, enum chg_state_flags f) - { - struct completion done; - unsigned long flags; - union drbd_state os, ns; - enum drbd_state_rv rv; - - init_completion(&done); - - if (f & CS_SERIALIZE) - mutex_lock(&mdev->state_mutex); - - spin_lock_irqsave(&mdev->req_lock, flags); - os = mdev->state; - ns.i = (os.i & ~mask.i) | val.i; - ns = sanitize_state(mdev, os, ns, NULL); - - if (cl_wide_st_chg(mdev, os, ns)) { - rv = is_valid_state(mdev, ns); - if (rv == SS_SUCCESS) - rv = is_valid_state_transition(mdev, ns, os); - spin_unlock_irqrestore(&mdev->req_lock, flags); - - if (rv < SS_SUCCESS) { - if (f & CS_VERBOSE) - print_st_err(mdev, os, ns, rv); - goto abort; - } - - drbd_state_lock(mdev); - if (!drbd_send_state_req(mdev, mask, val)) { - drbd_state_unlock(mdev); - rv = SS_CW_FAILED_BY_PEER; - if (f & CS_VERBOSE) - print_st_err(mdev, os, ns, rv); - goto abort; - } - - if (mask.conn == C_MASK && val.conn == C_DISCONNECTING) - drbd_set_flag(mdev, DISCONNECT_SENT); - - wait_event(mdev->state_wait, - (rv = _req_st_cond(mdev, mask, val))); - - if (rv < SS_SUCCESS) { - drbd_state_unlock(mdev); - if (f & CS_VERBOSE) - print_st_err(mdev, os, ns, rv); - goto abort; - } - spin_lock_irqsave(&mdev->req_lock, flags); - os = mdev->state; - ns.i = (os.i & ~mask.i) | val.i; - rv = _drbd_set_state(mdev, ns, f, &done); - drbd_state_unlock(mdev); - } else { - rv = _drbd_set_state(mdev, ns, f, &done); - } - - spin_unlock_irqrestore(&mdev->req_lock, flags); - - if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { - D_ASSERT(current != mdev->worker.task); - wait_for_completion(&done); - } - - abort: - if (f & CS_SERIALIZE) - mutex_unlock(&mdev->state_mutex); - - return rv; - } - - /** - * _drbd_request_state() - Request a state change (with flags) - * @mdev: DRBD device. - * @mask: mask of state bits to change. - * @val: value of new state bits. - * @f: flags - * - * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE - * flag, or when logging of failed state change requests is not desired. - */ - enum drbd_state_rv - _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, - union drbd_state val, enum chg_state_flags f) - { - enum drbd_state_rv rv; - - wait_event(mdev->state_wait, - (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); - - return rv; - } - - static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) - { - dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n", - name, - drbd_conn_str(ns.conn), - drbd_role_str(ns.role), - drbd_role_str(ns.peer), - drbd_disk_str(ns.disk), - drbd_disk_str(ns.pdsk), - is_susp(ns) ? 's' : 'r', - ns.aftr_isp ? 'a' : '-', - ns.peer_isp ? 'p' : '-', - ns.user_isp ? 'u' : '-' - ); - } - - void print_st_err(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, enum drbd_state_rv err) - { - if (err == SS_IN_TRANSIENT_STATE) - return; - dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); - print_st(mdev, " state", os); - print_st(mdev, "wanted", ns); - } - - - /** - * is_valid_state() - Returns an SS_ error code if ns is not valid - * @mdev: DRBD device. - * @ns: State to consider. 
- */ - static enum drbd_state_rv - is_valid_state(struct drbd_conf *mdev, union drbd_state ns) - { - /* See drbd_state_sw_errors in drbd_strings.c */ - - enum drbd_fencing_p fp; - enum drbd_state_rv rv = SS_SUCCESS; - - fp = FP_DONT_CARE; - if (get_ldev(mdev)) { - fp = mdev->ldev->dc.fencing; - put_ldev(mdev); - } - - if (get_net_conf(mdev)) { - if (!mdev->net_conf->two_primaries && - ns.role == R_PRIMARY && ns.peer == R_PRIMARY) - rv = SS_TWO_PRIMARIES; - put_net_conf(mdev); - } - - if (rv <= 0) - /* already found a reason to abort */; - else if (ns.role == R_SECONDARY && mdev->open_cnt) - rv = SS_DEVICE_IN_USE; - - else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) - rv = SS_NO_UP_TO_DATE_DISK; - - else if (fp >= FP_RESOURCE && - ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) - rv = SS_PRIMARY_NOP; - - else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) - rv = SS_NO_UP_TO_DATE_DISK; - - else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) - rv = SS_NO_LOCAL_DISK; - - else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) - rv = SS_NO_REMOTE_DISK; - - else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) - rv = SS_NO_UP_TO_DATE_DISK; - - else if ((ns.conn == C_CONNECTED || - ns.conn == C_WF_BITMAP_S || - ns.conn == C_SYNC_SOURCE || - ns.conn == C_PAUSED_SYNC_S) && - ns.disk == D_OUTDATED) - rv = SS_CONNECTED_OUTDATES; - - else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && - (mdev->sync_conf.verify_alg[0] == 0)) - rv = SS_NO_VERIFY_ALG; - - else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && - mdev->agreed_pro_version < 88) - rv = SS_NOT_SUPPORTED; - - else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) - rv = SS_CONNECTED_OUTDATES; - - return rv; - } - - /** - * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible - * @mdev: DRBD device. - * @ns: new state. - * @os: old state. - */ - static enum drbd_state_rv - is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, - union drbd_state os) - { - enum drbd_state_rv rv = SS_SUCCESS; - - if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && - os.conn > C_CONNECTED) - rv = SS_RESYNC_RUNNING; - - if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) - rv = SS_ALREADY_STANDALONE; - - if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) - rv = SS_IS_DISKLESS; - - if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) - rv = SS_NO_NET_CONFIG; - - if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) - rv = SS_LOWER_THAN_OUTDATED; - - if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) - rv = SS_IN_TRANSIENT_STATE; - - if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) - rv = SS_IN_TRANSIENT_STATE; - - /* While establishing a connection only allow cstate to change. - Delay/refuse role changes, detach attach etc... 
*/ - if (drbd_test_flag(mdev, STATE_SENT) && - !(os.conn == C_WF_REPORT_PARAMS || - (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) - rv = SS_IN_TRANSIENT_STATE; - - if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) - rv = SS_NEED_CONNECTION; - - if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && - ns.conn != os.conn && os.conn > C_CONNECTED) - rv = SS_RESYNC_RUNNING; - - if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && - os.conn < C_CONNECTED) - rv = SS_NEED_CONNECTION; - - if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE) - && os.conn < C_WF_REPORT_PARAMS) - rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ - - return rv; - } - - static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) - { - static const char *msg_table[] = { - [NO_WARNING] = "", - [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", - [ABORTED_RESYNC] = "Resync aborted.", - [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", - [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", - [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", - }; - - if (warn != NO_WARNING) - dev_warn(DEV, "%s\n", msg_table[warn]); - } - - /** - * sanitize_state() - Resolves implicitly necessary additional changes to a state transition - * @mdev: DRBD device. - * @os: old state. - * @ns: new state. - * @warn_sync_abort: - * - * When we loose connection, we have to set the state of the peers disk (pdsk) - * to D_UNKNOWN. This rule and many more along those lines are in this function. - */ - static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, enum sanitize_state_warnings *warn) - { - enum drbd_fencing_p fp; - enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; - - if (warn) - *warn = NO_WARNING; - - fp = FP_DONT_CARE; - if (get_ldev(mdev)) { - fp = mdev->ldev->dc.fencing; - put_ldev(mdev); - } - - /* Disallow Network errors to configure a device's network part */ - if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) && - os.conn <= C_DISCONNECTING) - ns.conn = os.conn; - - /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow. - * If you try to go into some Sync* state, that shall fail (elsewhere). */ - if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && - ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED) - ns.conn = os.conn; - - /* we cannot fail (again) if we already detached */ - if (ns.disk == D_FAILED && os.disk == D_DISKLESS) - ns.disk = D_DISKLESS; - - /* After C_DISCONNECTING only C_STANDALONE may follow */ - if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) - ns.conn = os.conn; - - if (ns.conn < C_CONNECTED) { - ns.peer_isp = 0; - ns.peer = R_UNKNOWN; - if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) - ns.pdsk = D_UNKNOWN; - } - - /* Clear the aftr_isp when becoming unconfigured */ - if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) - ns.aftr_isp = 0; - - /* Abort resync if a disk fails/detaches */ - if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && - (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { - if (warn) - *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? 
- ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; - ns.conn = C_CONNECTED; - } - - /* Connection breaks down before we finished "Negotiating" */ - if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && - get_ldev_if_state(mdev, D_NEGOTIATING)) { - if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { - ns.disk = mdev->new_state_tmp.disk; - ns.pdsk = mdev->new_state_tmp.pdsk; - } else { - if (warn) - *warn = CONNECTION_LOST_NEGOTIATING; - ns.disk = D_DISKLESS; - ns.pdsk = D_UNKNOWN; - } - put_ldev(mdev); - } - - /* D_CONSISTENT and D_OUTDATED vanish when we get connected */ - if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) { - if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) - ns.disk = D_UP_TO_DATE; - if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED) - ns.pdsk = D_UP_TO_DATE; - } - - /* Implications of the connection stat on the disk states */ - disk_min = D_DISKLESS; - disk_max = D_UP_TO_DATE; - pdsk_min = D_INCONSISTENT; - pdsk_max = D_UNKNOWN; - switch ((enum drbd_conns)ns.conn) { - case C_WF_BITMAP_T: - case C_PAUSED_SYNC_T: - case C_STARTING_SYNC_T: - case C_WF_SYNC_UUID: - case C_BEHIND: - disk_min = D_INCONSISTENT; - disk_max = D_OUTDATED; - pdsk_min = D_UP_TO_DATE; - pdsk_max = D_UP_TO_DATE; - break; - case C_VERIFY_S: - case C_VERIFY_T: - disk_min = D_UP_TO_DATE; - disk_max = D_UP_TO_DATE; - pdsk_min = D_UP_TO_DATE; - pdsk_max = D_UP_TO_DATE; - break; - case C_CONNECTED: - disk_min = D_DISKLESS; - disk_max = D_UP_TO_DATE; - pdsk_min = D_DISKLESS; - pdsk_max = D_UP_TO_DATE; - break; - case C_WF_BITMAP_S: - case C_PAUSED_SYNC_S: - case C_STARTING_SYNC_S: - case C_AHEAD: - disk_min = D_UP_TO_DATE; - disk_max = D_UP_TO_DATE; - pdsk_min = D_INCONSISTENT; - pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/ - break; - case C_SYNC_TARGET: - disk_min = D_INCONSISTENT; - disk_max = D_INCONSISTENT; - pdsk_min = D_UP_TO_DATE; - pdsk_max = D_UP_TO_DATE; - break; - case C_SYNC_SOURCE: - disk_min = D_UP_TO_DATE; - disk_max = D_UP_TO_DATE; - pdsk_min = D_INCONSISTENT; - pdsk_max = D_INCONSISTENT; - break; - case C_STANDALONE: - case C_DISCONNECTING: - case C_UNCONNECTED: - case C_TIMEOUT: - case C_BROKEN_PIPE: - case C_NETWORK_FAILURE: - case C_PROTOCOL_ERROR: - case C_TEAR_DOWN: - case C_WF_CONNECTION: - case C_WF_REPORT_PARAMS: - case C_MASK: - break; - } - if (ns.disk > disk_max) - ns.disk = disk_max; - - if (ns.disk < disk_min) { - if (warn) - *warn = IMPLICITLY_UPGRADED_DISK; - ns.disk = disk_min; - } - if (ns.pdsk > pdsk_max) - ns.pdsk = pdsk_max; - - if (ns.pdsk < pdsk_min) { - if (warn) - *warn = IMPLICITLY_UPGRADED_PDSK; - ns.pdsk = pdsk_min; - } - - if (fp == FP_STONITH && - (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && - !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) - ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ - - if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO && - (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && - !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) - ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ - - if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { - if (ns.conn == C_SYNC_SOURCE) - ns.conn = C_PAUSED_SYNC_S; - if (ns.conn == C_SYNC_TARGET) - ns.conn = C_PAUSED_SYNC_T; - } else { - if (ns.conn == C_PAUSED_SYNC_S) - ns.conn = C_SYNC_SOURCE; - if (ns.conn == C_PAUSED_SYNC_T) - ns.conn = C_SYNC_TARGET; - } - - return ns; - } - - /* helper for 
__drbd_set_state */ - static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) - { - if (mdev->agreed_pro_version < 90) - mdev->ov_start_sector = 0; - mdev->rs_total = drbd_bm_bits(mdev); - mdev->ov_position = 0; - if (cs == C_VERIFY_T) { - /* starting online verify from an arbitrary position - * does not fit well into the existing protocol. - * on C_VERIFY_T, we initialize ov_left and friends - * implicitly in receive_DataRequest once the - * first P_OV_REQUEST is received */ - mdev->ov_start_sector = ~(sector_t)0; - } else { - unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); - if (bit >= mdev->rs_total) { - mdev->ov_start_sector = - BM_BIT_TO_SECT(mdev->rs_total - 1); - mdev->rs_total = 1; - } else - mdev->rs_total -= bit; - mdev->ov_position = mdev->ov_start_sector; - } - mdev->ov_left = mdev->rs_total; - } - - static void drbd_resume_al(struct drbd_conf *mdev) - { - if (drbd_test_and_clear_flag(mdev, AL_SUSPENDED)) - dev_info(DEV, "Resumed AL updates\n"); - } - - /** - * __drbd_set_state() - Set a new DRBD state - * @mdev: DRBD device. - * @ns: new state. - * @flags: Flags - * @done: Optional completion, that will get completed after the after_state_ch() finished - * - * Caller needs to hold req_lock, and global_state_lock. Do not call directly. - */ - enum drbd_state_rv - __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, - enum chg_state_flags flags, struct completion *done) - { - union drbd_state os; - enum drbd_state_rv rv = SS_SUCCESS; - enum sanitize_state_warnings ssw; - struct after_state_chg_work *ascw; - - os = mdev->state; - - ns = sanitize_state(mdev, os, ns, &ssw); - - if (ns.i == os.i) - return SS_NOTHING_TO_DO; - - if (!(flags & CS_HARD)) { - /* pre-state-change checks ; only look at ns */ - /* See drbd_state_sw_errors in drbd_strings.c */ - - rv = is_valid_state(mdev, ns); - if (rv < SS_SUCCESS) { - /* If the old state was illegal as well, then let - this happen...*/ - - if (is_valid_state(mdev, os) == rv) - rv = is_valid_state_transition(mdev, ns, os); - } else - rv = is_valid_state_transition(mdev, ns, os); - } - - if (rv < SS_SUCCESS) { - if (flags & CS_VERBOSE) - print_st_err(mdev, os, ns, rv); - return rv; - } - - print_sanitize_warnings(mdev, ssw); - - { - char *pbp, pb[300]; - pbp = pb; - *pbp = 0; - if (ns.role != os.role) - pbp += sprintf(pbp, "role( %s -> %s ) ", - drbd_role_str(os.role), - drbd_role_str(ns.role)); - if (ns.peer != os.peer) - pbp += sprintf(pbp, "peer( %s -> %s ) ", - drbd_role_str(os.peer), - drbd_role_str(ns.peer)); - if (ns.conn != os.conn) - pbp += sprintf(pbp, "conn( %s -> %s ) ", - drbd_conn_str(os.conn), - drbd_conn_str(ns.conn)); - if (ns.disk != os.disk) - pbp += sprintf(pbp, "disk( %s -> %s ) ", - drbd_disk_str(os.disk), - drbd_disk_str(ns.disk)); - if (ns.pdsk != os.pdsk) - pbp += sprintf(pbp, "pdsk( %s -> %s ) ", - drbd_disk_str(os.pdsk), - drbd_disk_str(ns.pdsk)); - if (is_susp(ns) != is_susp(os)) - pbp += sprintf(pbp, "susp( %d -> %d ) ", - is_susp(os), - is_susp(ns)); - if (ns.aftr_isp != os.aftr_isp) - pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ", - os.aftr_isp, - ns.aftr_isp); - if (ns.peer_isp != os.peer_isp) - pbp += sprintf(pbp, "peer_isp( %d -> %d ) ", - os.peer_isp, - ns.peer_isp); - if (ns.user_isp != os.user_isp) - pbp += sprintf(pbp, "user_isp( %d -> %d ) ", - os.user_isp, - ns.user_isp); - dev_info(DEV, "%s\n", pb); - } - - /* solve the race between becoming unconfigured, - * worker doing the cleanup, and - * admin reconfiguring us: - * on (re)configure, first set CONFIG_PENDING, - * 
then wait for a potentially exiting worker, - * start the worker, and schedule one no_op. - * then proceed with configuration. - */ - if (ns.disk == D_DISKLESS && - ns.conn == C_STANDALONE && - ns.role == R_SECONDARY && - !drbd_test_and_set_flag(mdev, CONFIG_PENDING)) - drbd_set_flag(mdev, DEVICE_DYING); - - /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference - * on the ldev here, to be sure the transition -> D_DISKLESS resp. - * drbd_ldev_destroy() won't happen before our corresponding - * after_state_ch works run, where we put_ldev again. */ - if ((os.disk != D_FAILED && ns.disk == D_FAILED) || - (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) - atomic_inc(&mdev->local_cnt); - - mdev->state = ns; - - if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) - drbd_print_uuids(mdev, "attached to UUIDs"); - - wake_up(&mdev->misc_wait); - wake_up(&mdev->state_wait); - - /* Aborted verify run, or we reached the stop sector. - * Log the last position, unless end-of-device. */ - if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && - ns.conn <= C_CONNECTED) { - mdev->ov_start_sector = - BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left); - if (mdev->ov_left) - dev_info(DEV, "Online Verify reached sector %llu\n", - (unsigned long long)mdev->ov_start_sector); - } - - if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && - (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { - dev_info(DEV, "Syncer continues.\n"); - mdev->rs_paused += (long)jiffies - -(long)mdev->rs_mark_time[mdev->rs_last_mark]; - if (ns.conn == C_SYNC_TARGET) - mod_timer(&mdev->resync_timer, jiffies); - } - - if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && - (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { - dev_info(DEV, "Resync suspended\n"); - mdev->rs_mark_time[mdev->rs_last_mark] = jiffies; - } - - if (os.conn == C_CONNECTED && - (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { - unsigned long now = jiffies; - int i; - - set_ov_position(mdev, ns.conn); - mdev->rs_start = now; - mdev->rs_last_events = 0; - mdev->rs_last_sect_ev = 0; - mdev->ov_last_oos_size = 0; - mdev->ov_last_oos_start = 0; - - for (i = 0; i < DRBD_SYNC_MARKS; i++) { - mdev->rs_mark_left[i] = mdev->ov_left; - mdev->rs_mark_time[i] = now; - } - - drbd_rs_controller_reset(mdev); - - if (ns.conn == C_VERIFY_S) { - dev_info(DEV, "Starting Online Verify from sector %llu\n", - (unsigned long long)mdev->ov_position); - mod_timer(&mdev->resync_timer, jiffies); - } - } - - if (get_ldev(mdev)) { - u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| - MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| - MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); - - if (drbd_test_flag(mdev, CRASHED_PRIMARY)) - mdf |= MDF_CRASHED_PRIMARY; - if (mdev->state.role == R_PRIMARY || - (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) - mdf |= MDF_PRIMARY_IND; - if (mdev->state.conn > C_WF_REPORT_PARAMS) - mdf |= MDF_CONNECTED_IND; - if (mdev->state.disk > D_INCONSISTENT) - mdf |= MDF_CONSISTENT; - if (mdev->state.disk > D_OUTDATED) - mdf |= MDF_WAS_UP_TO_DATE; - if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) - mdf |= MDF_PEER_OUT_DATED; - if (mdf != mdev->ldev->md.flags) { - mdev->ldev->md.flags = mdf; - drbd_md_mark_dirty(mdev); - } - if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) - drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); - put_ldev(mdev); - } - - /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ - if (os.disk == D_INCONSISTENT && 
os.pdsk == D_INCONSISTENT && - os.peer == R_SECONDARY && ns.peer == R_PRIMARY) - drbd_set_flag(mdev, CONSIDER_RESYNC); - - /* Receiver should clean up itself */ - if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) - drbd_thread_stop_nowait(&mdev->receiver); - - /* Now the receiver finished cleaning up itself, it should die */ - if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) - drbd_thread_stop_nowait(&mdev->receiver); - - /* Upon network failure, we need to restart the receiver. */ - if (os.conn > C_WF_CONNECTION && - ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) - drbd_thread_restart_nowait(&mdev->receiver); - - /* Resume AL writing if we get a connection */ - if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) - drbd_resume_al(mdev); - - /* remember last connect and attach times so request_timer_fn() won't - * kill newly established sessions while we are still trying to thaw - * previously frozen IO */ - if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS) - mdev->last_reconnect_jif = jiffies; - if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && - ns.disk > D_NEGOTIATING) - mdev->last_reattach_jif = jiffies; - - ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); - if (ascw) { - ascw->os = os; - ascw->ns = ns; - ascw->flags = flags; - ascw->w.cb = w_after_state_ch; - ascw->done = done; - drbd_queue_work(&mdev->data.work, &ascw->w); - } else { - dev_warn(DEV, "Could not kmalloc an ascw\n"); - } - - return rv; - } - - static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused) - { - struct after_state_chg_work *ascw = - container_of(w, struct after_state_chg_work, w); - after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); - if (ascw->flags & CS_WAIT_COMPLETE) { - D_ASSERT(ascw->done != NULL); - complete(ascw->done); - } - kfree(ascw); - - return 1; - } - - static void abw_start_sync(struct drbd_conf *mdev, int rv) - { - if (rv) { - dev_err(DEV, "Writing the bitmap failed not starting resync.\n"); - _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); - return; - } - - switch (mdev->state.conn) { - case C_STARTING_SYNC_T: - _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); - break; - case C_STARTING_SYNC_S: - drbd_start_resync(mdev, C_SYNC_SOURCE); - break; - } - } - - int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, - int (*io_fn)(struct drbd_conf *), - char *why, enum bm_flag flags) - { - int rv; - - D_ASSERT(current == mdev->worker.task); - - /* open coded non-blocking drbd_suspend_io(mdev); */ - drbd_set_flag(mdev, SUSPEND_IO); + .open = drbd_open, + .release = drbd_release, + }; - drbd_bm_lock(mdev, why, flags); - rv = io_fn(mdev); - drbd_bm_unlock(mdev); -static void bio_destructor_drbd(struct bio *bio) -{ - bio_free(bio, drbd_md_io_bio_set); -} - + struct bio *bio_alloc_drbd(gfp_t gfp_mask) + { + struct bio *bio; - drbd_resume_io(mdev); + if (!drbd_md_io_bio_set) + return bio_alloc(gfp_mask, 1); - return rv; + bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); + if (!bio) + return NULL; - bio->bi_destructor = bio_destructor_drbd; + return bio; } - /** - * after_state_ch() - Perform after state change actions that may sleep - * @mdev: DRBD device. - * @os: old state. - * @ns: new state. - * @flags: Flags + #ifdef __CHECKER__ + /* When checking with sparse, and this is an inline function, sparse will + give tons of false positives. When this is a real functions sparse works. 
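The _get_ldev_if_state() body that follows is the usual conditional-get pattern: take the reference first, re-check the condition, and if it no longer holds, drop the reference again and wake anyone waiting for it to reach zero. A generic sketch of the pattern with made-up names (not DRBD code):

    #include <linux/types.h>
    #include <linux/atomic.h>
    #include <linux/wait.h>

    struct obj {
            atomic_t          refs;
            int               state;
            wait_queue_head_t wait;   /* woken when refs drops to zero */
    };

    /* Returns true and holds a reference iff o->state is still >= min. */
    static bool get_obj_if_state(struct obj *o, int min)
    {
            atomic_inc(&o->refs);
            if (o->state >= min)
                    return true;
            if (atomic_dec_and_test(&o->refs))
                    wake_up(&o->wait);
            return false;
    }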
*/ - static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags) + int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) { - enum drbd_fencing_p fp; - enum drbd_req_event what = nothing; - union drbd_state nsm = (union drbd_state){ .i = -1 }; - - if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { - drbd_clear_flag(mdev, CRASHED_PRIMARY); - if (mdev->p_uuid) - mdev->p_uuid[UI_FLAGS] &= ~((u64)2); - } + int io_allowed; - fp = FP_DONT_CARE; - if (get_ldev(mdev)) { - fp = mdev->ldev->dc.fencing; - put_ldev(mdev); + atomic_inc(&mdev->local_cnt); + io_allowed = (mdev->state.disk >= mins); + if (!io_allowed) { + if (atomic_dec_and_test(&mdev->local_cnt)) + wake_up(&mdev->misc_wait); } + return io_allowed; + } - /* Inform userspace about the change... */ - drbd_bcast_state(mdev, ns); - - if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && - (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) - drbd_khelper(mdev, "pri-on-incon-degr"); - - /* Here we have the actions that are performed after a - state change. This function might sleep */ - - if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING) - mod_timer(&mdev->request_timer, jiffies + HZ); - - nsm.i = -1; - if (ns.susp_nod) { - if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) - what = resend; - - if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && - ns.disk > D_NEGOTIATING) - what = restart_frozen_disk_io; - - if (what != nothing) - nsm.susp_nod = 0; - } + #endif - if (ns.susp_fen) { - /* case1: The outdate peer handler is successful: */ - if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) { - if (drbd_test_flag(mdev, NEW_CUR_UUID)) { - drbd_uuid_new_current(mdev); - drbd_clear_flag(mdev, NEW_CUR_UUID); - } - spin_lock_irq(&mdev->req_lock); - _tl_clear(mdev); - _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); - spin_unlock_irq(&mdev->req_lock); - } - /* case2: The connection was established again: */ - if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { - drbd_clear_flag(mdev, NEW_CUR_UUID); - what = resend; - nsm.susp_fen = 0; + /** + * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch + * @tconn: DRBD connection. + * @barrier_nr: Expected identifier of the DRBD write barrier packet. + * @set_size: Expected number of requests before that barrier. + * + * In case the passed barrier_nr or set_size does not match the oldest + * epoch of not yet barrier-acked requests, this function will cause a + * termination of the connection. + */ + void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr, + unsigned int set_size) + { + struct drbd_request *r; + struct drbd_request *req = NULL; + int expect_epoch = 0; + int expect_size = 0; + + spin_lock_irq(&tconn->req_lock); + - /* find latest not yet barrier-acked write request, ++ /* find oldest not yet barrier-acked write request, + * count writes in its epoch. 
*/ + list_for_each_entry(r, &tconn->transfer_log, tl_requests) { + const unsigned s = r->rq_state; + if (!req) { + if (!(s & RQ_WRITE)) + continue; + if (!(s & RQ_NET_MASK)) + continue; + if (s & RQ_NET_DONE) + continue; + req = r; + expect_epoch = req->epoch; + expect_size ++; + } else { + if (r->epoch != expect_epoch) + break; + if (!(s & RQ_WRITE)) + continue; + /* if (s & RQ_DONE): not expected */ + /* if (!(s & RQ_NET_MASK)): not expected */ + expect_size++; } } - if (what != nothing) { - spin_lock_irq(&mdev->req_lock); - _tl_restart(mdev, what); - nsm.i &= mdev->state.i; - _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL); - spin_unlock_irq(&mdev->req_lock); + /* first some paranoia code */ + if (req == NULL) { + conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", + barrier_nr); + goto bail; } - - /* Became sync source. With protocol >= 96, we still need to send out - * the sync uuid now. Need to do that before any drbd_send_state, or - * the other side may go "paused sync" before receiving the sync uuids, - * which is unexpected. */ - if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && - (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && - mdev->agreed_pro_version >= 96 && get_ldev(mdev)) { - drbd_gen_and_send_sync_uuid(mdev); - put_ldev(mdev); + if (expect_epoch != barrier_nr) { + conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n", + barrier_nr, expect_epoch); + goto bail; } - /* Do not change the order of the if above and the two below... */ - if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ - /* we probably will start a resync soon. - * make sure those things are properly reset. */ - mdev->rs_total = 0; - mdev->rs_failed = 0; - atomic_set(&mdev->rs_pending_cnt, 0); - drbd_rs_cancel_all(mdev); - - drbd_send_uuids(mdev); - drbd_send_state(mdev, ns); - } - /* No point in queuing send_bitmap if we don't have a connection - * anymore, so check also the _current_ state, not only the new state - * at the time this work was queued. */ - if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S && - mdev->state.conn == C_WF_BITMAP_S) - drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, - "send_bitmap (WFBitMapS)", - BM_LOCKED_TEST_ALLOWED); - - /* Lost contact to peer's copy of the data */ - if ((os.pdsk >= D_INCONSISTENT && - os.pdsk != D_UNKNOWN && - os.pdsk != D_OUTDATED) - && (ns.pdsk < D_INCONSISTENT || - ns.pdsk == D_UNKNOWN || - ns.pdsk == D_OUTDATED)) { - if (get_ldev(mdev)) { - if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && - mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { - if (is_susp(mdev->state)) { - drbd_set_flag(mdev, NEW_CUR_UUID); - } else { - drbd_uuid_new_current(mdev); - drbd_send_uuids(mdev); - } - } - put_ldev(mdev); - } + if (expect_size != set_size) { + conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", + barrier_nr, set_size, expect_size); + goto bail; } - if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { - if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && - mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { - drbd_uuid_new_current(mdev); - drbd_send_uuids(mdev); - } - /* D_DISKLESS Peer becomes secondary */ - if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) - /* We may still be Primary ourselves. - * No harm done if the bitmap still changes, - * redirtied pages will follow later. 
*/ - drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, - "demote diskless peer", BM_LOCKED_SET_ALLOWED); - put_ldev(mdev); - /* Clean up list of requests processed during current epoch */ - list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) { ++ /* Clean up list of requests processed during current epoch. */ ++ /* this extra list walk restart is paranoia, ++ * to catch requests being barrier-acked "unexpectedly". ++ * It usually should find the same req again, or some READ preceding it. */ ++ list_for_each_entry(req, &tconn->transfer_log, tl_requests) ++ if (req->epoch == expect_epoch) ++ break; ++ list_for_each_entry_safe_from(req, r, &tconn->transfer_log, tl_requests) { + if (req->epoch != expect_epoch) + break; + _req_mod(req, BARRIER_ACKED); } + spin_unlock_irq(&tconn->req_lock); - /* Write out all changed bits on demote. - * Though, no need to da that just yet - * if there is a resync going on still */ - if (os.role == R_PRIMARY && ns.role == R_SECONDARY && - mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { - /* No changes to the bitmap expected this time, so assert that, - * even though no harm was done if it did change. */ - drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, - "demote", BM_LOCKED_TEST_ALLOWED); - put_ldev(mdev); - } + return; - /* Last part of the attaching process ... */ - if (ns.conn >= C_CONNECTED && - os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { - drbd_send_sizes(mdev, 0, 0); /* to start sync... */ - drbd_send_uuids(mdev); - drbd_send_state(mdev, ns); - } + bail: + spin_unlock_irq(&tconn->req_lock); + conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); + } - /* We want to pause/continue resync, tell peer. */ - if (ns.conn >= C_CONNECTED && - ((os.aftr_isp != ns.aftr_isp) || - (os.user_isp != ns.user_isp))) - drbd_send_state(mdev, ns); - - /* In case one of the isp bits got set, suspend other devices. */ - if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && - (ns.aftr_isp || ns.peer_isp || ns.user_isp)) - suspend_other_sg(mdev); - - /* Make sure the peer gets informed about eventual state - changes (ISP bits) while we were in WFReportParams. */ - if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) - drbd_send_state(mdev, ns); - - if (os.conn != C_AHEAD && ns.conn == C_AHEAD) - drbd_send_state(mdev, ns); - - /* We are in the progress to start a full sync... */ - if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || - (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) - /* no other bitmap changes expected during this phase */ - drbd_queue_bitmap_io(mdev, - &drbd_bmio_set_n_write, &abw_start_sync, - "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); - - /* We are invalidating our self... */ - if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && - os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) - /* other bitmap operation expected during this phase */ - drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, - "set_n_write from invalidate", BM_LOCKED_MASK); - - /* first half of local IO error, failure to attach, - * or administrative detach */ - if (os.disk != D_FAILED && ns.disk == D_FAILED) { - /* corresponding get_ldev was in __drbd_set_state, to serialize - * our cleanup here with the transition to D_DISKLESS. - * But it is still not safe to dreference ldev here, we may end - * up here from a failed attach, before ldev was even set. 
*/ - if (mdev->ldev) { - enum drbd_io_error_p eh = mdev->ldev->dc.on_io_error; - - /* In some setups, this handler triggers a suicide, - * basically mapping IO error to node failure, to - * reduce the number of different failure scenarios. - * - * This handler intentionally runs before we abort IO, - * notify the peer, or try to update our meta data. */ - if (eh == EP_CALL_HELPER && drbd_test_flag(mdev, WAS_IO_ERROR)) - drbd_khelper(mdev, "local-io-error"); - - /* Immediately allow completion of all application IO, - * that waits for completion from the local disk, - * if this was a force-detach due to disk_timeout - * or administrator request (drbdsetup detach --force). - * Do NOT abort otherwise. - * Aborting local requests may cause serious problems, - * if requests are completed to upper layers already, - * and then later the already submitted local bio completes. - * This can cause DMA into former bio pages that meanwhile - * have been re-used for other things. - * So aborting local requests may cause crashes, - * or even worse, silent data corruption. - */ - if (drbd_test_flag(mdev, FORCE_DETACH)) - tl_abort_disk_io(mdev); - - /* current state still has to be D_FAILED, - * there is only one way out: to D_DISKLESS, - * and that may only happen after our put_ldev below. */ - if (mdev->state.disk != D_FAILED) - dev_err(DEV, - "ASSERT FAILED: disk is %s during detach\n", - drbd_disk_str(mdev->state.disk)); - - if (ns.conn >= C_CONNECTED) - drbd_send_state(mdev, ns); - - drbd_rs_cancel_all(mdev); - - /* In case we want to get something to stable storage still, - * this may be the last chance. - * Following put_ldev may transition to D_DISKLESS. */ - drbd_md_sync(mdev); - } - put_ldev(mdev); - } - /* second half of local IO error, failure to attach, - * or administrative detach, - * after local_cnt references have reached zero again */ - if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { - /* We must still be diskless, - * re-attach has to be serialized with this! */ - if (mdev->state.disk != D_DISKLESS) - dev_err(DEV, - "ASSERT FAILED: disk is %s while going diskless\n", - drbd_disk_str(mdev->state.disk)); - - if (ns.conn >= C_CONNECTED) - drbd_send_state(mdev, ns); - - /* corresponding get_ldev in __drbd_set_state - * this may finally trigger drbd_ldev_destroy. */ - put_ldev(mdev); - } + /** + * _tl_restart() - Walks the transfer log, and applies an action to all requests + * @mdev: DRBD device. + * @what: The action/event to perform with all request objects + * + * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO, + * RESTART_FROZEN_DISK_IO. + */ + /* must hold resource->req_lock */ + void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what) + { + struct drbd_request *req, *r; - /* Notify peer that I had a local IO error, and did not detached.. */ - if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) - drbd_send_state(mdev, ns); + list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) + _req_mod(req, what); + } - /* Disks got bigger while they were detached */ - if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && - drbd_test_and_clear_flag(mdev, RESYNC_AFTER_NEG)) { - if (ns.conn == C_CONNECTED) - resync_after_online_grow(mdev); - } + void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what) + { + spin_lock_irq(&tconn->req_lock); + _tl_restart(tconn, what); + spin_unlock_irq(&tconn->req_lock); + } - /* A resync finished or aborted, wake paused devices... 
*/ - if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || - (os.peer_isp && !ns.peer_isp) || - (os.user_isp && !ns.user_isp)) - resume_next_sg(mdev); - - /* sync target done with resync. Explicitly notify peer, even though - * it should (at least for non-empty resyncs) already know itself. */ - if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) - drbd_send_state(mdev, ns); - - /* Verify finished, or reached stop sector. Peer did not know about - * the stop sector, and we may even have changed the stop sector during - * verify to interrupt/stop early. Send the new state. */ - if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED - && mdev->agreed_pro_version >= 97) - drbd_send_state(mdev, ns); - - /* Wake up role changes, that were delayed because of connection establishing */ - if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) { - drbd_clear_flag(mdev, STATE_SENT); - wake_up(&mdev->state_wait); - } + /** + * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL + * @mdev: DRBD device. + * + * This is called after the connection to the peer was lost. The storage covered + * by the requests on the transfer gets marked as our of sync. Called from the + * receiver thread and the worker thread. + */ + void tl_clear(struct drbd_tconn *tconn) + { + tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING); + } - /* This triggers bitmap writeout of potentially still unwritten pages - * if the resync finished cleanly, or aborted because of peer disk - * failure, or because of connection loss. - * For resync aborted because of local disk failure, we cannot do - * any bitmap writeout anymore. - * No harm done if some bits change during this phase. - */ - if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { - drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL, - "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); - put_ldev(mdev); - } + /** + * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL + * @mdev: DRBD device. + */ + void tl_abort_disk_io(struct drbd_conf *mdev) + { + struct drbd_tconn *tconn = mdev->tconn; + struct drbd_request *req, *r; - /* free tl_hash if we Got thawed and are C_STANDALONE */ - if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash) - drbd_free_tl_hash(mdev); - - /* Upon network connection, we need to start the receiver */ - if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) - drbd_thread_start(&mdev->receiver); - - /* Terminate worker thread if we are unconfigured - it will be - restarted as needed... 
*/ - if (ns.disk == D_DISKLESS && - ns.conn == C_STANDALONE && - ns.role == R_SECONDARY) { - if (os.aftr_isp != ns.aftr_isp) - resume_next_sg(mdev); - /* set in __drbd_set_state, unless CONFIG_PENDING was set */ - if (drbd_test_flag(mdev, DEVICE_DYING)) - drbd_thread_stop_nowait(&mdev->worker); + spin_lock_irq(&tconn->req_lock); + list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) { + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + if (req->w.mdev != mdev) + continue; + _req_mod(req, ABORT_DISK_IO); } - - drbd_md_sync(mdev); + spin_unlock_irq(&tconn->req_lock); } - static int drbd_thread_setup(void *arg) { struct drbd_thread *thi = (struct drbd_thread *) arg; @@@ -2209,19 -911,20 +911,21 @@@ void drbd_gen_and_send_sync_uuid(struc int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) { - struct p_sizes p; + struct drbd_socket *sock; + struct p_sizes *p; sector_t d_size, u_size; - int q_order_type, max_bio_size; + int q_order_type; + unsigned int max_bio_size; - int ok; if (get_ldev_if_state(mdev, D_NEGOTIATING)) { D_ASSERT(mdev->ldev->backing_bdev); d_size = drbd_get_max_capacity(mdev->ldev); - u_size = mdev->ldev->dc.disk_size; + rcu_read_lock(); + u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; + rcu_read_unlock(); q_order_type = drbd_queue_order_type(mdev); max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; - max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE); + max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); put_ldev(mdev); } else { d_size = 0; @@@ -2230,20 -933,23 +934,23 @@@ max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ } - /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */ - if (mdev->agreed_pro_version <= 94) - max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + sock = &mdev->tconn->data; + p = drbd_prepare_command(mdev, sock); + if (!p) + return -EIO; - p.d_size = cpu_to_be64(d_size); - p.u_size = cpu_to_be64(u_size); - p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); - p.max_bio_size = cpu_to_be32(max_bio_size); - p.queue_order_type = cpu_to_be16(q_order_type); - p.dds_flags = cpu_to_be16(flags); + if (mdev->tconn->agreed_pro_version <= 94) - max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET); ++ max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + else if (mdev->tconn->agreed_pro_version < 100) - max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE_P95); ++ max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95); - ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, - (struct p_header80 *)&p, sizeof(p)); - return ok; + p->d_size = cpu_to_be64(d_size); + p->u_size = cpu_to_be64(u_size); + p->c_size = cpu_to_be64(trigger_reply ? 
0 : drbd_get_capacity(mdev->this_bdev)); + p->max_bio_size = cpu_to_be32(max_bio_size); + p->queue_order_type = cpu_to_be16(q_order_type); + p->dds_flags = cpu_to_be16(flags); + return drbd_send_command(mdev, sock, P_SIZES, sizeof(*p), NULL, 0); } /** @@@ -3980,20 -2989,16 +2990,16 @@@ int drbd_md_read(struct drbd_conf *mdev for (i = UI_CURRENT; i < UI_SIZE; i++) bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); bdev->md.flags = be32_to_cpu(buffer->flags); - mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); - spin_lock_irq(&mdev->req_lock); + spin_lock_irq(&mdev->tconn->req_lock); if (mdev->state.conn < C_CONNECTED) { - int peer; + unsigned int peer; peer = be32_to_cpu(buffer->la_peer_max_bio_size); - peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE); + peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE); mdev->peer_max_bio_size = peer; } - spin_unlock_irq(&mdev->req_lock); - - if (mdev->sync_conf.al_extents < 7) - mdev->sync_conf.al_extents = 127; + spin_unlock_irq(&mdev->tconn->req_lock); err: drbd_md_put_buffer(mdev); diff --cc drivers/block/drbd/drbd_nl.c index c8dda4e8dfc,d339a2754a8..76bb3a684b8 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@@ -36,103 -35,283 +35,282 @@@ #include "drbd_req.h" #include "drbd_wrappers.h" #include - #include #include - #include #include - static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int); - static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *); - static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *); - - /* see get_sb_bdev and bd_claim */ + #include + + /* .doit */ + // int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info); + // int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info); + + int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info); + + int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_down(struct sk_buff *skb, struct genl_info *info); + + int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info); + int drbd_adm_get_status(struct sk_buff *skb, 
struct genl_info *info); + int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); + /* .dumpit */ + int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); + + #include + #include "drbd_nla.h" + #include + + /* used blkdev_get_by_path, to claim our meta data device(s) */ static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; - /* Generate the tag_list to struct functions */ - #define NL_PACKET(name, number, fields) \ - static int name ## _from_tags(struct drbd_conf *mdev, \ - unsigned short *tags, struct name *arg) __attribute__ ((unused)); \ - static int name ## _from_tags(struct drbd_conf *mdev, \ - unsigned short *tags, struct name *arg) \ - { \ - int tag; \ - int dlen; \ - \ - while ((tag = get_unaligned(tags++)) != TT_END) { \ - dlen = get_unaligned(tags++); \ - switch (tag_number(tag)) { \ - fields \ - default: \ - if (tag & T_MANDATORY) { \ - dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \ - return 0; \ - } \ - } \ - tags = (unsigned short *)((char *)tags + dlen); \ - } \ - return 1; \ - } - #define NL_INTEGER(pn, pr, member) \ - case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \ - arg->member = get_unaligned((int *)(tags)); \ - break; - #define NL_INT64(pn, pr, member) \ - case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \ - arg->member = get_unaligned((u64 *)(tags)); \ + /* Configuration is strictly serialized, because generic netlink message + * processing is strictly serialized by the genl_lock(). + * Which means we can use one static global drbd_config_context struct. + */ + static struct drbd_config_context { + /* assigned from drbd_genlmsghdr */ + unsigned int minor; + /* assigned from request attributes, if present */ + unsigned int volume; + #define VOLUME_UNSPECIFIED (-1U) + /* pointer into the request skb, + * limited lifetime! */ + char *resource_name; + struct nlattr *my_addr; + struct nlattr *peer_addr; + + /* reply buffer */ + struct sk_buff *reply_skb; + /* pointer into reply buffer */ + struct drbd_genlmsghdr *reply_dh; + /* resolved from attributes, if possible */ + struct drbd_conf *mdev; + struct drbd_tconn *tconn; + } adm_ctx; + + static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) + { + genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); + if (genlmsg_reply(skb, info)) + printk(KERN_ERR "drbd: error sending genl reply\n"); + } + + /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only + * reason it could fail was no space in skb, and there are 4k available. */ + int drbd_msg_put_info(const char *info) + { + struct sk_buff *skb = adm_ctx.reply_skb; + struct nlattr *nla; + int err = -EMSGSIZE; + + if (!info || !info[0]) + return 0; + + nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY); + if (!nla) + return err; + + err = nla_put_string(skb, T_info_text, info); + if (err) { + nla_nest_cancel(skb, nla); + return err; + } else + nla_nest_end(skb, nla); + return 0; + } + + /* This would be a good candidate for a "pre_doit" hook, + * and per-family private info->pointers. + * But we need to stay compatible with older kernels. + * If it returns successfully, adm_ctx members are valid. 
+ */ + #define DRBD_ADM_NEED_MINOR 1 + #define DRBD_ADM_NEED_RESOURCE 2 + #define DRBD_ADM_NEED_CONNECTION 4 + static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info, + unsigned flags) + { + struct drbd_genlmsghdr *d_in = info->userhdr; + const u8 cmd = info->genlhdr->cmd; + int err; + + memset(&adm_ctx, 0, sizeof(adm_ctx)); + + /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ - if (cmd != DRBD_ADM_GET_STATUS - && security_netlink_recv(skb, CAP_SYS_ADMIN)) ++ if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) + return -EPERM; + + adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!adm_ctx.reply_skb) { + err = -ENOMEM; + goto fail; + } + + adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb, + info, &drbd_genl_family, 0, cmd); + /* put of a few bytes into a fresh skb of >= 4k will always succeed. + * but anyways */ + if (!adm_ctx.reply_dh) { + err = -ENOMEM; + goto fail; + } + + adm_ctx.reply_dh->minor = d_in->minor; + adm_ctx.reply_dh->ret_code = NO_ERROR; + + adm_ctx.volume = VOLUME_UNSPECIFIED; + if (info->attrs[DRBD_NLA_CFG_CONTEXT]) { + struct nlattr *nla; + /* parse and validate only */ + err = drbd_cfg_context_from_attrs(NULL, info); + if (err) + goto fail; + + /* It was present, and valid, + * copy it over to the reply skb. */ + err = nla_put_nohdr(adm_ctx.reply_skb, + info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len, + info->attrs[DRBD_NLA_CFG_CONTEXT]); + if (err) + goto fail; + + /* and assign stuff to the global adm_ctx */ + nla = nested_attr_tb[__nla_type(T_ctx_volume)]; + if (nla) + adm_ctx.volume = nla_get_u32(nla); + nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; + if (nla) + adm_ctx.resource_name = nla_data(nla); + adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; + adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; + if ((adm_ctx.my_addr && + nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.tconn->my_addr)) || + (adm_ctx.peer_addr && + nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.tconn->peer_addr))) { + err = -EINVAL; + goto fail; + } + } + + adm_ctx.minor = d_in->minor; + adm_ctx.mdev = minor_to_mdev(d_in->minor); + adm_ctx.tconn = conn_get_by_name(adm_ctx.resource_name); + + if (!adm_ctx.mdev && (flags & DRBD_ADM_NEED_MINOR)) { + drbd_msg_put_info("unknown minor"); + return ERR_MINOR_INVALID; + } + if (!adm_ctx.tconn && (flags & DRBD_ADM_NEED_RESOURCE)) { + drbd_msg_put_info("unknown resource"); + return ERR_INVALID_REQUEST; + } + + if (flags & DRBD_ADM_NEED_CONNECTION) { + if (adm_ctx.tconn && !(flags & DRBD_ADM_NEED_RESOURCE)) { + drbd_msg_put_info("no resource name expected"); + return ERR_INVALID_REQUEST; + } + if (adm_ctx.mdev) { + drbd_msg_put_info("no minor number expected"); + return ERR_INVALID_REQUEST; + } + if (adm_ctx.my_addr && adm_ctx.peer_addr) + adm_ctx.tconn = conn_get_by_addrs(nla_data(adm_ctx.my_addr), + nla_len(adm_ctx.my_addr), + nla_data(adm_ctx.peer_addr), + nla_len(adm_ctx.peer_addr)); + if (!adm_ctx.tconn) { + drbd_msg_put_info("unknown connection"); + return ERR_INVALID_REQUEST; + } + } + + /* some more paranoia, if the request was over-determined */ + if (adm_ctx.mdev && adm_ctx.tconn && + adm_ctx.mdev->tconn != adm_ctx.tconn) { + pr_warning("request: minor=%u, resource=%s; but that minor belongs to connection %s\n", + adm_ctx.minor, adm_ctx.resource_name, + adm_ctx.mdev->tconn->name); + drbd_msg_put_info("minor exists in different resource"); + return ERR_INVALID_REQUEST; + } + if (adm_ctx.mdev && + adm_ctx.volume != VOLUME_UNSPECIFIED && + adm_ctx.volume != 
adm_ctx.mdev->vnr) { + pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", + adm_ctx.minor, adm_ctx.volume, + adm_ctx.mdev->vnr, adm_ctx.mdev->tconn->name); + drbd_msg_put_info("minor exists as different volume"); + return ERR_INVALID_REQUEST; + } + + return NO_ERROR; + + fail: + nlmsg_free(adm_ctx.reply_skb); + adm_ctx.reply_skb = NULL; + return err; + } + + static int drbd_adm_finish(struct genl_info *info, int retcode) + { + if (adm_ctx.tconn) { + kref_put(&adm_ctx.tconn->kref, &conn_destroy); + adm_ctx.tconn = NULL; + } + + if (!adm_ctx.reply_skb) + return -ENOMEM; + + adm_ctx.reply_dh->ret_code = retcode; + drbd_adm_send_reply(adm_ctx.reply_skb, info); + return 0; + } + + static void setup_khelper_env(struct drbd_tconn *tconn, char **envp) + { + char *afs; + + /* FIXME: A future version will not allow this case. */ + if (tconn->my_addr_len == 0 || tconn->peer_addr_len == 0) + return; + + switch (((struct sockaddr *)&tconn->peer_addr)->sa_family) { + case AF_INET6: + afs = "ipv6"; + snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6", + &((struct sockaddr_in6 *)&tconn->peer_addr)->sin6_addr); break; - #define NL_BIT(pn, pr, member) \ - case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \ - arg->member = *(char *)(tags) ? 1 : 0; \ + case AF_INET: + afs = "ipv4"; + snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr); break; - #define NL_STRING(pn, pr, member, len) \ - case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \ - if (dlen > len) { \ - dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \ - #member, dlen, (unsigned int)len); \ - return 0; \ - } \ - arg->member ## _len = dlen; \ - memcpy(arg->member, tags, min_t(size_t, dlen, len)); \ - break; - #include - - /* Generate the struct to tag_list functions */ - #define NL_PACKET(name, number, fields) \ - static unsigned short* \ - name ## _to_tags(struct drbd_conf *mdev, \ - struct name *arg, unsigned short *tags) __attribute__ ((unused)); \ - static unsigned short* \ - name ## _to_tags(struct drbd_conf *mdev, \ - struct name *arg, unsigned short *tags) \ - { \ - fields \ - return tags; \ - } - - #define NL_INTEGER(pn, pr, member) \ - put_unaligned(pn | pr | TT_INTEGER, tags++); \ - put_unaligned(sizeof(int), tags++); \ - put_unaligned(arg->member, (int *)tags); \ - tags = (unsigned short *)((char *)tags+sizeof(int)); - #define NL_INT64(pn, pr, member) \ - put_unaligned(pn | pr | TT_INT64, tags++); \ - put_unaligned(sizeof(u64), tags++); \ - put_unaligned(arg->member, (u64 *)tags); \ - tags = (unsigned short *)((char *)tags+sizeof(u64)); - #define NL_BIT(pn, pr, member) \ - put_unaligned(pn | pr | TT_BIT, tags++); \ - put_unaligned(sizeof(char), tags++); \ - *(char *)tags = arg->member; \ - tags = (unsigned short *)((char *)tags+sizeof(char)); - #define NL_STRING(pn, pr, member, len) \ - put_unaligned(pn | pr | TT_STRING, tags++); \ - put_unaligned(arg->member ## _len, tags++); \ - memcpy(tags, arg->member, arg->member ## _len); \ - tags = (unsigned short *)((char *)tags + arg->member ## _len); - #include - - void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name); - void drbd_nl_send_reply(struct cn_msg *, int); + default: + afs = "ssocks"; + snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr); + } + snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs); + } int drbd_khelper(struct drbd_conf *mdev, char *cmd) { @@@ -180,9 -338,10 +337,10 @@@ drbd_md_sync(mdev); dev_info(DEV, "helper 
command: %s %s %s\n", usermode_helper, cmd, mb); - - drbd_bcast_ev_helper(mdev, cmd); + sib.sib_reason = SIB_HELPER_PRE; + sib.helper_name = cmd; + drbd_bcast_event(mdev, &sib); - ret = call_usermodehelper(usermode_helper, argv, envp, 1); + ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); if (ret) dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", usermode_helper, cmd, mb, @@@ -191,9 -350,46 +349,46 @@@ dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", usermode_helper, cmd, mb, (ret >> 8) & 0xff, ret); + sib.sib_reason = SIB_HELPER_POST; + sib.helper_exit_code = ret; + drbd_bcast_event(mdev, &sib); + + if (current == tconn->worker.task) + clear_bit(CALLBACK_PENDING, &tconn->flags); - if (current == mdev->worker.task) - drbd_clear_flag(mdev, CALLBACK_PENDING); + if (ret < 0) /* Ignore any ERRNOs we got. */ + ret = 0; + + return ret; + } + + int conn_khelper(struct drbd_tconn *tconn, char *cmd) + { + char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + (char[20]) { }, /* address family */ + (char[60]) { }, /* address */ + NULL }; + char *argv[] = {usermode_helper, cmd, tconn->name, NULL }; + int ret; + + setup_khelper_env(tconn, envp); + conn_md_sync(tconn); + + conn_info(tconn, "helper command: %s %s %s\n", usermode_helper, cmd, tconn->name); + /* TODO: conn_bcast_event() ?? */ + - ret = call_usermodehelper(usermode_helper, argv, envp, 1); ++ ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); + if (ret) + conn_warn(tconn, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, tconn->name, + (ret >> 8) & 0xff, ret); + else + conn_info(tconn, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, tconn->name, + (ret >> 8) & 0xff, ret); + /* TODO: conn_bcast_event() ?? */ if (ret < 0) /* Ignore any ERRNOs we got. */ ret = 0; @@@ -852,12 -1054,14 +1054,14 @@@ void drbd_reconsider_max_bio_size(struc Because new from 8.3.8 onwards the peer can use multiple BIOs for a single peer_request */ if (mdev->state.conn >= C_CONNECTED) { - if (mdev->agreed_pro_version < 94) { - peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + if (mdev->tconn->agreed_pro_version < 94) - peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); ++ peer = min( mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ - } else if (mdev->agreed_pro_version == 94) + else if (mdev->tconn->agreed_pro_version == 94) peer = DRBD_MAX_SIZE_H80_PACKET; - else /* drbd 8.3.8 onwards */ + else if (mdev->tconn->agreed_pro_version < 100) + peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ + else peer = DRBD_MAX_BIO_SIZE; } @@@ -1383,170 -1706,144 +1706,141 @@@ static int adm_detach(struct drbd_conf retcode = SS_NOTHING_TO_DO; if (ret) retcode = ERR_INTR; - reply->ret_code = retcode; out: - return 0; + return retcode; } - static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) + /* Detaching the disk is a process in multiple stages. First we need to lock + * out application IO, in-flight IO, IO stuck in drbd_al_begin_io. + * Then we transition to D_DISKLESS, and wait for put_ldev() to return all + * internal references as well. + * Only then we have finally detached. 
*/ + int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) { - int i, ns; enum drbd_ret_code retcode; - struct net_conf *new_conf = NULL; - struct crypto_hash *tfm = NULL; - struct crypto_hash *integrity_w_tfm = NULL; - struct crypto_hash *integrity_r_tfm = NULL; - struct hlist_head *new_tl_hash = NULL; - struct hlist_head *new_ee_hash = NULL; - struct drbd_conf *odev; - char hmac_name[CRYPTO_MAX_ALG_NAME]; - void *int_dig_out = NULL; - void *int_dig_in = NULL; - void *int_dig_vv = NULL; - struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr; + struct detach_parms parms = { }; + int err; - drbd_reconfig_start(mdev); + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; - if (mdev->state.conn > C_STANDALONE) { - retcode = ERR_NET_CONFIGURED; - goto fail; + if (info->attrs[DRBD_NLA_DETACH_PARMS]) { + err = detach_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(from_attrs_err_to_txt(err)); + goto out; + } } - /* allocation not in the IO path, cqueue thread context */ - new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); - if (!new_conf) { - retcode = ERR_NOMEM; - goto fail; - } + retcode = adm_detach(adm_ctx.mdev, parms.force_detach); + out: + drbd_adm_finish(info, retcode); + return 0; + } - new_conf->timeout = DRBD_TIMEOUT_DEF; - new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; - new_conf->ping_int = DRBD_PING_INT_DEF; - new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF; - new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF; - new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF; - new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF; - new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF; - new_conf->ko_count = DRBD_KO_COUNT_DEF; - new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF; - new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF; - new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF; - new_conf->want_lose = 0; - new_conf->two_primaries = 0; - new_conf->wire_protocol = DRBD_PROT_C; - new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; - new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; - new_conf->on_congestion = DRBD_ON_CONGESTION_DEF; - new_conf->cong_extents = DRBD_CONG_EXTENTS_DEF; - - if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) { - retcode = ERR_MANDATORY_TAG; - goto fail; + static bool conn_resync_running(struct drbd_tconn *tconn) + { + struct drbd_conf *mdev; + bool rv = false; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + if (mdev->state.conn == C_SYNC_SOURCE || + mdev->state.conn == C_SYNC_TARGET || + mdev->state.conn == C_PAUSED_SYNC_S || + mdev->state.conn == C_PAUSED_SYNC_T) { + rv = true; + break; + } } + rcu_read_unlock(); - if (new_conf->two_primaries - && (new_conf->wire_protocol != DRBD_PROT_C)) { - retcode = ERR_NOT_PROTO_C; - goto fail; - } + return rv; + } - if (get_ldev(mdev)) { - enum drbd_fencing_p fp = mdev->ldev->dc.fencing; - put_ldev(mdev); - if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) { - retcode = ERR_STONITH_AND_PROT_A; - goto fail; + static bool conn_ov_running(struct drbd_tconn *tconn) + { + struct drbd_conf *mdev; + bool rv = false; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + if (mdev->state.conn == C_VERIFY_S || + mdev->state.conn == C_VERIFY_T) { + rv = true; + break; } } + rcu_read_unlock(); - if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) { - retcode = ERR_CONG_NOT_PROTO_A; - goto fail; - } + 
return rv; + } - if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { - retcode = ERR_DISCARD; - goto fail; - } + static enum drbd_ret_code + _check_net_options(struct drbd_tconn *tconn, struct net_conf *old_conf, struct net_conf *new_conf) + { + struct drbd_conf *mdev; + int i; - retcode = NO_ERROR; + if (old_conf && tconn->cstate == C_WF_REPORT_PARAMS && tconn->agreed_pro_version < 100) { + if (new_conf->wire_protocol != old_conf->wire_protocol) + return ERR_NEED_APV_100; - new_my_addr = (struct sockaddr *)&new_conf->my_addr; - new_peer_addr = (struct sockaddr *)&new_conf->peer_addr; - for (i = 0; i < minor_count; i++) { - odev = minor_to_mdev(i); - if (!odev || odev == mdev) - continue; - if (get_net_conf(odev)) { - taken_addr = (struct sockaddr *)&odev->net_conf->my_addr; - if (new_conf->my_addr_len == odev->net_conf->my_addr_len && - !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len)) - retcode = ERR_LOCAL_ADDR; - - taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr; - if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len && - !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len)) - retcode = ERR_PEER_ADDR; - - put_net_conf(odev); - if (retcode != NO_ERROR) - goto fail; - } + if (new_conf->two_primaries != old_conf->two_primaries) + return ERR_NEED_APV_100; + - if (!new_conf->integrity_alg != !old_conf->integrity_alg) - return ERR_NEED_APV_100; - + if (strcmp(new_conf->integrity_alg, old_conf->integrity_alg)) + return ERR_NEED_APV_100; } - if (new_conf->cram_hmac_alg[0] != 0) { - snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", - new_conf->cram_hmac_alg); - tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) { - tfm = NULL; - retcode = ERR_AUTH_ALG; - goto fail; - } + if (!new_conf->two_primaries && + conn_highest_role(tconn) == R_PRIMARY && + conn_highest_peer(tconn) == R_PRIMARY) + return ERR_NEED_ALLOW_TWO_PRI; - if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { - retcode = ERR_AUTH_ALG_ND; - goto fail; + if (new_conf->two_primaries && + (new_conf->wire_protocol != DRBD_PROT_C)) + return ERR_NOT_PROTO_C; + + idr_for_each_entry(&tconn->volumes, mdev, i) { + if (get_ldev(mdev)) { + enum drbd_fencing_p fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; + put_ldev(mdev); + if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) + return ERR_STONITH_AND_PROT_A; } + if (mdev->state.role == R_PRIMARY && new_conf->discard_my_data) + return ERR_DISCARD_IMPOSSIBLE; } - if (new_conf->integrity_alg[0]) { - integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(integrity_w_tfm)) { - integrity_w_tfm = NULL; - retcode=ERR_INTEGRITY_ALG; - goto fail; - } + if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) + return ERR_CONG_NOT_PROTO_A; - if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) { - retcode=ERR_INTEGRITY_ALG_ND; - goto fail; - } + return NO_ERROR; + } - integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(integrity_r_tfm)) { - integrity_r_tfm = NULL; - retcode=ERR_INTEGRITY_ALG; - goto fail; - } - } + static enum drbd_ret_code + check_net_options(struct drbd_tconn *tconn, struct net_conf *new_conf) + { + static enum drbd_ret_code rv; + struct drbd_conf *mdev; + int i; - ns = new_conf->max_epoch_size/8; - if (mdev->tl_hash_s != ns) { - new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); - if (!new_tl_hash) { - retcode = ERR_NOMEM; - goto fail; - } - } + rcu_read_lock(); + rv = 
_check_net_options(tconn, rcu_dereference(tconn->net_conf), new_conf); + rcu_read_unlock(); - ns = new_conf->max_buffers/8; - if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) { - new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); - if (!new_ee_hash) { - retcode = ERR_NOMEM; - goto fail; + /* tconn->volumes protected by genl_lock() here */ + idr_for_each_entry(&tconn->volumes, mdev, i) { + if (!mdev->bitmap) { + if(drbd_bm_init(mdev)) + return ERR_NOMEM; } } @@@ -2113,115 -2574,402 +2571,402 @@@ out return 0; } - static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) + int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info) { - reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED)); - return 0; + return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED)); } - static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) + int nla_put_drbd_cfg_context(struct sk_buff *skb, struct drbd_tconn *tconn, unsigned vnr) { - unsigned short *tl; + struct nlattr *nla; + nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT); + if (!nla) + goto nla_put_failure; + if (vnr != VOLUME_UNSPECIFIED && + nla_put_u32(skb, T_ctx_volume, vnr)) + goto nla_put_failure; + if (nla_put_string(skb, T_ctx_resource_name, tconn->name)) + goto nla_put_failure; + if (tconn->my_addr_len && + nla_put(skb, T_ctx_my_addr, tconn->my_addr_len, &tconn->my_addr)) + goto nla_put_failure; + if (tconn->peer_addr_len && + nla_put(skb, T_ctx_peer_addr, tconn->peer_addr_len, &tconn->peer_addr)) + goto nla_put_failure; + nla_nest_end(skb, nla); + return 0; - tl = reply->tag_list; + nla_put_failure: + if (nla) + nla_nest_cancel(skb, nla); + return -EMSGSIZE; + } - if (get_ldev(mdev)) { - tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl); - put_ldev(mdev); - } + int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev, + const struct sib_info *sib) + { + struct state_info *si = NULL; /* for sizeof(si->member); */ + struct net_conf *nc; + struct nlattr *nla; + int got_ldev; + int err = 0; + int exclude_sensitive; + + /* If sib != NULL, this is drbd_bcast_event, which anyone can listen + * to. So we better exclude_sensitive information. + * + * If sib == NULL, this is drbd_adm_get_status, executed synchronously + * in the context of the requesting user process. Exclude sensitive + * information, unless current has superuser. + * + * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and + * relies on the current implementation of netlink_dump(), which + * executes the dump callback successively from netlink_recvmsg(), + * always in the context of the receiving process */ + exclude_sensitive = sib || !capable(CAP_SYS_ADMIN); + + got_ldev = get_ldev(mdev); + + /* We need to add connection name and volume number information still. + * Minor number is in drbd_genlmsghdr. 
*/ + if (nla_put_drbd_cfg_context(skb, mdev->tconn, mdev->vnr)) + goto nla_put_failure; + + if (res_opts_to_skb(skb, &mdev->tconn->res_opts, exclude_sensitive)) + goto nla_put_failure; + + rcu_read_lock(); + if (got_ldev) + if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive)) + goto nla_put_failure; + + nc = rcu_dereference(mdev->tconn->net_conf); + if (nc) + err = net_conf_to_skb(skb, nc, exclude_sensitive); + rcu_read_unlock(); + if (err) + goto nla_put_failure; + + nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO); + if (!nla) + goto nla_put_failure; + if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) || + nla_put_u32(skb, T_current_state, mdev->state.i) || + nla_put_u64(skb, T_ed_uuid, mdev->ed_uuid) || + nla_put_u64(skb, T_capacity, drbd_get_capacity(mdev->this_bdev)) || + nla_put_u64(skb, T_send_cnt, mdev->send_cnt) || + nla_put_u64(skb, T_recv_cnt, mdev->recv_cnt) || + nla_put_u64(skb, T_read_cnt, mdev->read_cnt) || + nla_put_u64(skb, T_writ_cnt, mdev->writ_cnt) || + nla_put_u64(skb, T_al_writ_cnt, mdev->al_writ_cnt) || + nla_put_u64(skb, T_bm_writ_cnt, mdev->bm_writ_cnt) || + nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&mdev->ap_bio_cnt)) || + nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&mdev->ap_pending_cnt)) || + nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&mdev->rs_pending_cnt))) + goto nla_put_failure; + + if (got_ldev) { + int err; - if (get_net_conf(mdev)) { - tl = net_conf_to_tags(mdev, mdev->net_conf, tl); - put_net_conf(mdev); + spin_lock_irq(&mdev->ldev->md.uuid_lock); + err = nla_put(skb, T_uuids, sizeof(si->uuids), mdev->ldev->md.uuid); + spin_unlock_irq(&mdev->ldev->md.uuid_lock); + + if (err) + goto nla_put_failure; + + if (nla_put_u32(skb, T_disk_flags, mdev->ldev->md.flags) || + nla_put_u64(skb, T_bits_total, drbd_bm_bits(mdev)) || + nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(mdev))) + goto nla_put_failure; + if (C_SYNC_SOURCE <= mdev->state.conn && + C_PAUSED_SYNC_T >= mdev->state.conn) { + if (nla_put_u64(skb, T_bits_rs_total, mdev->rs_total) || + nla_put_u64(skb, T_bits_rs_failed, mdev->rs_failed)) + goto nla_put_failure; + } } - tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl); - put_unaligned(TT_END, tl++); /* Close the tag list */ + if (sib) { + switch(sib->sib_reason) { + case SIB_SYNC_PROGRESS: + case SIB_GET_STATUS_REPLY: + break; + case SIB_STATE_CHANGE: + if (nla_put_u32(skb, T_prev_state, sib->os.i) || + nla_put_u32(skb, T_new_state, sib->ns.i)) + goto nla_put_failure; + break; + case SIB_HELPER_POST: + if (nla_put_u32(skb, T_helper_exit_code, + sib->helper_exit_code)) + goto nla_put_failure; + /* fall through */ + case SIB_HELPER_PRE: + if (nla_put_string(skb, T_helper, sib->helper_name)) + goto nla_put_failure; + break; + } + } + nla_nest_end(skb, nla); - return (int)((char *)tl - (char *)reply->tag_list); + if (0) + nla_put_failure: + err = -EMSGSIZE; + if (got_ldev) + put_ldev(mdev); + return err; } - static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) + int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) { - unsigned short *tl = reply->tag_list; - union drbd_state s = mdev->state; - unsigned long rs_left; - unsigned int res; + enum drbd_ret_code retcode; + int err; - tl = get_state_to_tags(mdev, (struct get_state *)&s, tl); + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; - /* no local ref, no 
bitmap, no syncer progress. */ - if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) { - if (get_ldev(mdev)) { - drbd_get_syncer_progress(mdev, &rs_left, &res); - tl = tl_add_int(tl, T_sync_progress, &res); - put_ldev(mdev); - } + err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.mdev, NULL); + if (err) { + nlmsg_free(adm_ctx.reply_skb); + return err; } - put_unaligned(TT_END, tl++); /* Close the tag list */ - - return (int)((char *)tl - (char *)reply->tag_list); + out: + drbd_adm_finish(info, retcode); + return 0; } - static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) + int get_one_status(struct sk_buff *skb, struct netlink_callback *cb) { - unsigned short *tl; - - tl = reply->tag_list; + struct drbd_conf *mdev; + struct drbd_genlmsghdr *dh; + struct drbd_tconn *pos = (struct drbd_tconn*)cb->args[0]; + struct drbd_tconn *tconn = NULL; + struct drbd_tconn *tmp; + unsigned volume = cb->args[1]; + + /* Open coded, deferred, iteration: + * list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) { + * idr_for_each_entry(&tconn->volumes, mdev, i) { + * ... + * } + * } + * where tconn is cb->args[0]; + * and i is cb->args[1]; + * + * cb->args[2] indicates if we shall loop over all resources, + * or just dump all volumes of a single resource. + * + * This may miss entries inserted after this dump started, + * or entries deleted before they are reached. + * + * We need to make sure the mdev won't disappear while + * we are looking at it, and revalidate our iterators + * on each iteration. + */ - if (get_ldev(mdev)) { - unsigned long flags; - spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags); - tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64)); - tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags); - spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags); - put_ldev(mdev); + /* synchronize with conn_create()/conn_destroy() */ + rcu_read_lock(); + /* revalidate iterator position */ + list_for_each_entry_rcu(tmp, &drbd_tconns, all_tconn) { + if (pos == NULL) { + /* first iteration */ + pos = tmp; + tconn = pos; + break; + } + if (tmp == pos) { + tconn = pos; + break; + } } - put_unaligned(TT_END, tl++); /* Close the tag list */ + if (tconn) { + next_tconn: + mdev = idr_get_next(&tconn->volumes, &volume); + if (!mdev) { + /* No more volumes to dump on this tconn. + * Advance tconn iterator. */ + pos = list_entry_rcu(tconn->all_tconn.next, + struct drbd_tconn, all_tconn); + /* Did we dump any volume on this tconn yet? */ + if (volume != 0) { + /* If we reached the end of the list, + * or only a single resource dump was requested, + * we are done. */ + if (&pos->all_tconn == &drbd_tconns || cb->args[2]) + goto out; + volume = 0; + tconn = pos; + goto next_tconn; + } + } + - dh = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, ++ dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_STATUS); + if (!dh) + goto out; + + if (!mdev) { + /* This is a tconn without a single volume. + * Suprisingly enough, it may have a network + * configuration. 
*/ + struct net_conf *nc; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, tconn, VOLUME_UNSPECIFIED)) + goto cancel; + nc = rcu_dereference(tconn->net_conf); + if (nc && net_conf_to_skb(skb, nc, 1) != 0) + goto cancel; + goto done; + } - return (int)((char *)tl - (char *)reply->tag_list); + D_ASSERT(mdev->vnr == volume); + D_ASSERT(mdev->tconn == tconn); + + dh->minor = mdev_to_minor(mdev); + dh->ret_code = NO_ERROR; + + if (nla_put_status_info(skb, mdev, NULL)) { + cancel: + genlmsg_cancel(skb, dh); + goto out; + } + done: + genlmsg_end(skb, dh); + } + + out: + rcu_read_unlock(); + /* where to start the next iteration */ + cb->args[0] = (long)pos; + cb->args[1] = (pos == tconn) ? volume + 1 : 0; + + /* No more tconns/volumes/minors found results in an empty skb. + * Which will terminate the dump. */ + return skb->len; } - /** - * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use - * @mdev: DRBD device. - * @nlp: Netlink/connector packet from drbdsetup - * @reply: Reply packet for drbdsetup + /* + * Request status of all resources, or of all volumes within a single resource. + * + * This is a dump, as the answer may not fit in a single reply skb otherwise. + * Which means we cannot use the family->attrbuf or other such members, because + * dump is NOT protected by the genl_lock(). During dump, we only have access + * to the incoming skb, and need to opencode "parsing" of the nlattr payload. + * + * Once things are setup properly, we call into get_one_status(). */ - static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) + int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb) { - unsigned short *tl; - char rv; + const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; + struct nlattr *nla; + const char *resource_name; + struct drbd_tconn *tconn; + int maxtype; + + /* Is this a followup call? */ + if (cb->args[0]) { + /* ... of a single resource dump, + * and the resource iterator has been advanced already? */ + if (cb->args[2] && cb->args[2] != cb->args[0]) + return 0; /* DONE. */ + goto dump; + } + + /* First call (from netlink_dump_start). We need to figure out + * which resource(s) the user wants us to dump. */ + nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen), + nlmsg_attrlen(cb->nlh, hdrlen), + DRBD_NLA_CFG_CONTEXT); + + /* No explicit context given. Dump all. */ + if (!nla) + goto dump; + maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; + nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name)); + if (IS_ERR(nla)) + return PTR_ERR(nla); + /* context given, but no name present? */ + if (!nla) + return -EINVAL; + resource_name = nla_data(nla); + tconn = conn_get_by_name(resource_name); + + if (!tconn) + return -ENODEV; + + kref_put(&tconn->kref, &conn_destroy); /* get_one_status() (re)validates tconn by itself */ + + /* prime iterators, and set "filter" mode mark: + * only dump this tconn. */ + cb->args[0] = (long)tconn; + /* cb->args[1] = 0; passed in this way. */ + cb->args[2] = (long)tconn; + + dump: + return get_one_status(skb, cb); + } - tl = reply->tag_list; + int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) + { + enum drbd_ret_code retcode; + struct timeout_parms tp; + int err; - rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : - drbd_test_flag(mdev, USE_DEGR_WFC_T) ? 
UT_DEGRADED : UT_DEFAULT; + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; - tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv)); - put_unaligned(TT_END, tl++); /* Close the tag list */ + tp.timeout_type = + adm_ctx.mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : + test_bit(USE_DEGR_WFC_T, &adm_ctx.mdev->flags) ? UT_DEGRADED : + UT_DEFAULT; - return (int)((char *)tl - (char *)reply->tag_list); + err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp); + if (err) { + nlmsg_free(adm_ctx.reply_skb); + return err; + } + out: + drbd_adm_finish(info, retcode); + return 0; } - static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) + int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) { - /* default to resume from last known position, if possible */ - struct start_ov args = { - .start_sector = mdev->ov_start_sector, - .stop_sector = ULLONG_MAX, - }; + struct drbd_conf *mdev; + enum drbd_ret_code retcode; + struct start_ov_parms parms; - if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) { - reply->ret_code = ERR_MANDATORY_TAG; - return 0; + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mdev = adm_ctx.mdev; + + /* resume from last known position, if possible */ + parms.ov_start_sector = mdev->ov_start_sector; + parms.ov_stop_sector = ULLONG_MAX; + if (info->attrs[DRBD_NLA_START_OV_PARMS]) { + int err = start_ov_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(from_attrs_err_to_txt(err)); + goto out; + } } + /* w_make_ov_request expects position to be aligned */ + mdev->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); + mdev->ov_stop_sector = parms.ov_stop_sector; /* If there is still bitmap IO pending, e.g. previous resync or verify * just being finished, wait for it before requesting a new resync. */ diff --cc drivers/block/drbd/drbd_receiver.c index eb0cafea142,813759f1b6a..0331ad0b61e --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@@ -656,19 -699,11 +699,11 @@@ static int prepare_listen_socket(struc goto out; } - timeo = mdev->net_conf->try_connect_int * HZ; - timeo += (random32() & 1) ? 
timeo / 7 : -timeo / 7; /* 28.5% random jitter */ - - s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ - s_listen->sk->sk_rcvtimeo = timeo; - s_listen->sk->sk_sndtimeo = timeo; - drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size, - mdev->net_conf->rcvbuf_size); - s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */ ++ s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size); what = "bind before listen"; - err = s_listen->ops->bind(s_listen, - (struct sockaddr *) mdev->net_conf->my_addr, - mdev->net_conf->my_addr_len); + err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len); if (err < 0) goto out; @@@ -829,33 -967,39 +967,39 @@@ randomize goto out_release_sockets; } - if (sock && msock) { - ok = drbd_socket_okay(mdev, &sock); - ok = drbd_socket_okay(mdev, &msock) && ok; - if (ok) - break; - } - } while (1); + ok = drbd_socket_okay(&sock.socket); + ok = drbd_socket_okay(&msock.socket) && ok; + } while (!ok); + + if (ad.s_listen) + sock_release(ad.s_listen); - msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ - sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ - sock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */ - msock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */ ++ sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ ++ msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ - sock->sk->sk_allocation = GFP_NOIO; - msock->sk->sk_allocation = GFP_NOIO; + sock.socket->sk->sk_allocation = GFP_NOIO; + msock.socket->sk->sk_allocation = GFP_NOIO; - sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; - msock->sk->sk_priority = TC_PRIO_INTERACTIVE; + sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; + msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE; /* NOT YET ... - * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; - * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; - * first set it to the P_HAND_SHAKE timeout, + * sock.socket->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10; + * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + * first set it to the P_CONNECTION_FEATURES timeout, * which we set to 4x the configured ping_timeout. */ - sock->sk->sk_sndtimeo = - sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10; + rcu_read_lock(); + nc = rcu_dereference(tconn->net_conf); + + sock.socket->sk->sk_sndtimeo = + sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10; - msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; - msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; + msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ; + timeout = nc->timeout * HZ / 10; + discard_my_data = nc->discard_my_data; + rcu_read_unlock(); + + msock.socket->sk->sk_sndtimeo = timeout; /* we don't want delays. * we use TCP_CORK where appropriate, though */ diff --cc drivers/block/drbd/drbd_req.c index 135ea76ed50,b905a0453bf..f58a4a4b4df --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@@ -1030,119 -1094,54 +1094,54 @@@ void __drbd_make_request(struct drbd_co /* no point in adding empty flushes to the transfer log, * they are mapped to drbd barriers already. */ - if (likely(size!=0)) - list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); + if (likely(req->i.size!=0)) { + if (rw == WRITE) + mdev->tconn->current_tle_writes++; - /* NOTE remote first: to get the concurrent write detection right, - * we must register the request before start of local IO. 
*/ - if (remote) { - /* either WRITE and C_CONNECTED, - * or READ, and no local disk, - * or READ, but not in sync. - */ - _req_mod(req, (rw == WRITE) - ? queue_for_net_write - : queue_for_net_read); + list_add_tail(&req->tl_requests, &mdev->tconn->transfer_log); } - if (send_oos && drbd_set_out_of_sync(mdev, sector, size)) - _req_mod(req, queue_for_send_oos); - - if (remote && - mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) - maybe_pull_ahead(mdev); - /* If this was a flush, queue a drbd barrier/start a new epoch. - * Unless the current epoch was empty anyways, or we are not currently - * replicating, in which case there is no point. */ - if (unlikely(bio->bi_rw & REQ_FLUSH) - && mdev->newest_tle->n_writes - && drbd_should_do_remote(mdev->state)) - queue_barrier(mdev); - - spin_unlock_irq(&mdev->req_lock); - kfree(b); /* if someone else has beaten us to it... */ - - if (local) { - req->private_bio->bi_bdev = mdev->ldev->backing_bdev; - - /* State may have changed since we grabbed our reference on the - * mdev->ldev member. Double check, and short-circuit to endio. - * In case the last activity log transaction failed to get on - * stable storage, and this is a WRITE, we may not even submit - * this bio. */ - if (get_ldev(mdev)) { - if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR - : rw == READ ? DRBD_FAULT_DT_RD - : DRBD_FAULT_DT_RA)) - bio_endio(req->private_bio, -EIO); - else - generic_make_request(req->private_bio); - put_ldev(mdev); + if (rw == WRITE) { + if (!drbd_process_write_request(req)) + no_remote = true; + } else { + /* We either have a private_bio, or we can read from remote. + * Otherwise we had done the goto nodata above. */ + if (req->private_bio == NULL) { + _req_mod(req, TO_BE_SENT); + _req_mod(req, QUEUE_FOR_NET_READ); } else - bio_endio(req->private_bio, -EIO); + no_remote = true; } - return 0; - - fail_conflicting: - /* this is a conflicting request. - * even though it may have been only _partially_ - * overlapping with one of the currently pending requests, - * without even submitting or sending it, we will - * pretend that it was successfully served right now. - */ - _drbd_end_io_acct(mdev, req); - spin_unlock_irq(&mdev->req_lock); - if (remote) - dec_ap_pending(mdev); - /* THINK: do we want to fail it (-EIO), or pretend success? - * this pretends success. */ - err = 0; - - fail_free_complete: - if (req->rq_state & RQ_IN_ACT_LOG) - drbd_al_complete_io(mdev, sector); - fail_and_free_req: - if (local) { - bio_put(req->private_bio); - req->private_bio = NULL; - put_ldev(mdev); + if (req->private_bio) { + /* needs to be marked within the same spinlock */ + _req_mod(req, TO_BE_SUBMITTED); + /* but we need to give up the spinlock to submit */ + spin_unlock_irq(&mdev->tconn->req_lock); + drbd_submit_req_private_bio(req); + spin_lock_irq(&mdev->tconn->req_lock); + } else if (no_remote) { + nodata: + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "IO ERROR: neither local nor remote data, sector %llu+%u\n", + (unsigned long long)req->i.sector, req->i.size >> 9); + /* A write may have been queued for send_oos, however. 
+ * So we can not simply free it, we must go through drbd_req_put_completion_ref() */ } - if (!ret) - bio_endio(bio, err); - - drbd_req_free(req); - dec_ap_bio(mdev); - kfree(b); - - return ret; - } - /* helper function for drbd_make_request - * if we can determine just by the mdev (state) that this request will fail, - * return 1 - * otherwise return 0 - */ - static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) - { - if (mdev->state.role != R_PRIMARY && - (!allow_oos || is_write)) { - if (__ratelimit(&drbd_ratelimit_state)) { - dev_err(DEV, "Process %s[%u] tried to %s; " - "since we are not in Primary state, " - "we cannot allow this\n", - current->comm, current->pid, - is_write ? "WRITE" : "READ"); - } - return 1; - } + out: + if (drbd_req_put_completion_ref(req, &m, 1)) + kref_put(&req->kref, drbd_req_destroy); + spin_unlock_irq(&mdev->tconn->req_lock); - return 0; + if (m.bio) + complete_master_bio(mdev, &m); + return; } -int drbd_make_request(struct request_queue *q, struct bio *bio) +void drbd_make_request(struct request_queue *q, struct bio *bio) { - unsigned int s_enr, e_enr; struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; unsigned long start_time; @@@ -1156,79 -1150,25 +1150,23 @@@ /* * what we "blindly" assume: */ - D_ASSERT((bio->bi_size & 0x1ff) == 0); - - /* to make some things easier, force alignment of requests within the - * granularity of our hash tables */ - s_enr = bio->bi_sector >> HT_SHIFT; - e_enr = bio->bi_size ? (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT : s_enr; - - if (likely(s_enr == e_enr)) { - do { - inc_ap_bio(mdev, 1); - } while (drbd_make_request_common(mdev, bio, start_time)); - return; - } - - /* can this bio be split generically? - * Maybe add our own split-arbitrary-bios function. */ - if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) { - /* rather error out here than BUG in bio_split */ - dev_err(DEV, "bio would need to, but cannot, be split: " - "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n", - bio->bi_vcnt, bio->bi_idx, bio->bi_size, - (unsigned long long)bio->bi_sector); - bio_endio(bio, -EINVAL); - } else { - /* This bio crosses some boundary, so we have to split it. */ - struct bio_pair *bp; - /* works for the "do not cross hash slot boundaries" case - * e.g. sector 262269, size 4096 - * s_enr = 262269 >> 6 = 4097 - * e_enr = (262269+8-1) >> 6 = 4098 - * HT_SHIFT = 6 - * sps = 64, mask = 63 - * first_sectors = 64 - (262269 & 63) = 3 - */ - const sector_t sect = bio->bi_sector; - const int sps = 1 << HT_SHIFT; /* sectors per slot */ - const int mask = sps - 1; - const sector_t first_sectors = sps - (sect & mask); - bp = bio_split(bio, first_sectors); + D_ASSERT(IS_ALIGNED(bio->bi_size, 512)); - /* we need to get a "reference count" (ap_bio_cnt) - * to avoid races with the disconnect/reconnect/suspend code. - * In case we need to split the bio here, we need to get three references - * atomically, otherwise we might deadlock when trying to submit the - * second one! */ - inc_ap_bio(mdev, 3); - - D_ASSERT(e_enr == s_enr + 1); - - while (drbd_make_request_common(mdev, &bp->bio1, start_time)) - inc_ap_bio(mdev, 1); - - while (drbd_make_request_common(mdev, &bp->bio2, start_time)) - inc_ap_bio(mdev, 1); - - dec_ap_bio(mdev); - - bio_pair_release(bp); - } + inc_ap_bio(mdev); + __drbd_make_request(mdev, bio, start_time); - - return 0; } - /* This is called by bio_add_page(). With this function we reduce - * the number of BIOs that span over multiple DRBD_MAX_BIO_SIZEs - * units (was AL_EXTENTs). 
+ /* This is called by bio_add_page(). + * + * q->max_hw_sectors and other global limits are already enforced there. * - * we do the calculation within the lower 32bit of the byte offsets, - * since we don't care for actual offset, but only check whether it - * would cross "activity log extent" boundaries. + * We need to call down to our lower level device, + * in case it has special restrictions. + * + * We also may need to enforce configured max-bio-bvecs limits. * * As long as the BIO is empty we have to allow at least one bvec, - * regardless of size and offset. so the resulting bio may still - * cross extent boundaries. those are dealt with (bio_split) in - * drbd_make_request. + * regardless of size and offset, so no need to ask lower levels. */ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) {