/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/kthread.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

static void lock_chunks(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
        mutex_unlock(&root->fs_info->chunk_mutex);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                kfree(device->name);
                kfree(device);
        }
        kfree(fs_devices);
}

int btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, list);
                list_del(&fs_devices->list);
                free_fs_devices(fs_devices);
        }
        return 0;
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
                                                   u64 devid, u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, head, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

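/*
 * Put a chain of bios (head .. tail) back at the front of the pending
 * list so they are resubmitted ahead of anything queued later.  The
 * caller must hold device->io_lock.
 */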
static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{
        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline int run_scheduled_bios(struct btrfs_device *device)
{
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, set up a plug here and finish it off before we return.
         */
        blk_start_plug(&plug);

        bdi = blk_get_backing_dev_info(device->bdev);
        fs_info = device->dev_root->fs_info;
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;
                atomic_dec(&fs_info->nr_async_bios);

                if (atomic_read(&fs_info->nr_async_bios) < limit &&
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);

                BUG_ON(atomic_read(&cur->bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                submit_bio(cur->bi_rw, cur);
                num_run++;
                batch_run++;
                if (need_resched())
                        cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                if (need_resched())
                                        cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_requeue_work(&device->work);
                        goto done;
                }
                /* unplug every 64 requests just for good measure */
                if (batch_run % 64 == 0) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
        return 0;
}

static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

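/*
 * Note a scanned device in the in-memory list of filesystems.  If the
 * fsid has not been seen before, a new btrfs_fs_devices is allocated;
 * if the devid is new for that fsid, a btrfs_device is added to it.
 * An existing entry just has its path and latest generation refreshed.
 */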
static noinline int device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        u64 found_transid = btrfs_super_generation(disk_super);
        char *name;

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
                if (!fs_devices)
                        return -ENOMEM;
                INIT_LIST_HEAD(&fs_devices->devices);
                INIT_LIST_HEAD(&fs_devices->alloc_list);
                list_add(&fs_devices->list, &fs_uuids);
                memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
                mutex_init(&fs_devices->device_list_mutex);
                device = NULL;
        } else {
                device = __find_device(&fs_devices->devices, devid,
                                       disk_super->dev_item.uuid);
        }
        if (!device) {
                if (fs_devices->opened)
                        return -EBUSY;

                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device) {
                        /* we can safely leave the fs_devices entry around */
                        return -ENOMEM;
                }
                device->devid = devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, disk_super->dev_item.uuid,
                       BTRFS_UUID_SIZE);
                spin_lock_init(&device->io_lock);
                device->name = kstrdup(path, GFP_NOFS);
                if (!device->name) {
                        kfree(device);
                        return -ENOMEM;
                }
                INIT_LIST_HEAD(&device->dev_alloc_list);

                /* init readahead state */
                spin_lock_init(&device->reada_lock);
                device->reada_curr_zone = NULL;
                atomic_set(&device->reada_in_flight, 0);
                device->reada_next = 0;
                INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
                INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add_rcu(&device->dev_list, &fs_devices->devices);
                mutex_unlock(&fs_devices->device_list_mutex);

                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        } else if (!device->name || strcmp(device->name, path)) {
                name = kstrdup(path, GFP_NOFS);
                if (!name)
                        return -ENOMEM;
                kfree(device->name);
                device->name = name;
                if (device->missing) {
                        fs_devices->missing_devices--;
                        device->missing = 0;
                }
        }

        if (found_transid > fs_devices->latest_trans) {
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
        }
        *fs_devices_ret = fs_devices;
        return 0;
}

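/*
 * Duplicate an fs_devices structure, copying each device's devid, uuid
 * and name into freshly allocated entries.  The caller is expected to
 * hold the locks that protect @orig's device list.
 */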
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!fs_devices)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&fs_devices->devices);
        INIT_LIST_HEAD(&fs_devices->alloc_list);
        INIT_LIST_HEAD(&fs_devices->list);
        mutex_init(&fs_devices->device_list_mutex);
        fs_devices->latest_devid = orig->latest_devid;
        fs_devices->latest_trans = orig->latest_trans;
        memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

        /* We have held the volume lock, it is safe to get the devices. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device)
                        goto error;

                device->name = kstrdup(orig_dev->name, GFP_NOFS);
                if (!device->name) {
                        kfree(device);
                        goto error;
                }

                device->devid = orig_dev->devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
                spin_lock_init(&device->io_lock);
                INIT_LIST_HEAD(&device->dev_list);
                INIT_LIST_HEAD(&device->dev_alloc_list);

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        return fs_devices;
error:
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

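/*
 * After the chunk tree has been read, drop any scanned devices that do
 * not appear in the filesystem metadata: close their block devices and
 * free their in-memory entries, walking any seed lists as well.
 */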
int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *next;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (device->in_fs_metadata)
                        continue;

                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
                        fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                kfree(device->name);
                kfree(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        mutex_unlock(&uuid_mutex);
        return 0;
}

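/*
 * Devices are freed in two stages: free_device() is the RCU callback,
 * which defers the actual teardown to a workqueue because blkdev_put()
 * may sleep, and __free_device() then releases the block device and
 * the memory.
 */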
static void __free_device(struct work_struct *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, rcu_work);

        if (device->bdev)
                blkdev_put(device->bdev, device->mode);

        kfree(device->name);
        kfree(device);
}

static void free_device(struct rcu_head *head)
{
        struct btrfs_device *device;

        device = container_of(head, struct btrfs_device, rcu);

        INIT_WORK(&device->rcu_work, __free_device);
        schedule_work(&device->rcu_work);
}

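/*
 * Drop one open reference on @fs_devices.  When the last reference goes
 * away, each device is replaced in the list by a clean copy (keeping the
 * name and devid but no open bdev) and the old entry is freed via RCU,
 * so readers traversing the list never see a half-closed device.
 */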
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                struct btrfs_device *new_device;

                if (device->bdev)
                        fs_devices->open_devices--;

                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }

                if (device->can_discard)
                        fs_devices->num_can_discard--;

                new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
                BUG_ON(!new_device);
                memcpy(new_device, device, sizeof(*new_device));
                new_device->name = kstrdup(device->name, GFP_NOFS);
                BUG_ON(device->name && !new_device->name);
                new_device->bdev = NULL;
                new_device->writeable = 0;
                new_device->in_fs_metadata = 0;
                new_device->can_discard = 0;
                list_replace_rcu(&device->dev_list, &new_device->dev_list);

                call_rcu(&device->rcu, free_device);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = __btrfs_close_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                __btrfs_close_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

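/*
 * Open every scanned device in the list, validate its superblock against
 * the devid/uuid we expect, and record the device carrying the highest
 * generation as latest_bdev.  Devices that fail to open or validate are
 * skipped; the open only fails outright if no device could be opened.
 */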
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct list_head *head = &fs_devices->devices;
        struct btrfs_device *device;
        struct block_device *latest_bdev = NULL;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 latest_devid = 0;
        u64 latest_transid = 0;
        u64 devid;
        int seeding = 1;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, head, dev_list) {
                if (device->bdev)
                        continue;
                if (!device->name)
                        continue;

                bdev = blkdev_get_by_path(device->name, flags, holder);
                if (IS_ERR(bdev)) {
                        printk(KERN_INFO "open %s failed\n", device->name);
                        goto error;
                }
                set_blocksize(bdev, 4096);

                bh = btrfs_read_dev_super(bdev);
                if (!bh)
                        goto error_close;

                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                if (devid != device->devid)
                        goto error_brelse;

                if (memcmp(device->uuid, disk_super->dev_item.uuid,
                           BTRFS_UUID_SIZE))
                        goto error_brelse;

                device->generation = btrfs_super_generation(disk_super);
                if (!latest_transid || device->generation > latest_transid) {
                        latest_devid = devid;
                        latest_transid = device->generation;
                        latest_bdev = bdev;
                }

                if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                        device->writeable = 0;
                } else {
                        device->writeable = !bdev_read_only(bdev);
                        seeding = 0;
                }

                q = bdev_get_queue(bdev);
                if (blk_queue_discard(q)) {
                        device->can_discard = 1;
                        fs_devices->num_can_discard++;
                }

                device->bdev = bdev;
                device->in_fs_metadata = 0;
                device->mode = flags;

                if (!blk_queue_nonrot(bdev_get_queue(bdev)))
                        fs_devices->rotating = 1;

                fs_devices->open_devices++;
                if (device->writeable) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
                }
                brelse(bh);
                continue;

error_brelse:
                brelse(bh);
error_close:
                blkdev_put(bdev, flags);
error:
                continue;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->seeding = seeding;
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_bdev;
        fs_devices->latest_devid = latest_devid;
        fs_devices->latest_trans = latest_transid;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                ret = __btrfs_open_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

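/*
 * Read the superblock from a single device path and, if it looks like a
 * btrfs device, register it with device_list_add().  The device is only
 * opened long enough to read the super; it is closed again before
 * returning.
 */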
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
        struct buffer_head *bh;
        int ret;
        u64 devid;
        u64 transid;

        mutex_lock(&uuid_mutex);

        flags |= FMODE_EXCL;
        bdev = blkdev_get_by_path(path, flags, holder);

        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        ret = set_blocksize(bdev, 4096);
        if (ret)
                goto error_close;
        bh = btrfs_read_dev_super(bdev);
        if (!bh) {
                ret = -EINVAL;
                goto error_close;
        }
        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        transid = btrfs_super_generation(disk_super);
        if (disk_super->label[0])
                printk(KERN_INFO "device label %s ", disk_super->label);
        else
                printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);

        brelse(bh);
error_close:
        blkdev_put(bdev, flags);
error:
        mutex_unlock(&uuid_mutex);
        return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes)
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (key.offset <= start && extent_end > end) {
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start)
                        *length += extent_end - start;
                else if (key.offset > start && extent_end <= end)
                        *length += extent_end - key.offset;
                else if (key.offset > start && key.offset <= end) {
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end)
                        break;

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * find_free_dev_extent - find free space in the specified device
 * @trans:      transaction handle
 * @device:     the device which we search the free space in
 * @num_bytes:  the size of the free space that we need
 * @start:      store the start of the free space.
 * @len:        the size of the free space that we find, or the size of the
 *              max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /* FIXME use last free of some kind */

        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
        search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

        max_hole_start = search_start;
        max_hole_size = 0;
        hole_size = 0;

        if (search_start >= search_end) {
                ret = -ENOSPC;
                goto error;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto error;
        }
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }
                        /*
                         * If this free space is greater than what we need,
                         * it must be the max free space that we have found
                         * until now, so max_hole_start must point to the start
                         * of this free space and the length of this free space
                         * is stored in max_hole_size. Thus, we return
                         * max_hole_start and max_hole_size and go back to the
                         * caller.
                         */
                        if (hole_size >= num_bytes) {
                                ret = 0;
                                goto out;
                        }
                }

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (extent_end > search_start)
                        search_start = extent_end;
next:
                path->slots[0]++;
                cond_resched();
        }

        /*
         * At this point, search_start should be the end of
         * allocated dev extents, and when shrinking the device,
         * search_end may be smaller than search_start.
         */
        if (search_end > search_start)
                hole_size = search_end - search_start;

        if (hole_size > max_hole_size) {
                max_hole_start = search_start;
                max_hole_size = hole_size;
        }

        /* See above. */
        if (hole_size < num_bytes)
                ret = -ENOSPC;
        else
                ret = 0;

out:
        btrfs_free_path(path);
error:
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
        return ret;
}

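/*
 * Remove the dev extent item covering @start from the device tree and
 * give the space back to the device's free counters.  The initial
 * search may land past the extent, in which case we step back to the
 * previous item and retry with its exact key.
 */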
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
                          struct btrfs_device *device,
                          u64 start)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf = NULL;
        struct btrfs_dev_extent *extent = NULL;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
again:
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid,
                                          BTRFS_DEV_EXTENT_KEY);
                if (ret)
                        goto out;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
                BUG_ON(found_key.offset > start || found_key.offset +
                       btrfs_dev_extent_length(leaf, extent) < start);
                key = found_key;
                btrfs_release_path(path);
                goto again;
        } else if (ret == 0) {
                leaf = path->nodes[0];
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
        }
        BUG_ON(ret);

        if (device->bytes_used > 0) {
                u64 len = btrfs_dev_extent_length(leaf, extent);
                device->bytes_used -= len;
                spin_lock(&root->fs_info->free_chunk_lock);
                root->fs_info->free_chunk_space += len;
                spin_unlock(&root->fs_info->free_chunk_lock);
        }
        ret = btrfs_del_item(trans, root, path);

out:
        btrfs_free_path(path);
        return ret;
}

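/*
 * Insert a dev extent item recording that the range [@start, @start +
 * @num_bytes) of @device belongs to the chunk at @chunk_offset in
 * @chunk_tree.
 */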
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
                           struct btrfs_device *device,
                           u64 chunk_tree, u64 chunk_objectid,
                           u64 chunk_offset, u64 start, u64 num_bytes)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *extent;
        struct extent_buffer *leaf;
        struct btrfs_key key;

        WARN_ON(!device->in_fs_metadata);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*extent));
        BUG_ON(ret);

        leaf = path->nodes[0];
        extent = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_dev_extent);
        btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
        btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
        btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

        write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
                    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
                    BTRFS_UUID_SIZE);

        btrfs_set_dev_extent_length(leaf, extent, num_bytes);
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
        return ret;
}

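/*
 * Find the logical offset just past the last chunk item for @objectid,
 * i.e. where the next chunk can be placed.  Searches from the highest
 * possible key and steps back to the last existing chunk item.
 */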
static noinline int find_next_chunk(struct btrfs_root *root,
                                    u64 objectid, u64 *offset)
{
        struct btrfs_path *path;
        int ret;
        struct btrfs_key key;
        struct btrfs_chunk *chunk;
        struct btrfs_key found_key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = objectid;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0);

        ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
        if (ret) {
                *offset = 0;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                if (found_key.objectid != objectid)
                        *offset = 0;
                else {
                        chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                               struct btrfs_chunk);
                        *offset = found_key.offset +
                                btrfs_chunk_length(path->nodes[0], chunk);
                }
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

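/*
 * Pick the devid for a new device: one past the highest devid currently
 * recorded in the chunk root's device items (or 1 if there are none).
 */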
static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_path *path;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0);

        ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
                                  BTRFS_DEV_ITEM_KEY);
        if (ret) {
                *objectid = 1;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                *objectid = found_key.offset + 1;
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root,
                     struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_dev_item *dev_item;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        unsigned long ptr;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;

        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*dev_item));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

        btrfs_set_device_id(leaf, dev_item, device->devid);
        btrfs_set_device_generation(leaf, dev_item, 0);
        btrfs_set_device_type(leaf, dev_item, device->type);
        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
        btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
        btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
        btrfs_set_device_group(leaf, dev_item, 0);
        btrfs_set_device_seek_speed(leaf, dev_item, 0);
        btrfs_set_device_bandwidth(leaf, dev_item, 0);
        btrfs_set_device_start_offset(leaf, dev_item, 0);

        ptr = (unsigned long)btrfs_device_uuid(dev_item);
        write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
        ptr = (unsigned long)btrfs_device_fsid(dev_item);
        write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
        btrfs_mark_buffer_dirty(leaf);

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

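/*
 * Delete a device item from the chunk root.  Runs in its own
 * transaction, which is committed before returning.
 */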
static int btrfs_rm_dev_item(struct btrfs_root *root,
                             struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_trans_handle *trans;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;
        lock_chunks(root);

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, root, path);
        if (ret)
                goto out;
out:
        btrfs_free_path(path);
        unlock_chunks(root);
        btrfs_commit_transaction(trans, root);
        return ret;
}

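/*
 * Remove a device from a mounted filesystem.  The device's extents are
 * relocated by shrinking it to zero, its device item is deleted, it is
 * dropped from the in-memory lists, and (unless it was missing) its
 * on-disk super is wiped so the device is no longer detected as btrfs.
 * "missing" may be passed instead of a path to remove a missing device.
 */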
int btrfs_rm_device(struct btrfs_root *root, char *device_path)
{
        struct btrfs_device *device;
        struct btrfs_device *next_device;
        struct block_device *bdev;
        struct buffer_head *bh = NULL;
        struct btrfs_super_block *disk_super;
        struct btrfs_fs_devices *cur_devices;
        u64 all_avail;
        u64 devid;
        u64 num_devices;
        u8 *dev_uuid;
        int ret = 0;
        bool clear_super = false;

        mutex_lock(&uuid_mutex);

        all_avail = root->fs_info->avail_data_alloc_bits |
                root->fs_info->avail_system_alloc_bits |
                root->fs_info->avail_metadata_alloc_bits;

        if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
            root->fs_info->fs_devices->num_devices <= 4) {
                printk(KERN_ERR "btrfs: unable to go below four devices "
                       "on raid10\n");
                ret = -EINVAL;
                goto out;
        }

        if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
            root->fs_info->fs_devices->num_devices <= 2) {
                printk(KERN_ERR "btrfs: unable to go below two "
                       "devices on raid1\n");
                ret = -EINVAL;
                goto out;
        }

        if (strcmp(device_path, "missing") == 0) {
                struct list_head *devices;
                struct btrfs_device *tmp;

                device = NULL;
                devices = &root->fs_info->fs_devices->devices;
                /*
                 * It is safe to read the devices since the volume_mutex
                 * is held.
                 */
                list_for_each_entry(tmp, devices, dev_list) {
                        if (tmp->in_fs_metadata && !tmp->bdev) {
                                device = tmp;
                                break;
                        }
                }
                bdev = NULL;
                bh = NULL;
                disk_super = NULL;
                if (!device) {
                        printk(KERN_ERR "btrfs: no missing devices found to "
                               "remove\n");
                        goto out;
                }
        } else {
                bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
                                          root->fs_info->bdev_holder);
                if (IS_ERR(bdev)) {
                        ret = PTR_ERR(bdev);
                        goto out;
                }

                set_blocksize(bdev, 4096);
                bh = btrfs_read_dev_super(bdev);
                if (!bh) {
                        ret = -EINVAL;
                        goto error_close;
                }
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                dev_uuid = disk_super->dev_item.uuid;
                device = btrfs_find_device(root, devid, dev_uuid,
                                           disk_super->fsid);
                if (!device) {
                        ret = -ENOENT;
                        goto error_brelse;
                }
        }

        if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
                printk(KERN_ERR "btrfs: unable to remove the only writeable "
                       "device\n");
                ret = -EINVAL;
                goto error_brelse;
        }

        if (device->writeable) {
                lock_chunks(root);
                list_del_init(&device->dev_alloc_list);
                unlock_chunks(root);
                root->fs_info->fs_devices->rw_devices--;
                clear_super = true;
        }

        ret = btrfs_shrink_device(device, 0);
        if (ret)
                goto error_undo;

        ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
        if (ret)
                goto error_undo;

        spin_lock(&root->fs_info->free_chunk_lock);
        root->fs_info->free_chunk_space = device->total_bytes -
                device->bytes_used;
        spin_unlock(&root->fs_info->free_chunk_lock);

        device->in_fs_metadata = 0;
        btrfs_scrub_cancel_dev(root, device);

        /*
         * the device list mutex makes sure that we don't change
         * the device list while someone else is writing out all
         * the device supers.
         */

        cur_devices = device->fs_devices;
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        list_del_rcu(&device->dev_list);

        device->fs_devices->num_devices--;

        if (device->missing)
                root->fs_info->fs_devices->missing_devices--;

        next_device = list_entry(root->fs_info->fs_devices->devices.next,
                                 struct btrfs_device, dev_list);
        if (device->bdev == root->fs_info->sb->s_bdev)
                root->fs_info->sb->s_bdev = next_device->bdev;
        if (device->bdev == root->fs_info->fs_devices->latest_bdev)
                root->fs_info->fs_devices->latest_bdev = next_device->bdev;

        if (device->bdev)
                device->fs_devices->open_devices--;

        call_rcu(&device->rcu, free_device);
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

        num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
        btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);

        if (cur_devices->open_devices == 0) {
                struct btrfs_fs_devices *fs_devices;
                fs_devices = root->fs_info->fs_devices;
                while (fs_devices) {
                        if (fs_devices->seed == cur_devices)
                                break;
                        fs_devices = fs_devices->seed;
                }
                fs_devices->seed = cur_devices->seed;
                cur_devices->seed = NULL;
                lock_chunks(root);
                __btrfs_close_devices(cur_devices);
                unlock_chunks(root);
                free_fs_devices(cur_devices);
        }

        /*
         * at this point, the device is zero sized.  We want to
         * remove it from the devices list and zero out the old super
         */
        if (clear_super) {
                /* make sure this device isn't detected as part of
                 * the FS anymore
                 */
                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
                set_buffer_dirty(bh);
                sync_dirty_buffer(bh);
        }

        ret = 0;

error_brelse:
        brelse(bh);
error_close:
        if (bdev)
                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
        mutex_unlock(&uuid_mutex);
        return ret;
error_undo:
        if (device->writeable) {
                lock_chunks(root);
                list_add(&device->dev_alloc_list,
                         &root->fs_info->fs_devices->alloc_list);
                unlock_chunks(root);
                root->fs_info->fs_devices->rw_devices++;
        }
        goto error_brelse;
}

/*
 * does all the dirty work required for changing a file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root)
{
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        struct btrfs_fs_devices *old_devices;
        struct btrfs_fs_devices *seed_devices;
        struct btrfs_super_block *disk_super = root->fs_info->super_copy;
        struct btrfs_device *device;
        u64 super_flags;

        BUG_ON(!mutex_is_locked(&uuid_mutex));
        if (!fs_devices->seeding)
                return -EINVAL;

        seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!seed_devices)
                return -ENOMEM;

        old_devices = clone_fs_devices(fs_devices);
        if (IS_ERR(old_devices)) {
                kfree(seed_devices);
                return PTR_ERR(old_devices);
        }

        list_add(&old_devices->list, &fs_uuids);

        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
        seed_devices->opened = 1;
        INIT_LIST_HEAD(&seed_devices->devices);
        INIT_LIST_HEAD(&seed_devices->alloc_list);
        mutex_init(&seed_devices->device_list_mutex);

        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
                              synchronize_rcu);
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

        list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
        list_for_each_entry(device, &seed_devices->devices, dev_list) {
                device->fs_devices = seed_devices;
        }

        fs_devices->seeding = 0;
        fs_devices->num_devices = 0;
        fs_devices->open_devices = 0;
        fs_devices->seed = seed_devices;

        generate_random_uuid(fs_devices->fsid);
        memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        super_flags = btrfs_super_flags(disk_super) &
                      ~BTRFS_SUPER_FLAG_SEEDING;
        btrfs_set_super_flags(disk_super, super_flags);

        return 0;
}

1528 /*
1529  * store the expected generation for seed devices in device items.
1530  */
1531 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1532                                struct btrfs_root *root)
1533 {
1534         struct btrfs_path *path;
1535         struct extent_buffer *leaf;
1536         struct btrfs_dev_item *dev_item;
1537         struct btrfs_device *device;
1538         struct btrfs_key key;
1539         u8 fs_uuid[BTRFS_UUID_SIZE];
1540         u8 dev_uuid[BTRFS_UUID_SIZE];
1541         u64 devid;
1542         int ret;
1543
1544         path = btrfs_alloc_path();
1545         if (!path)
1546                 return -ENOMEM;
1547
1548         root = root->fs_info->chunk_root;
1549         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1550         key.offset = 0;
1551         key.type = BTRFS_DEV_ITEM_KEY;
1552
1553         while (1) {
1554                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1555                 if (ret < 0)
1556                         goto error;
1557
1558                 leaf = path->nodes[0];
1559 next_slot:
1560                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1561                         ret = btrfs_next_leaf(root, path);
1562                         if (ret > 0)
1563                                 break;
1564                         if (ret < 0)
1565                                 goto error;
1566                         leaf = path->nodes[0];
1567                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1568                         btrfs_release_path(path);
1569                         continue;
1570                 }
1571
1572                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1573                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1574                     key.type != BTRFS_DEV_ITEM_KEY)
1575                         break;
1576
1577                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1578                                           struct btrfs_dev_item);
1579                 devid = btrfs_device_id(leaf, dev_item);
1580                 read_extent_buffer(leaf, dev_uuid,
1581                                    (unsigned long)btrfs_device_uuid(dev_item),
1582                                    BTRFS_UUID_SIZE);
1583                 read_extent_buffer(leaf, fs_uuid,
1584                                    (unsigned long)btrfs_device_fsid(dev_item),
1585                                    BTRFS_UUID_SIZE);
1586                 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1587                 BUG_ON(!device);
1588
1589                 if (device->fs_devices->seeding) {
1590                         btrfs_set_device_generation(leaf, dev_item,
1591                                                     device->generation);
1592                         btrfs_mark_buffer_dirty(leaf);
1593                 }
1594
1595                 path->slots[0]++;
1596                 goto next_slot;
1597         }
1598         ret = 0;
1599 error:
1600         btrfs_free_path(path);
1601         return ret;
1602 }
1603
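/*
 * Add a new device to a mounted fs (the device-add ioctl path).  Sketch
 * of the flow below: open the block device exclusively, refuse
 * duplicates, allocate and fill a btrfs_device, and wire it into the
 * fs_devices lists under device_list_mutex.  If the fs was a seed,
 * sprout it first (btrfs_prepare_sprout) and fix up the seed device
 * generations afterwards (btrfs_finish_sprout).
 */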
1604 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1605 {
1606         struct request_queue *q;
1607         struct btrfs_trans_handle *trans;
1608         struct btrfs_device *device;
1609         struct block_device *bdev;
1610         struct list_head *devices;
1611         struct super_block *sb = root->fs_info->sb;
1612         u64 total_bytes;
1613         int seeding_dev = 0;
1614         int ret = 0;
1615
1616         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1617                 return -EINVAL;
1618
1619         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1620                                   root->fs_info->bdev_holder);
1621         if (IS_ERR(bdev))
1622                 return PTR_ERR(bdev);
1623
1624         if (root->fs_info->fs_devices->seeding) {
1625                 seeding_dev = 1;
1626                 down_write(&sb->s_umount);
1627                 mutex_lock(&uuid_mutex);
1628         }
1629
1630         filemap_write_and_wait(bdev->bd_inode->i_mapping);
1631
1632         devices = &root->fs_info->fs_devices->devices;
1633         /*
1634          * we have the volume lock, so we don't need the extra
1635          * device list mutex while reading the list here.
1636          */
1637         list_for_each_entry(device, devices, dev_list) {
1638                 if (device->bdev == bdev) {
1639                         ret = -EEXIST;
1640                         goto error;
1641                 }
1642         }
1643
1644         device = kzalloc(sizeof(*device), GFP_NOFS);
1645         if (!device) {
1646                 /* we can safely leave the fs_devices entry around */
1647                 ret = -ENOMEM;
1648                 goto error;
1649         }
1650
1651         device->name = kstrdup(device_path, GFP_NOFS);
1652         if (!device->name) {
1653                 kfree(device);
1654                 ret = -ENOMEM;
1655                 goto error;
1656         }
1657
1658         ret = find_next_devid(root, &device->devid);
1659         if (ret) {
1660                 kfree(device->name);
1661                 kfree(device);
1662                 goto error;
1663         }
1664
1665         trans = btrfs_start_transaction(root, 0);
1666         if (IS_ERR(trans)) {
1667                 kfree(device->name);
1668                 kfree(device);
1669                 ret = PTR_ERR(trans);
1670                 goto error;
1671         }
1672
1673         lock_chunks(root);
1674
1675         q = bdev_get_queue(bdev);
1676         if (blk_queue_discard(q))
1677                 device->can_discard = 1;
1678         device->writeable = 1;
1679         device->work.func = pending_bios_fn;
1680         generate_random_uuid(device->uuid);
1681         spin_lock_init(&device->io_lock);
1682         device->generation = trans->transid;
1683         device->io_width = root->sectorsize;
1684         device->io_align = root->sectorsize;
1685         device->sector_size = root->sectorsize;
1686         device->total_bytes = i_size_read(bdev->bd_inode);
1687         device->disk_total_bytes = device->total_bytes;
1688         device->dev_root = root->fs_info->dev_root;
1689         device->bdev = bdev;
1690         device->in_fs_metadata = 1;
1691         device->mode = FMODE_EXCL;
1692         set_blocksize(device->bdev, 4096);
1693
1694         if (seeding_dev) {
1695                 sb->s_flags &= ~MS_RDONLY;
1696                 ret = btrfs_prepare_sprout(trans, root);
1697                 BUG_ON(ret);
1698         }
1699
1700         device->fs_devices = root->fs_info->fs_devices;
1701
1702         /*
1703          * we don't want write_supers to jump in here with our device
1704          * only half set up
1705          */
1706         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1707         list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1708         list_add(&device->dev_alloc_list,
1709                  &root->fs_info->fs_devices->alloc_list);
1710         root->fs_info->fs_devices->num_devices++;
1711         root->fs_info->fs_devices->open_devices++;
1712         root->fs_info->fs_devices->rw_devices++;
1713         if (device->can_discard)
1714                 root->fs_info->fs_devices->num_can_discard++;
1715         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1716
1717         spin_lock(&root->fs_info->free_chunk_lock);
1718         root->fs_info->free_chunk_space += device->total_bytes;
1719         spin_unlock(&root->fs_info->free_chunk_lock);
1720
1721         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1722                 root->fs_info->fs_devices->rotating = 1;
1723
1724         total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1725         btrfs_set_super_total_bytes(root->fs_info->super_copy,
1726                                     total_bytes + device->total_bytes);
1727
1728         total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1729         btrfs_set_super_num_devices(root->fs_info->super_copy,
1730                                     total_bytes + 1);
1731         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1732
1733         if (seeding_dev) {
1734                 ret = init_first_rw_device(trans, root, device);
1735                 BUG_ON(ret);
1736                 ret = btrfs_finish_sprout(trans, root);
1737                 BUG_ON(ret);
1738         } else {
1739                 ret = btrfs_add_device(trans, root, device);
1740         }
1741
1742         /*
1743          * we've got more storage, clear any full flags on the space
1744          * infos
1745          */
1746         btrfs_clear_space_info_full(root->fs_info);
1747
1748         unlock_chunks(root);
1749         btrfs_commit_transaction(trans, root);
1750
1751         if (seeding_dev) {
1752                 mutex_unlock(&uuid_mutex);
1753                 up_write(&sb->s_umount);
1754
1755                 ret = btrfs_relocate_sys_chunks(root);
1756                 BUG_ON(ret);
1757         }
1758
1759         return ret;
1760 error:
1761         blkdev_put(bdev, FMODE_EXCL);
1762         if (seeding_dev) {
1763                 mutex_unlock(&uuid_mutex);
1764                 up_write(&sb->s_umount);
1765         }
1766         return ret;
1767 }
1768
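/*
 * Write the in-memory fields of @device back into its dev item in the
 * chunk tree; returns -ENOENT if no matching item is found.
 */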
1769 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1770                                         struct btrfs_device *device)
1771 {
1772         int ret;
1773         struct btrfs_path *path;
1774         struct btrfs_root *root;
1775         struct btrfs_dev_item *dev_item;
1776         struct extent_buffer *leaf;
1777         struct btrfs_key key;
1778
1779         root = device->dev_root->fs_info->chunk_root;
1780
1781         path = btrfs_alloc_path();
1782         if (!path)
1783                 return -ENOMEM;
1784
1785         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1786         key.type = BTRFS_DEV_ITEM_KEY;
1787         key.offset = device->devid;
1788
1789         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1790         if (ret < 0)
1791                 goto out;
1792
1793         if (ret > 0) {
1794                 ret = -ENOENT;
1795                 goto out;
1796         }
1797
1798         leaf = path->nodes[0];
1799         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1800
1801         btrfs_set_device_id(leaf, dev_item, device->devid);
1802         btrfs_set_device_type(leaf, dev_item, device->type);
1803         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1804         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1805         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1806         btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1807         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1808         btrfs_mark_buffer_dirty(leaf);
1809
1810 out:
1811         btrfs_free_path(path);
1812         return ret;
1813 }
1814
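/*
 * Grow @device to @new_size: bump the superblock's total_bytes by the
 * difference and persist the new size via btrfs_update_device().
 * Callers must hold the chunk mutex; btrfs_grow_device() below is the
 * locked wrapper.
 */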
1815 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1816                       struct btrfs_device *device, u64 new_size)
1817 {
1818         struct btrfs_super_block *super_copy =
1819                 device->dev_root->fs_info->super_copy;
1820         u64 old_total = btrfs_super_total_bytes(super_copy);
1821         u64 diff = new_size - device->total_bytes;
1822
1823         if (!device->writeable)
1824                 return -EACCES;
1825         if (new_size <= device->total_bytes)
1826                 return -EINVAL;
1827
1828         btrfs_set_super_total_bytes(super_copy, old_total + diff);
1829         device->fs_devices->total_rw_bytes += diff;
1830
1831         device->total_bytes = new_size;
1832         device->disk_total_bytes = new_size;
1833         btrfs_clear_space_info_full(device->dev_root->fs_info);
1834
1835         return btrfs_update_device(trans, device);
1836 }
1837
1838 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1839                       struct btrfs_device *device, u64 new_size)
1840 {
1841         int ret;
1842         lock_chunks(device->dev_root);
1843         ret = __btrfs_grow_device(trans, device, new_size);
1844         unlock_chunks(device->dev_root);
1845         return ret;
1846 }
1847
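/*
 * Delete the (chunk_objectid, BTRFS_CHUNK_ITEM_KEY, chunk_offset) item
 * from the chunk tree.
 */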
1848 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1849                             struct btrfs_root *root,
1850                             u64 chunk_tree, u64 chunk_objectid,
1851                             u64 chunk_offset)
1852 {
1853         int ret;
1854         struct btrfs_path *path;
1855         struct btrfs_key key;
1856
1857         root = root->fs_info->chunk_root;
1858         path = btrfs_alloc_path();
1859         if (!path)
1860                 return -ENOMEM;
1861
1862         key.objectid = chunk_objectid;
1863         key.offset = chunk_offset;
1864         key.type = BTRFS_CHUNK_ITEM_KEY;
1865
1866         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1867         BUG_ON(ret);
1868
1869         ret = btrfs_del_item(trans, root, path);
1870
1871         btrfs_free_path(path);
1872         return ret;
1873 }
1874
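/*
 * System chunks are duplicated in super_copy->sys_chunk_array so the
 * chunk tree itself can be read at mount.  Walk the array and memmove()
 * the matching entry out, shrinking the recorded array size.
 */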
1875 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
1876                                u64 chunk_offset)
1877 {
1878         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1879         struct btrfs_disk_key *disk_key;
1880         struct btrfs_chunk *chunk;
1881         u8 *ptr;
1882         int ret = 0;
1883         u32 num_stripes;
1884         u32 array_size;
1885         u32 len = 0;
1886         u32 cur;
1887         struct btrfs_key key;
1888
1889         array_size = btrfs_super_sys_array_size(super_copy);
1890
1891         ptr = super_copy->sys_chunk_array;
1892         cur = 0;
1893
1894         while (cur < array_size) {
1895                 disk_key = (struct btrfs_disk_key *)ptr;
1896                 btrfs_disk_key_to_cpu(&key, disk_key);
1897
1898                 len = sizeof(*disk_key);
1899
1900                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1901                         chunk = (struct btrfs_chunk *)(ptr + len);
1902                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1903                         len += btrfs_chunk_item_size(num_stripes);
1904                 } else {
1905                         ret = -EIO;
1906                         break;
1907                 }
1908                 if (key.objectid == chunk_objectid &&
1909                     key.offset == chunk_offset) {
1910                         memmove(ptr, ptr + len, array_size - (cur + len));
1911                         array_size -= len;
1912                         btrfs_set_super_sys_array_size(super_copy, array_size);
1913                 } else {
1914                         ptr += len;
1915                         cur += len;
1916                 }
1917         }
1918         return ret;
1919 }
1920
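/*
 * Relocate one chunk and tear it down: step one moves every extent in
 * the block group elsewhere, step two (under the chunk mutex) frees the
 * per-device extents, deletes the chunk item (and its sys_chunk_array
 * copy for SYSTEM chunks), removes the block group and drops the extent
 * mapping.
 */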
1921 static int btrfs_relocate_chunk(struct btrfs_root *root,
1922                          u64 chunk_tree, u64 chunk_objectid,
1923                          u64 chunk_offset)
1924 {
1925         struct extent_map_tree *em_tree;
1926         struct btrfs_root *extent_root;
1927         struct btrfs_trans_handle *trans;
1928         struct extent_map *em;
1929         struct map_lookup *map;
1930         int ret;
1931         int i;
1932
1933         root = root->fs_info->chunk_root;
1934         extent_root = root->fs_info->extent_root;
1935         em_tree = &root->fs_info->mapping_tree.map_tree;
1936
1937         ret = btrfs_can_relocate(extent_root, chunk_offset);
1938         if (ret)
1939                 return -ENOSPC;
1940
1941         /* step one, relocate all the extents inside this chunk */
1942         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1943         if (ret)
1944                 return ret;
1945
1946         trans = btrfs_start_transaction(root, 0);
1947         BUG_ON(IS_ERR(trans));
1948
1949         lock_chunks(root);
1950
1951         /*
1952          * step two, delete the device extents and the
1953          * chunk tree entries
1954          */
1955         read_lock(&em_tree->lock);
1956         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1957         read_unlock(&em_tree->lock);
1958
1959         BUG_ON(em->start > chunk_offset ||
1960                em->start + em->len < chunk_offset);
1961         map = (struct map_lookup *)em->bdev;
1962
1963         for (i = 0; i < map->num_stripes; i++) {
1964                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1965                                             map->stripes[i].physical);
1966                 BUG_ON(ret);
1967
1968                 if (map->stripes[i].dev) {
1969                         ret = btrfs_update_device(trans, map->stripes[i].dev);
1970                         BUG_ON(ret);
1971                 }
1972         }
1973         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1974                                chunk_offset);
1975
1976         BUG_ON(ret);
1977
1978         trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
1979
1980         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1981                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1982                 BUG_ON(ret);
1983         }
1984
1985         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1986         BUG_ON(ret);
1987
1988         write_lock(&em_tree->lock);
1989         remove_extent_mapping(em_tree, em);
1990         write_unlock(&em_tree->lock);
1991
1992         kfree(map);
1993         em->bdev = NULL;
1994
1995         /* once for the tree */
1996         free_extent_map(em);
1997         /* once for us */
1998         free_extent_map(em);
1999
2000         unlock_chunks(root);
2001         btrfs_end_transaction(trans, root);
2002         return 0;
2003 }
2004
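/*
 * Walk the chunk tree from the highest key downwards and relocate every
 * SYSTEM chunk.  -ENOSPC failures are collected and the whole pass is
 * retried once before giving up.
 */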
2005 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2006 {
2007         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2008         struct btrfs_path *path;
2009         struct extent_buffer *leaf;
2010         struct btrfs_chunk *chunk;
2011         struct btrfs_key key;
2012         struct btrfs_key found_key;
2013         u64 chunk_tree = chunk_root->root_key.objectid;
2014         u64 chunk_type;
2015         bool retried = false;
2016         int failed = 0;
2017         int ret;
2018
2019         path = btrfs_alloc_path();
2020         if (!path)
2021                 return -ENOMEM;
2022
2023 again:
2024         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2025         key.offset = (u64)-1;
2026         key.type = BTRFS_CHUNK_ITEM_KEY;
2027
2028         while (1) {
2029                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2030                 if (ret < 0)
2031                         goto error;
2032                 BUG_ON(ret == 0);
2033
2034                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2035                                           key.type);
2036                 if (ret < 0)
2037                         goto error;
2038                 if (ret > 0)
2039                         break;
2040
2041                 leaf = path->nodes[0];
2042                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2043
2044                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2045                                        struct btrfs_chunk);
2046                 chunk_type = btrfs_chunk_type(leaf, chunk);
2047                 btrfs_release_path(path);
2048
2049                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2050                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2051                                                    found_key.objectid,
2052                                                    found_key.offset);
2053                         if (ret == -ENOSPC)
2054                                 failed++;
2055                         else if (ret)
2056                                 BUG();
2057                 }
2058
2059                 if (found_key.offset == 0)
2060                         break;
2061                 key.offset = found_key.offset - 1;
2062         }
2063         ret = 0;
2064         if (failed && !retried) {
2065                 failed = 0;
2066                 retried = true;
2067                 goto again;
2068         } else if (failed && retried) {
2069                 WARN_ON(1);
2070                 ret = -ENOSPC;
2071         }
2072 error:
2073         btrfs_free_path(path);
2074         return ret;
2075 }
2076
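/*
 * Persist the balance control as the BTRFS_BALANCE_OBJECTID item in the
 * tree root so an interrupted balance can be picked up again at mount
 * (see btrfs_recover_balance()).
 */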
2077 static int insert_balance_item(struct btrfs_root *root,
2078                                struct btrfs_balance_control *bctl)
2079 {
2080         struct btrfs_trans_handle *trans;
2081         struct btrfs_balance_item *item;
2082         struct btrfs_disk_balance_args disk_bargs;
2083         struct btrfs_path *path;
2084         struct extent_buffer *leaf;
2085         struct btrfs_key key;
2086         int ret, err;
2087
2088         path = btrfs_alloc_path();
2089         if (!path)
2090                 return -ENOMEM;
2091
2092         trans = btrfs_start_transaction(root, 0);
2093         if (IS_ERR(trans)) {
2094                 btrfs_free_path(path);
2095                 return PTR_ERR(trans);
2096         }
2097
2098         key.objectid = BTRFS_BALANCE_OBJECTID;
2099         key.type = BTRFS_BALANCE_ITEM_KEY;
2100         key.offset = 0;
2101
2102         ret = btrfs_insert_empty_item(trans, root, path, &key,
2103                                       sizeof(*item));
2104         if (ret)
2105                 goto out;
2106
2107         leaf = path->nodes[0];
2108         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2109
2110         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2111
2112         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2113         btrfs_set_balance_data(leaf, item, &disk_bargs);
2114         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2115         btrfs_set_balance_meta(leaf, item, &disk_bargs);
2116         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2117         btrfs_set_balance_sys(leaf, item, &disk_bargs);
2118
2119         btrfs_set_balance_flags(leaf, item, bctl->flags);
2120
2121         btrfs_mark_buffer_dirty(leaf);
2122 out:
2123         btrfs_free_path(path);
2124         err = btrfs_commit_transaction(trans, root);
2125         if (err && !ret)
2126                 ret = err;
2127         return ret;
2128 }
2129
2130 static int del_balance_item(struct btrfs_root *root)
2131 {
2132         struct btrfs_trans_handle *trans;
2133         struct btrfs_path *path;
2134         struct btrfs_key key;
2135         int ret, err;
2136
2137         path = btrfs_alloc_path();
2138         if (!path)
2139                 return -ENOMEM;
2140
2141         trans = btrfs_start_transaction(root, 0);
2142         if (IS_ERR(trans)) {
2143                 btrfs_free_path(path);
2144                 return PTR_ERR(trans);
2145         }
2146
2147         key.objectid = BTRFS_BALANCE_OBJECTID;
2148         key.type = BTRFS_BALANCE_ITEM_KEY;
2149         key.offset = 0;
2150
2151         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2152         if (ret < 0)
2153                 goto out;
2154         if (ret > 0) {
2155                 ret = -ENOENT;
2156                 goto out;
2157         }
2158
2159         ret = btrfs_del_item(trans, root, path);
2160 out:
2161         btrfs_free_path(path);
2162         err = btrfs_commit_transaction(trans, root);
2163         if (err && !ret)
2164                 ret = err;
2165         return ret;
2166 }
2167
2168 /*
2169  * This is a heuristic used to reduce the number of chunks balanced on
2170  * resume after balance was interrupted.
2171  */
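/*
 * For instance (illustrative only): a balance that was converting data
 * to some target profile resumes in soft mode, so chunks that already
 * have the target profile are skipped, and a plain interrupted balance
 * resumes with usage=90, skipping chunks that are already >= 90% full.
 */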
2172 static void update_balance_args(struct btrfs_balance_control *bctl)
2173 {
2174         /*
2175          * Turn on soft mode for chunk types that were being converted.
2176          */
2177         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2178                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2179         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2180                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2181         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2182                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2183
2184         /*
2185          * Turn on the usage filter if it is not already in use.  The idea is
2186          * that chunks that we have already balanced should be
2187          * reasonably full.  Don't do it for chunks that are being
2188          * converted - that will keep us from relocating unconverted
2189          * (albeit full) chunks.
2190          */
2191         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2192             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2193                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2194                 bctl->data.usage = 90;
2195         }
2196         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2197             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2198                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2199                 bctl->sys.usage = 90;
2200         }
2201         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2202             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2203                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2204                 bctl->meta.usage = 90;
2205         }
2206 }
2207
2208 /*
2209  * Should be called with both balance and volume mutexes held to
2210  * serialize other volume operations (add_dev/rm_dev/resize) with
2211  * restriper.  Same goes for unset_balance_control.
2212  */
2213 static void set_balance_control(struct btrfs_balance_control *bctl)
2214 {
2215         struct btrfs_fs_info *fs_info = bctl->fs_info;
2216
2217         BUG_ON(fs_info->balance_ctl);
2218
2219         spin_lock(&fs_info->balance_lock);
2220         fs_info->balance_ctl = bctl;
2221         spin_unlock(&fs_info->balance_lock);
2222 }
2223
2224 static void unset_balance_control(struct btrfs_fs_info *fs_info)
2225 {
2226         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2227
2228         BUG_ON(!fs_info->balance_ctl);
2229
2230         spin_lock(&fs_info->balance_lock);
2231         fs_info->balance_ctl = NULL;
2232         spin_unlock(&fs_info->balance_lock);
2233
2234         kfree(bctl);
2235 }
2236
2237 /*
2238  * Balance filters.  Return 1 if chunk should be filtered out
2239  * (should not be balanced).
2240  */
2241 static int chunk_profiles_filter(u64 chunk_profile,
2242                                  struct btrfs_balance_args *bargs)
2243 {
2244         chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2245
2246         if (chunk_profile == 0)
2247                 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2248
2249         if (bargs->profiles & chunk_profile)
2250                 return 0;
2251
2252         return 1;
2253 }
2254
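/* num scaled to @factor percent: num * factor / 100 (factor clamped to [0, 100]) */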
2255 static u64 div_factor_fine(u64 num, int factor)
2256 {
2257         if (factor <= 0)
2258                 return 0;
2259         if (factor >= 100)
2260                 return num;
2261
2262         num *= factor;
2263         do_div(num, 100);
2264         return num;
2265 }
2266
2267 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2268                               struct btrfs_balance_args *bargs)
2269 {
2270         struct btrfs_block_group_cache *cache;
2271         u64 chunk_used, user_thresh;
2272         int ret = 1;
2273
2274         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2275         chunk_used = btrfs_block_group_used(&cache->item);
2276
2277         user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
2278         if (chunk_used < user_thresh)
2279                 ret = 0;
2280
2281         btrfs_put_block_group(cache);
2282         return ret;
2283 }
2284
2285 static int chunk_devid_filter(struct extent_buffer *leaf,
2286                               struct btrfs_chunk *chunk,
2287                               struct btrfs_balance_args *bargs)
2288 {
2289         struct btrfs_stripe *stripe;
2290         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2291         int i;
2292
2293         for (i = 0; i < num_stripes; i++) {
2294                 stripe = btrfs_stripe_nr(chunk, i);
2295                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2296                         return 0;
2297         }
2298
2299         return 1;
2300 }
2301
2302 /* [pstart, pend) */
2303 static int chunk_drange_filter(struct extent_buffer *leaf,
2304                                struct btrfs_chunk *chunk,
2305                                u64 chunk_offset,
2306                                struct btrfs_balance_args *bargs)
2307 {
2308         struct btrfs_stripe *stripe;
2309         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2310         u64 stripe_offset;
2311         u64 stripe_length;
2312         int factor;
2313         int i;
2314
2315         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2316                 return 0;
2317
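        /*
         * A chunk's length is spread across its stripes; mirrored
         * profiles (DUP/RAID1/RAID10) keep two copies, so each stripe
         * covers chunk_length / (num_stripes / copies) bytes on its
         * device.
         */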
2318         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2319              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2320                 factor = 2;
2321         else
2322                 factor = 1;
2323         factor = num_stripes / factor;
2324
2325         for (i = 0; i < num_stripes; i++) {
2326                 stripe = btrfs_stripe_nr(chunk, i);
2327                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2328                         continue;
2329
2330                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2331                 stripe_length = btrfs_chunk_length(leaf, chunk);
2332                 do_div(stripe_length, factor);
2333
2334                 if (stripe_offset < bargs->pend &&
2335                     stripe_offset + stripe_length > bargs->pstart)
2336                         return 0;
2337         }
2338
2339         return 1;
2340 }
2341
2342 /* [vstart, vend) */
2343 static int chunk_vrange_filter(struct extent_buffer *leaf,
2344                                struct btrfs_chunk *chunk,
2345                                u64 chunk_offset,
2346                                struct btrfs_balance_args *bargs)
2347 {
2348         if (chunk_offset < bargs->vend &&
2349             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2350                 /* at least part of the chunk is inside this vrange */
2351                 return 0;
2352
2353         return 1;
2354 }
2355
2356 static int chunk_soft_convert_filter(u64 chunk_profile,
2357                                      struct btrfs_balance_args *bargs)
2358 {
2359         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2360                 return 0;
2361
2362         chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2363
2364         if (chunk_profile == 0)
2365                 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2366
2367         if (bargs->target & chunk_profile)
2368                 return 1;
2369
2370         return 0;
2371 }
2372
2373 static int should_balance_chunk(struct btrfs_root *root,
2374                                 struct extent_buffer *leaf,
2375                                 struct btrfs_chunk *chunk, u64 chunk_offset)
2376 {
2377         struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2378         struct btrfs_balance_args *bargs = NULL;
2379         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2380
2381         /* type filter */
2382         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2383               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2384                 return 0;
2385         }
2386
2387         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2388                 bargs = &bctl->data;
2389         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2390                 bargs = &bctl->sys;
2391         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2392                 bargs = &bctl->meta;
2393
2394         /* profiles filter */
2395         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2396             chunk_profiles_filter(chunk_type, bargs)) {
2397                 return 0;
2398         }
2399
2400         /* usage filter */
2401         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2402             chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2403                 return 0;
2404         }
2405
2406         /* devid filter */
2407         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2408             chunk_devid_filter(leaf, chunk, bargs)) {
2409                 return 0;
2410         }
2411
2412         /* drange filter, makes sense only with devid filter */
2413         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2414             chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2415                 return 0;
2416         }
2417
2418         /* vrange filter */
2419         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2420             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2421                 return 0;
2422         }
2423
2424         /* soft profile changing mode */
2425         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2426             chunk_soft_convert_filter(chunk_type, bargs)) {
2427                 return 0;
2428         }
2429
2430         return 1;
2431 }
2432
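/* num scaled to @factor tenths: num * factor / 10 */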
2433 static u64 div_factor(u64 num, int factor)
2434 {
2435         if (factor == 10)
2436                 return num;
2437         num *= factor;
2438         do_div(num, 10);
2439         return num;
2440 }
2441
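/*
 * Core balance loop.  After shrinking/regrowing each device to free
 * some room, make two passes over the chunk tree from the highest key
 * down: the first, with counting set, only tallies matching chunks into
 * bctl->stat.expected; the second actually relocates them.  Cancel
 * requests abort either pass, pause requests only the second.
 */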
2442 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2443 {
2444         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2445         struct btrfs_root *chunk_root = fs_info->chunk_root;
2446         struct btrfs_root *dev_root = fs_info->dev_root;
2447         struct list_head *devices;
2448         struct btrfs_device *device;
2449         u64 old_size;
2450         u64 size_to_free;
2451         struct btrfs_chunk *chunk;
2452         struct btrfs_path *path;
2453         struct btrfs_key key;
2454         struct btrfs_key found_key;
2455         struct btrfs_trans_handle *trans;
2456         struct extent_buffer *leaf;
2457         int slot;
2458         int ret;
2459         int enospc_errors = 0;
2460         bool counting = true;
2461
2462         /* step one, make some room on all the devices */
2463         devices = &fs_info->fs_devices->devices;
2464         list_for_each_entry(device, devices, dev_list) {
2465                 old_size = device->total_bytes;
2466                 size_to_free = div_factor(old_size, 1);
2467                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2468                 if (!device->writeable ||
2469                     device->total_bytes - device->bytes_used > size_to_free)
2470                         continue;
2471
2472                 ret = btrfs_shrink_device(device, old_size - size_to_free);
2473                 if (ret == -ENOSPC)
2474                         break;
2475                 BUG_ON(ret);
2476
2477                 trans = btrfs_start_transaction(dev_root, 0);
2478                 BUG_ON(IS_ERR(trans));
2479
2480                 ret = btrfs_grow_device(trans, device, old_size);
2481                 BUG_ON(ret);
2482
2483                 btrfs_end_transaction(trans, dev_root);
2484         }
2485
2486         /* step two, relocate all the chunks */
2487         path = btrfs_alloc_path();
2488         if (!path) {
2489                 ret = -ENOMEM;
2490                 goto error;
2491         }
2492
2493         /* zero out stat counters */
2494         spin_lock(&fs_info->balance_lock);
2495         memset(&bctl->stat, 0, sizeof(bctl->stat));
2496         spin_unlock(&fs_info->balance_lock);
2497 again:
2498         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2499         key.offset = (u64)-1;
2500         key.type = BTRFS_CHUNK_ITEM_KEY;
2501
2502         while (1) {
2503                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2504                     atomic_read(&fs_info->balance_cancel_req)) {
2505                         ret = -ECANCELED;
2506                         goto error;
2507                 }
2508
2509                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2510                 if (ret < 0)
2511                         goto error;
2512
2513                 /*
2514                  * this shouldn't happen, it means the last relocate
2515                  * failed
2516                  */
2517                 if (ret == 0)
2518                         BUG(); /* FIXME break ? */
2519
2520                 ret = btrfs_previous_item(chunk_root, path, 0,
2521                                           BTRFS_CHUNK_ITEM_KEY);
2522                 if (ret) {
2523                         ret = 0;
2524                         break;
2525                 }
2526
2527                 leaf = path->nodes[0];
2528                 slot = path->slots[0];
2529                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2530
2531                 if (found_key.objectid != key.objectid)
2532                         break;
2533
2534                 /* chunk zero is special */
2535                 if (found_key.offset == 0)
2536                         break;
2537
2538                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2539
2540                 if (!counting) {
2541                         spin_lock(&fs_info->balance_lock);
2542                         bctl->stat.considered++;
2543                         spin_unlock(&fs_info->balance_lock);
2544                 }
2545
2546                 ret = should_balance_chunk(chunk_root, leaf, chunk,
2547                                            found_key.offset);
2548                 btrfs_release_path(path);
2549                 if (!ret)
2550                         goto loop;
2551
2552                 if (counting) {
2553                         spin_lock(&fs_info->balance_lock);
2554                         bctl->stat.expected++;
2555                         spin_unlock(&fs_info->balance_lock);
2556                         goto loop;
2557                 }
2558
2559                 ret = btrfs_relocate_chunk(chunk_root,
2560                                            chunk_root->root_key.objectid,
2561                                            found_key.objectid,
2562                                            found_key.offset);
2563                 if (ret && ret != -ENOSPC)
2564                         goto error;
2565                 if (ret == -ENOSPC) {
2566                         enospc_errors++;
2567                 } else {
2568                         spin_lock(&fs_info->balance_lock);
2569                         bctl->stat.completed++;
2570                         spin_unlock(&fs_info->balance_lock);
2571                 }
2572 loop:
2573                 key.offset = found_key.offset - 1;
2574         }
2575
2576         if (counting) {
2577                 btrfs_release_path(path);
2578                 counting = false;
2579                 goto again;
2580         }
2581 error:
2582         btrfs_free_path(path);
2583         if (enospc_errors) {
2584                 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2585                        enospc_errors);
2586                 if (!ret)
2587                         ret = -ENOSPC;
2588         }
2589
2590         return ret;
2591 }
2592
2593 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
2594 {
2595         /* cancel requested || normal exit path */
2596         return atomic_read(&fs_info->balance_cancel_req) ||
2597                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
2598                  atomic_read(&fs_info->balance_cancel_req) == 0);
2599 }
2600
2601 static void __cancel_balance(struct btrfs_fs_info *fs_info)
2602 {
2603         int ret;
2604
2605         unset_balance_control(fs_info);
2606         ret = del_balance_item(fs_info->tree_root);
2607         BUG_ON(ret);
2608 }
2609
2610 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
2611                                struct btrfs_ioctl_balance_args *bargs);
2612
2613 /*
2614  * Should be called with both balance and volume mutexes held
2615  */
2616 int btrfs_balance(struct btrfs_balance_control *bctl,
2617                   struct btrfs_ioctl_balance_args *bargs)
2618 {
2619         struct btrfs_fs_info *fs_info = bctl->fs_info;
2620         u64 allowed;
2621         int ret;
2622
2623         if (btrfs_fs_closing(fs_info) ||
2624             atomic_read(&fs_info->balance_pause_req) ||
2625             atomic_read(&fs_info->balance_cancel_req)) {
2626                 ret = -EINVAL;
2627                 goto out;
2628         }
2629
2630         /*
2631          * In case of mixed groups both data and meta should be picked,
2632          * and identical options should be given for both of them.
2633          */
2634         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
2635         if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2636             (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
2637                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
2638                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
2639                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
2640                         printk(KERN_ERR "btrfs: with mixed groups data and "
2641                                "metadata balance options must be the same\n");
2642                         ret = -EINVAL;
2643                         goto out;
2644                 }
2645         }
2646
2647         /*
2648          * Profile changing sanity checks.  Skip them if a simple
2649          * balance is requested.
2650          */
2651         if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
2652               BTRFS_BALANCE_ARGS_CONVERT))
2653                 goto do_balance;
2654
2655         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2656         if (fs_info->fs_devices->num_devices == 1)
2657                 allowed |= BTRFS_BLOCK_GROUP_DUP;
2658         else if (fs_info->fs_devices->num_devices < 4)
2659                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2660         else
2661                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2662                                 BTRFS_BLOCK_GROUP_RAID10);
2663
2664         if (!profile_is_valid(bctl->data.target, 1) ||
2665             bctl->data.target & ~allowed) {
2666                 printk(KERN_ERR "btrfs: unable to start balance with target "
2667                        "data profile %llu\n",
2668                        (unsigned long long)bctl->data.target);
2669                 ret = -EINVAL;
2670                 goto out;
2671         }
2672         if (!profile_is_valid(bctl->meta.target, 1) ||
2673             bctl->meta.target & ~allowed) {
2674                 printk(KERN_ERR "btrfs: unable to start balance with target "
2675                        "metadata profile %llu\n",
2676                        (unsigned long long)bctl->meta.target);
2677                 ret = -EINVAL;
2678                 goto out;
2679         }
2680         if (!profile_is_valid(bctl->sys.target, 1) ||
2681             bctl->sys.target & ~allowed) {
2682                 printk(KERN_ERR "btrfs: unable to start balance with target "
2683                        "system profile %llu\n",
2684                        (unsigned long long)bctl->sys.target);
2685                 ret = -EINVAL;
2686                 goto out;
2687         }
2688
2689         if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
2690                 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
2691                 ret = -EINVAL;
2692                 goto out;
2693         }
2694
2695         /* allow to reduce meta or sys integrity only if force set */
2696         allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2697                         BTRFS_BLOCK_GROUP_RAID10;
2698         if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2699              (fs_info->avail_system_alloc_bits & allowed) &&
2700              !(bctl->sys.target & allowed)) ||
2701             ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2702              (fs_info->avail_metadata_alloc_bits & allowed) &&
2703              !(bctl->meta.target & allowed))) {
2704                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
2705                         printk(KERN_INFO "btrfs: force reducing metadata "
2706                                "integrity\n");
2707                 } else {
2708                         printk(KERN_ERR "btrfs: balance will reduce metadata "
2709                                "integrity, use force if you want this\n");
2710                         ret = -EINVAL;
2711                         goto out;
2712                 }
2713         }
2714
2715 do_balance:
2716         ret = insert_balance_item(fs_info->tree_root, bctl);
2717         if (ret && ret != -EEXIST)
2718                 goto out;
2719
2720         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
2721                 BUG_ON(ret == -EEXIST);
2722                 set_balance_control(bctl);
2723         } else {
2724                 BUG_ON(ret != -EEXIST);
2725                 spin_lock(&fs_info->balance_lock);
2726                 update_balance_args(bctl);
2727                 spin_unlock(&fs_info->balance_lock);
2728         }
2729
2730         atomic_inc(&fs_info->balance_running);
2731         mutex_unlock(&fs_info->balance_mutex);
2732
2733         ret = __btrfs_balance(fs_info);
2734
2735         mutex_lock(&fs_info->balance_mutex);
2736         atomic_dec(&fs_info->balance_running);
2737
2738         if (bargs) {
2739                 memset(bargs, 0, sizeof(*bargs));
2740                 update_ioctl_balance_args(fs_info, 0, bargs);
2741         }
2742
2743         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
2744             balance_need_close(fs_info)) {
2745                 __cancel_balance(fs_info);
2746         }
2747
2748         wake_up(&fs_info->balance_wait_q);
2749
2750         return ret;
2751 out:
2752         if (bctl->flags & BTRFS_BALANCE_RESUME)
2753                 __cancel_balance(fs_info);
2754         else
2755                 kfree(bctl);
2756         return ret;
2757 }
2758
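/*
 * Kthread spawned at mount to resume an interrupted balance.  Takes the
 * volume and balance mutexes, installs the recovered balance control
 * and re-runs btrfs_balance() unless the skip_balance mount option is
 * set.
 */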
2759 static int balance_kthread(void *data)
2760 {
2761         struct btrfs_balance_control *bctl =
2762                         (struct btrfs_balance_control *)data;
2763         struct btrfs_fs_info *fs_info = bctl->fs_info;
2764         int ret = 0;
2765
2766         mutex_lock(&fs_info->volume_mutex);
2767         mutex_lock(&fs_info->balance_mutex);
2768
2769         set_balance_control(bctl);
2770
2771         if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
2772                 printk(KERN_INFO "btrfs: force skipping balance\n");
2773         } else {
2774                 printk(KERN_INFO "btrfs: continuing balance\n");
2775                 ret = btrfs_balance(bctl, NULL);
2776         }
2777
2778         mutex_unlock(&fs_info->balance_mutex);
2779         mutex_unlock(&fs_info->volume_mutex);
2780         return ret;
2781 }
2782
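/*
 * Called at mount: look up the balance item written by
 * insert_balance_item(); if one exists, rebuild the balance control
 * with BTRFS_BALANCE_RESUME set and hand it to balance_kthread.
 */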
2783 int btrfs_recover_balance(struct btrfs_root *tree_root)
2784 {
2785         struct task_struct *tsk;
2786         struct btrfs_balance_control *bctl;
2787         struct btrfs_balance_item *item;
2788         struct btrfs_disk_balance_args disk_bargs;
2789         struct btrfs_path *path;
2790         struct extent_buffer *leaf;
2791         struct btrfs_key key;
2792         int ret;
2793
2794         path = btrfs_alloc_path();
2795         if (!path)
2796                 return -ENOMEM;
2797
2798         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
2799         if (!bctl) {
2800                 ret = -ENOMEM;
2801                 goto out;
2802         }
2803
2804         key.objectid = BTRFS_BALANCE_OBJECTID;
2805         key.type = BTRFS_BALANCE_ITEM_KEY;
2806         key.offset = 0;
2807
2808         ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2809         if (ret < 0)
2810                 goto out_bctl;
2811         if (ret > 0) { /* ret = -ENOENT; */
2812                 ret = 0;
2813                 goto out_bctl;
2814         }
2815
2816         leaf = path->nodes[0];
2817         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2818
2819         bctl->fs_info = tree_root->fs_info;
2820         bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
2821
2822         btrfs_balance_data(leaf, item, &disk_bargs);
2823         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
2824         btrfs_balance_meta(leaf, item, &disk_bargs);
2825         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
2826         btrfs_balance_sys(leaf, item, &disk_bargs);
2827         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2828
2829         tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
2830         if (IS_ERR(tsk))
2831                 ret = PTR_ERR(tsk);
2832         else
2833                 goto out;
2834
2835 out_bctl:
2836         kfree(bctl);
2837 out:
2838         btrfs_free_path(path);
2839         return ret;
2840 }
2841
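/*
 * Pause a running balance: bump balance_pause_req and wait for
 * balance_running to reach zero.  The balance item and control are kept
 * in place so the operation can be resumed later.
 */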
2842 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
2843 {
2844         int ret = 0;
2845
2846         mutex_lock(&fs_info->balance_mutex);
2847         if (!fs_info->balance_ctl) {
2848                 mutex_unlock(&fs_info->balance_mutex);
2849                 return -ENOTCONN;
2850         }
2851
2852         if (atomic_read(&fs_info->balance_running)) {
2853                 atomic_inc(&fs_info->balance_pause_req);
2854                 mutex_unlock(&fs_info->balance_mutex);
2855
2856                 wait_event(fs_info->balance_wait_q,
2857                            atomic_read(&fs_info->balance_running) == 0);
2858
2859                 mutex_lock(&fs_info->balance_mutex);
2860                 /* we are good with balance_ctl ripped off from under us */
2861                 BUG_ON(atomic_read(&fs_info->balance_running));
2862                 atomic_dec(&fs_info->balance_pause_req);
2863         } else {
2864                 ret = -ENOTCONN;
2865         }
2866
2867         mutex_unlock(&fs_info->balance_mutex);
2868         return ret;
2869 }
2870
2871 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
2872 {
2873         mutex_lock(&fs_info->balance_mutex);
2874         if (!fs_info->balance_ctl) {
2875                 mutex_unlock(&fs_info->balance_mutex);
2876                 return -ENOTCONN;
2877         }
2878
2879         atomic_inc(&fs_info->balance_cancel_req);
2880         /*
2881          * if we are running, just wait and return; the balance item
2882          * is deleted in btrfs_balance in this case
2883          */
2884         if (atomic_read(&fs_info->balance_running)) {
2885                 mutex_unlock(&fs_info->balance_mutex);
2886                 wait_event(fs_info->balance_wait_q,
2887                            atomic_read(&fs_info->balance_running) == 0);
2888                 mutex_lock(&fs_info->balance_mutex);
2889         } else {
2890                 /* __cancel_balance needs volume_mutex */
2891                 mutex_unlock(&fs_info->balance_mutex);
2892                 mutex_lock(&fs_info->volume_mutex);
2893                 mutex_lock(&fs_info->balance_mutex);
2894
2895                 if (fs_info->balance_ctl)
2896                         __cancel_balance(fs_info);
2897
2898                 mutex_unlock(&fs_info->volume_mutex);
2899         }
2900
2901         BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
2902         atomic_dec(&fs_info->balance_cancel_req);
2903         mutex_unlock(&fs_info->balance_mutex);
2904         return 0;
2905 }
2906
2907 /*
2908  * shrinking a device means finding all of the device extents past
2909  * the new size, and then following the back refs to the chunks.
2910  * The chunk relocation code actually frees the device extents.
2911  */
2912 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2913 {
2914         struct btrfs_trans_handle *trans;
2915         struct btrfs_root *root = device->dev_root;
2916         struct btrfs_dev_extent *dev_extent = NULL;
2917         struct btrfs_path *path;
2918         u64 length;
2919         u64 chunk_tree;
2920         u64 chunk_objectid;
2921         u64 chunk_offset;
2922         int ret;
2923         int slot;
2924         int failed = 0;
2925         bool retried = false;
2926         struct extent_buffer *l;
2927         struct btrfs_key key;
2928         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2929         u64 old_total = btrfs_super_total_bytes(super_copy);
2930         u64 old_size = device->total_bytes;
2931         u64 diff = device->total_bytes - new_size;
2932
2933         if (new_size >= device->total_bytes)
2934                 return -EINVAL;
2935
2936         path = btrfs_alloc_path();
2937         if (!path)
2938                 return -ENOMEM;
2939
2940         path->reada = 2;
2941
2942         lock_chunks(root);
2943
2944         device->total_bytes = new_size;
2945         if (device->writeable) {
2946                 device->fs_devices->total_rw_bytes -= diff;
2947                 spin_lock(&root->fs_info->free_chunk_lock);
2948                 root->fs_info->free_chunk_space -= diff;
2949                 spin_unlock(&root->fs_info->free_chunk_lock);
2950         }
2951         unlock_chunks(root);
2952
2953 again:
2954         key.objectid = device->devid;
2955         key.offset = (u64)-1;
2956         key.type = BTRFS_DEV_EXTENT_KEY;
2957
2958         while (1) {
2959                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2960                 if (ret < 0)
2961                         goto done;
2962
2963                 ret = btrfs_previous_item(root, path, 0, key.type);
2964                 if (ret < 0)
2965                         goto done;
2966                 if (ret) {
2967                         ret = 0;
2968                         btrfs_release_path(path);
2969                         break;
2970                 }
2971
2972                 l = path->nodes[0];
2973                 slot = path->slots[0];
2974                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2975
2976                 if (key.objectid != device->devid) {
2977                         btrfs_release_path(path);
2978                         break;
2979                 }
2980
2981                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2982                 length = btrfs_dev_extent_length(l, dev_extent);
2983
2984                 if (key.offset + length <= new_size) {
2985                         btrfs_release_path(path);
2986                         break;
2987                 }
2988
2989                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2990                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2991                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2992                 btrfs_release_path(path);
2993
2994                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
2995                                            chunk_offset);
2996                 if (ret && ret != -ENOSPC)
2997                         goto done;
2998                 if (ret == -ENOSPC)
2999                         failed++;
3000                 key.offset -= 1;
3001         }
3002
3003         if (failed && !retried) {
3004                 failed = 0;
3005                 retried = true;
3006                 goto again;
3007         } else if (failed && retried) {
3008                 ret = -ENOSPC;
3009                 lock_chunks(root);
3010
3011                 device->total_bytes = old_size;
3012                 if (device->writeable)
3013                         device->fs_devices->total_rw_bytes += diff;
3014                 spin_lock(&root->fs_info->free_chunk_lock);
3015                 root->fs_info->free_chunk_space += diff;
3016                 spin_unlock(&root->fs_info->free_chunk_lock);
3017                 unlock_chunks(root);
3018                 goto done;
3019         }
3020
3021         /* Shrinking succeeded, else we would be at "done". */
3022         trans = btrfs_start_transaction(root, 0);
3023         if (IS_ERR(trans)) {
3024                 ret = PTR_ERR(trans);
3025                 goto done;
3026         }
3027
3028         lock_chunks(root);
3029
3030         device->disk_total_bytes = new_size;
3031         /* Now btrfs_update_device() will change the on-disk size. */
3032         ret = btrfs_update_device(trans, device);
3033         if (ret) {
3034                 unlock_chunks(root);
3035                 btrfs_end_transaction(trans, root);
3036                 goto done;
3037         }
3038         WARN_ON(diff > old_total);
3039         btrfs_set_super_total_bytes(super_copy, old_total - diff);
3040         unlock_chunks(root);
3041         btrfs_end_transaction(trans, root);
3042 done:
3043         btrfs_free_path(path);
3044         return ret;
3045 }
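
/*
 * A minimal caller sketch (hypothetical; the surrounding resize path and
 * its locking are assumed, not shown): shrink a device by 1GiB.  Note
 * that btrfs_shrink_device() rolls back its own accounting when chunk
 * relocation runs out of space, so the caller only checks the result.
 *
 *	u64 new_size = device->total_bytes - (1ULL << 30);
 *	int ret = btrfs_shrink_device(device, new_size);
 *	if (ret)
 *		return ret;
 */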
3046
3047 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
3048                            struct btrfs_root *root,
3049                            struct btrfs_key *key,
3050                            struct btrfs_chunk *chunk, int item_size)
3051 {
3052         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3053         struct btrfs_disk_key disk_key;
3054         u32 array_size;
3055         u8 *ptr;
3056
3057         array_size = btrfs_super_sys_array_size(super_copy);
3058         if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
3059                 return -EFBIG;
3060
3061         ptr = super_copy->sys_chunk_array + array_size;
3062         btrfs_cpu_key_to_disk(&disk_key, key);
3063         memcpy(ptr, &disk_key, sizeof(disk_key));
3064         ptr += sizeof(disk_key);
3065         memcpy(ptr, chunk, item_size);
3066         item_size += sizeof(disk_key);
3067         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
3068         return 0;
3069 }
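
/*
 * After the append, sys_chunk_array holds a packed sequence of
 * (disk_key, chunk item) pairs, each chunk item carrying its stripes
 * inline:
 *
 *	| disk_key | chunk + stripes | disk_key | chunk + stripes | ...
 *
 * btrfs_read_sys_array() below walks exactly this layout at mount time.
 */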
3070
3071 /*
3072  * sort the devices in descending order by max_avail, total_avail
3073  */
3074 static int btrfs_cmp_device_info(const void *a, const void *b)
3075 {
3076         const struct btrfs_device_info *di_a = a;
3077         const struct btrfs_device_info *di_b = b;
3078
3079         if (di_a->max_avail > di_b->max_avail)
3080                 return -1;
3081         if (di_a->max_avail < di_b->max_avail)
3082                 return 1;
3083         if (di_a->total_avail > di_b->total_avail)
3084                 return -1;
3085         if (di_a->total_avail < di_b->total_avail)
3086                 return 1;
3087         return 0;
3088 }
3089
3090 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3091                                struct btrfs_root *extent_root,
3092                                struct map_lookup **map_ret,
3093                                u64 *num_bytes_out, u64 *stripe_size_out,
3094                                u64 start, u64 type)
3095 {
3096         struct btrfs_fs_info *info = extent_root->fs_info;
3097         struct btrfs_fs_devices *fs_devices = info->fs_devices;
3098         struct list_head *cur;
3099         struct map_lookup *map = NULL;
3100         struct extent_map_tree *em_tree;
3101         struct extent_map *em;
3102         struct btrfs_device_info *devices_info = NULL;
3103         u64 total_avail;
3104         int num_stripes;        /* total number of stripes to allocate */
3105         int sub_stripes;        /* sub_stripes info for map */
3106         int dev_stripes;        /* stripes per dev */
3107         int devs_max;           /* max devs to use */
3108         int devs_min;           /* min devs needed */
3109         int devs_increment;     /* ndevs has to be a multiple of this */
3110         int ncopies;            /* how many copies the data has */
3111         int ret;
3112         u64 max_stripe_size;
3113         u64 max_chunk_size;
3114         u64 stripe_size;
3115         u64 num_bytes;
3116         int ndevs;
3117         int i;
3118         int j;
3119
3120         if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
3121             (type & BTRFS_BLOCK_GROUP_DUP)) {
3122                 WARN_ON(1);
3123                 type &= ~BTRFS_BLOCK_GROUP_DUP;
3124         }
3125
3126         if (list_empty(&fs_devices->alloc_list))
3127                 return -ENOSPC;
3128
3129         sub_stripes = 1;
3130         dev_stripes = 1;
3131         devs_increment = 1;
3132         ncopies = 1;
3133         devs_max = 0;   /* 0 == as many as possible */
3134         devs_min = 1;
3135
3136         /*
3137          * define the properties of each RAID type.
3138          * FIXME: move this to a global table and use it in all RAID
3139          * calculation code (a sketch of such a table follows this chain)
3140          */
3141         if (type & (BTRFS_BLOCK_GROUP_DUP)) {
3142                 dev_stripes = 2;
3143                 ncopies = 2;
3144                 devs_max = 1;
3145         } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3146                 devs_min = 2;
3147         } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3148                 devs_increment = 2;
3149                 ncopies = 2;
3150                 devs_max = 2;
3151                 devs_min = 2;
3152         } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3153                 sub_stripes = 2;
3154                 devs_increment = 2;
3155                 ncopies = 2;
3156                 devs_min = 4;
3157         } else {
3158                 devs_max = 1;
3159         }
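
        /*
         * A sketch of the table the FIXME above asks for (the struct and
         * its name are illustrative, not an existing btrfs type):
         *
         *	struct btrfs_raid_attr {
         *		int dev_stripes, sub_stripes, devs_max, devs_min;
         *		int devs_increment, ncopies;
         *	};
         *	static const struct btrfs_raid_attr raid1 = {
         *		.dev_stripes = 1, .sub_stripes = 1, .devs_max = 2,
         *		.devs_min = 2, .devs_increment = 2, .ncopies = 2,
         *	};
         */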
3160
3161         if (type & BTRFS_BLOCK_GROUP_DATA) {
3162                 max_stripe_size = 1024 * 1024 * 1024;
3163                 max_chunk_size = 10 * max_stripe_size;
3164         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
3165                 /* for larger filesystems, use larger metadata chunks */
3166                 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3167                         max_stripe_size = 1024 * 1024 * 1024;
3168                 else
3169                         max_stripe_size = 256 * 1024 * 1024;
3170                 max_chunk_size = max_stripe_size;
3171         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
3172                 max_stripe_size = 8 * 1024 * 1024;
3173                 max_chunk_size = 2 * max_stripe_size;
3174         } else {
3175                 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
3176                        (unsigned long long)type);
3177                 BUG_ON(1);
3178         }
3179
3180         /* we don't want a chunk larger than 10% of writeable space */
3181         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
3182                              max_chunk_size);
3183
3184         devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
3185                                GFP_NOFS);
3186         if (!devices_info)
3187                 return -ENOMEM;
3188
3189         cur = fs_devices->alloc_list.next;
3190
3191         /*
3192          * in the first pass through the devices list, we gather information
3193          * about the available holes on each device.
3194          */
3195         ndevs = 0;
3196         while (cur != &fs_devices->alloc_list) {
3197                 struct btrfs_device *device;
3198                 u64 max_avail;
3199                 u64 dev_offset;
3200
3201                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
3202
3203                 cur = cur->next;
3204
3205                 if (!device->writeable) {
3206                         printk(KERN_ERR
3207                                "btrfs: read-only device in alloc_list\n");
3208                         WARN_ON(1);
3209                         continue;
3210                 }
3211
3212                 if (!device->in_fs_metadata)
3213                         continue;
3214
3215                 if (device->total_bytes > device->bytes_used)
3216                         total_avail = device->total_bytes - device->bytes_used;
3217                 else
3218                         total_avail = 0;
3219
3220                 /* If there is no space on this device, skip it. */
3221                 if (total_avail == 0)
3222                         continue;
3223
3224                 ret = find_free_dev_extent(trans, device,
3225                                            max_stripe_size * dev_stripes,
3226                                            &dev_offset, &max_avail);
3227                 if (ret && ret != -ENOSPC)
3228                         goto error;
3229
3230                 if (ret == 0)
3231                         max_avail = max_stripe_size * dev_stripes;
3232
3233                 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
3234                         continue;
3235
3236                 devices_info[ndevs].dev_offset = dev_offset;
3237                 devices_info[ndevs].max_avail = max_avail;
3238                 devices_info[ndevs].total_avail = total_avail;
3239                 devices_info[ndevs].dev = device;
3240                 ++ndevs;
3241         }
3242
3243         /*
3244          * now sort the devices by hole size / available space
3245          */
3246         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
3247              btrfs_cmp_device_info, NULL);
3248
3249         /* round down to number of usable stripes */
3250         ndevs -= ndevs % devs_increment;
3251
3252         if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
3253                 ret = -ENOSPC;
3254                 goto error;
3255         }
3256
3257         if (devs_max && ndevs > devs_max)
3258                 ndevs = devs_max;
3259         /*
3260          * the primary goal is to maximize the number of stripes, so use as many
3261          * devices as possible, even if the stripes are not maximum sized.
3262          */
3263         stripe_size = devices_info[ndevs-1].max_avail;
3264         num_stripes = ndevs * dev_stripes;
3265
3266         if (stripe_size * num_stripes > max_chunk_size * ncopies) {
3267                 stripe_size = max_chunk_size * ncopies;
3268                 do_div(stripe_size, num_stripes);
3269         }
3270
3271         do_div(stripe_size, dev_stripes);
3272         do_div(stripe_size, BTRFS_STRIPE_LEN);
3273         stripe_size *= BTRFS_STRIPE_LEN;
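
        /*
         * Worked example (a hypothetical small filesystem): a RAID1
         * metadata chunk with max_chunk_size = 256MiB, ncopies = 2 and
         * dev_stripes = 1 on two devices that each offer
         * max_avail = 10GiB.  stripe_size starts at 10GiB with
         * num_stripes = 2; since 20GiB > 256MiB * 2, stripe_size is
         * clamped to 512MiB / 2 = 256MiB, and the chunk will hold 256MiB
         * of metadata mirrored on both devices.
         */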
3274
3275         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3276         if (!map) {
3277                 ret = -ENOMEM;
3278                 goto error;
3279         }
3280         map->num_stripes = num_stripes;
3281
3282         for (i = 0; i < ndevs; ++i) {
3283                 for (j = 0; j < dev_stripes; ++j) {
3284                         int s = i * dev_stripes + j;
3285                         map->stripes[s].dev = devices_info[i].dev;
3286                         map->stripes[s].physical = devices_info[i].dev_offset +
3287                                                    j * stripe_size;
3288                 }
3289         }
3290         map->sector_size = extent_root->sectorsize;
3291         map->stripe_len = BTRFS_STRIPE_LEN;
3292         map->io_align = BTRFS_STRIPE_LEN;
3293         map->io_width = BTRFS_STRIPE_LEN;
3294         map->type = type;
3295         map->sub_stripes = sub_stripes;
3296
3297         *map_ret = map;
3298         num_bytes = stripe_size * (num_stripes / ncopies);
3299
3300         *stripe_size_out = stripe_size;
3301         *num_bytes_out = num_bytes;
3302
3303         trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3304
3305         em = alloc_extent_map();
3306         if (!em) {
3307                 ret = -ENOMEM;
3308                 goto error;
3309         }
3310         em->bdev = (struct block_device *)map;
3311         em->start = start;
3312         em->len = num_bytes;
3313         em->block_start = 0;
3314         em->block_len = em->len;
3315
3316         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3317         write_lock(&em_tree->lock);
3318         ret = add_extent_mapping(em_tree, em);
3319         write_unlock(&em_tree->lock);
3320         BUG_ON(ret);
3321         free_extent_map(em);
3322
3323         ret = btrfs_make_block_group(trans, extent_root, 0, type,
3324                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3325                                      start, num_bytes);
3326         BUG_ON(ret);
3327
3328         for (i = 0; i < map->num_stripes; ++i) {
3329                 struct btrfs_device *device;
3330                 u64 dev_offset;
3331
3332                 device = map->stripes[i].dev;
3333                 dev_offset = map->stripes[i].physical;
3334
3335                 ret = btrfs_alloc_dev_extent(trans, device,
3336                                 info->chunk_root->root_key.objectid,
3337                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3338                                 start, dev_offset, stripe_size);
3339                 BUG_ON(ret);
3340         }
3341
3342         kfree(devices_info);
3343         return 0;
3344
3345 error:
3346         kfree(map);
3347         kfree(devices_info);
3348         return ret;
3349 }
3350
3351 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
3352                                 struct btrfs_root *extent_root,
3353                                 struct map_lookup *map, u64 chunk_offset,
3354                                 u64 chunk_size, u64 stripe_size)
3355 {
3356         u64 dev_offset;
3357         struct btrfs_key key;
3358         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3359         struct btrfs_device *device;
3360         struct btrfs_chunk *chunk;
3361         struct btrfs_stripe *stripe;
3362         size_t item_size = btrfs_chunk_item_size(map->num_stripes);
3363         int index = 0;
3364         int ret;
3365
3366         chunk = kzalloc(item_size, GFP_NOFS);
3367         if (!chunk)
3368                 return -ENOMEM;
3369
3370         index = 0;
3371         while (index < map->num_stripes) {
3372                 device = map->stripes[index].dev;
3373                 device->bytes_used += stripe_size;
3374                 ret = btrfs_update_device(trans, device);
3375                 BUG_ON(ret);
3376                 index++;
3377         }
3378
3379         spin_lock(&extent_root->fs_info->free_chunk_lock);
3380         extent_root->fs_info->free_chunk_space -= (stripe_size *
3381                                                    map->num_stripes);
3382         spin_unlock(&extent_root->fs_info->free_chunk_lock);
3383
3384         index = 0;
3385         stripe = &chunk->stripe;
3386         while (index < map->num_stripes) {
3387                 device = map->stripes[index].dev;
3388                 dev_offset = map->stripes[index].physical;
3389
3390                 btrfs_set_stack_stripe_devid(stripe, device->devid);
3391                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
3392                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
3393                 stripe++;
3394                 index++;
3395         }
3396
3397         btrfs_set_stack_chunk_length(chunk, chunk_size);
3398         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
3399         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
3400         btrfs_set_stack_chunk_type(chunk, map->type);
3401         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
3402         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
3403         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
3404         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
3405         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
3406
3407         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3408         key.type = BTRFS_CHUNK_ITEM_KEY;
3409         key.offset = chunk_offset;
3410
3411         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
3412         BUG_ON(ret);
3413
3414         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3415                 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
3416                                              item_size);
3417                 BUG_ON(ret);
3418         }
3419
3420         kfree(chunk);
3421         return 0;
3422 }
3423
3424 /*
3425  * Chunk allocation falls into two parts. The first part does the work
3426  * that makes the newly allocated chunk usable, but does not perform any
3427  * operation that modifies the chunk tree. The second part does the work
3428  * that requires modifying the chunk tree. This division is important for
3429  * the bootstrap process of adding storage to a seed btrfs.
3430  */
3431 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3432                       struct btrfs_root *extent_root, u64 type)
3433 {
3434         u64 chunk_offset;
3435         u64 chunk_size;
3436         u64 stripe_size;
3437         struct map_lookup *map;
3438         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3439         int ret;
3440
3441         ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3442                               &chunk_offset);
3443         if (ret)
3444                 return ret;
3445
3446         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3447                                   &stripe_size, chunk_offset, type);
3448         if (ret)
3449                 return ret;
3450
3451         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3452                                    chunk_size, stripe_size);
3453         BUG_ON(ret);
3454         return 0;
3455 }
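
/*
 * A usage sketch, mirroring what the extent allocator does when a block
 * group type fills up (transaction setup is assumed, not shown):
 *
 *	ret = btrfs_alloc_chunk(trans, extent_root,
 *				BTRFS_BLOCK_GROUP_METADATA |
 *				BTRFS_BLOCK_GROUP_RAID1);
 */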
3456
3457 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3458                                          struct btrfs_root *root,
3459                                          struct btrfs_device *device)
3460 {
3461         u64 chunk_offset;
3462         u64 sys_chunk_offset;
3463         u64 chunk_size;
3464         u64 sys_chunk_size;
3465         u64 stripe_size;
3466         u64 sys_stripe_size;
3467         u64 alloc_profile;
3468         struct map_lookup *map;
3469         struct map_lookup *sys_map;
3470         struct btrfs_fs_info *fs_info = root->fs_info;
3471         struct btrfs_root *extent_root = fs_info->extent_root;
3472         int ret;
3473
3474         ret = find_next_chunk(fs_info->chunk_root,
3475                               BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
3476         if (ret)
3477                 return ret;
3478
3479         alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
3480                                 fs_info->avail_metadata_alloc_bits;
3481         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3482
3483         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3484                                   &stripe_size, chunk_offset, alloc_profile);
3485         BUG_ON(ret);
3486
3487         sys_chunk_offset = chunk_offset + chunk_size;
3488
3489         alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
3490                                 fs_info->avail_system_alloc_bits;
3491         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3492
3493         ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3494                                   &sys_chunk_size, &sys_stripe_size,
3495                                   sys_chunk_offset, alloc_profile);
3496         BUG_ON(ret);
3497
3498         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
3499         BUG_ON(ret);
3500
3501         /*
3502          * Modifying the chunk tree requires allocating new blocks from
3503          * both the system block group and the metadata block group, so we
3504          * can only perform operations that modify the chunk tree after
3505          * both block groups have been created.
3506          */
3507         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3508                                    chunk_size, stripe_size);
3509         BUG_ON(ret);
3510
3511         ret = __finish_chunk_alloc(trans, extent_root, sys_map,
3512                                    sys_chunk_offset, sys_chunk_size,
3513                                    sys_stripe_size);
3514         BUG_ON(ret);
3515         return 0;
3516 }
3517
3518 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
3519 {
3520         struct extent_map *em;
3521         struct map_lookup *map;
3522         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
3523         int readonly = 0;
3524         int i;
3525
3526         read_lock(&map_tree->map_tree.lock);
3527         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3528         read_unlock(&map_tree->map_tree.lock);
3529         if (!em)
3530                 return 1;
3531
3532         if (btrfs_test_opt(root, DEGRADED)) {
3533                 free_extent_map(em);
3534                 return 0;
3535         }
3536
3537         map = (struct map_lookup *)em->bdev;
3538         for (i = 0; i < map->num_stripes; i++) {
3539                 if (!map->stripes[i].dev->writeable) {
3540                         readonly = 1;
3541                         break;
3542                 }
3543         }
3544         free_extent_map(em);
3545         return readonly;
3546 }
3547
3548 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
3549 {
3550         extent_map_tree_init(&tree->map_tree);
3551 }
3552
3553 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
3554 {
3555         struct extent_map *em;
3556
3557         while (1) {
3558                 write_lock(&tree->map_tree.lock);
3559                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
3560                 if (em)
3561                         remove_extent_mapping(&tree->map_tree, em);
3562                 write_unlock(&tree->map_tree.lock);
3563                 if (!em)
3564                         break;
3565                 kfree(em->bdev);
3566                 /* once for us */
3567                 free_extent_map(em);
3568                 /* once for the tree */
3569                 free_extent_map(em);
3570         }
3571 }
3572
3573 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
3574 {
3575         struct extent_map *em;
3576         struct map_lookup *map;
3577         struct extent_map_tree *em_tree = &map_tree->map_tree;
3578         int ret;
3579
3580         read_lock(&em_tree->lock);
3581         em = lookup_extent_mapping(em_tree, logical, len);
3582         read_unlock(&em_tree->lock);
3583         BUG_ON(!em);
3584
3585         BUG_ON(em->start > logical || em->start + em->len < logical);
3586         map = (struct map_lookup *)em->bdev;
3587         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
3588                 ret = map->num_stripes;
3589         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3590                 ret = map->sub_stripes;
3591         else
3592                 ret = 1;
3593         free_extent_map(em);
3594         return ret;
3595 }
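
/*
 * For example, a two-device RAID1 chunk has num_stripes == 2, so
 * btrfs_num_copies() reports 2; a RAID10 chunk always reports
 * sub_stripes (2), since each stripe is mirrored across one pair of
 * devices regardless of how many devices the chunk spans.
 */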
3596
3597 static int find_live_mirror(struct map_lookup *map, int first, int num,
3598                             int optimal)
3599 {
3600         int i;
3601         if (map->stripes[optimal].dev->bdev)
3602                 return optimal;
3603         for (i = first; i < first + num; i++) {
3604                 if (map->stripes[i].dev->bdev)
3605                         return i;
3606         }
3607         /* we couldn't find a live mirror.  Just return something and let
3608          * the io error handling code clean up eventually
3609          */
3610         return optimal;
3611 }
3612
3613 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3614                              u64 logical, u64 *length,
3615                              struct btrfs_bio **bbio_ret,
3616                              int mirror_num)
3617 {
3618         struct extent_map *em;
3619         struct map_lookup *map;
3620         struct extent_map_tree *em_tree = &map_tree->map_tree;
3621         u64 offset;
3622         u64 stripe_offset;
3623         u64 stripe_end_offset;
3624         u64 stripe_nr;
3625         u64 stripe_nr_orig;
3626         u64 stripe_nr_end;
3627         int stripes_allocated = 8;
3628         int stripes_required = 1;
3629         int stripe_index;
3630         int i;
3631         int num_stripes;
3632         int max_errors = 0;
3633         struct btrfs_bio *bbio = NULL;
3634
3635         if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
3636                 stripes_allocated = 1;
3637 again:
3638         if (bbio_ret) {
3639                 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
3640                                 GFP_NOFS);
3641                 if (!bbio)
3642                         return -ENOMEM;
3643
3644                 atomic_set(&bbio->error, 0);
3645         }
3646
3647         read_lock(&em_tree->lock);
3648         em = lookup_extent_mapping(em_tree, logical, *length);
3649         read_unlock(&em_tree->lock);
3650
3651         if (!em) {
3652                 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
3653                        (unsigned long long)logical,
3654                        (unsigned long long)*length);
3655                 BUG();
3656         }
3657
3658         BUG_ON(em->start > logical || em->start + em->len < logical);
3659         map = (struct map_lookup *)em->bdev;
3660         offset = logical - em->start;
3661
3662         if (mirror_num > map->num_stripes)
3663                 mirror_num = 0;
3664
3665         /* if our btrfs_bio struct is too small, back off and try again */
3666         if (rw & REQ_WRITE) {
3667                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
3668                                  BTRFS_BLOCK_GROUP_DUP)) {
3669                         stripes_required = map->num_stripes;
3670                         max_errors = 1;
3671                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3672                         stripes_required = map->sub_stripes;
3673                         max_errors = 1;
3674                 }
3675         }
3676         if (rw & REQ_DISCARD) {
3677                 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK)
3678                         stripes_required = map->num_stripes;
3679         }
3680         if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
3681             stripes_allocated < stripes_required) {
3682                 stripes_allocated = map->num_stripes;
3683                 free_extent_map(em);
3684                 kfree(bbio);
3685                 goto again;
3686         }
3687         stripe_nr = offset;
3688         /*
3689          * stripe_nr counts the total number of stripes we have to stride
3690          * to get to this block
3691          */
3692         do_div(stripe_nr, map->stripe_len);
3693
3694         stripe_offset = stripe_nr * map->stripe_len;
3695         BUG_ON(offset < stripe_offset);
3696
3697         /* stripe_offset is the offset of this block in its stripe */
3698         stripe_offset = offset - stripe_offset;
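
        /*
         * Worked example with stripe_len = 64KiB: offset = 200KiB gives
         * stripe_nr = 3 (three whole stripes strided over) and
         * stripe_offset = 200KiB - 192KiB = 8KiB into the fourth stripe.
         */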
3699
3700         if (rw & REQ_DISCARD)
3701                 *length = min_t(u64, em->len - offset, *length);
3702         else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
3703                 /* we limit the length of each bio to what fits in a stripe */
3704                 *length = min_t(u64, em->len - offset,
3705                                 map->stripe_len - stripe_offset);
3706         } else {
3707                 *length = em->len - offset;
3708         }
3709
3710         if (!bbio_ret)
3711                 goto out;
3712
3713         num_stripes = 1;
3714         stripe_index = 0;
3715         stripe_nr_orig = stripe_nr;
3716         stripe_nr_end = (offset + *length + map->stripe_len - 1) &
3717                         (~(map->stripe_len - 1));
3718         do_div(stripe_nr_end, map->stripe_len);
3719         stripe_end_offset = stripe_nr_end * map->stripe_len -
3720                             (offset + *length);
3721         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3722                 if (rw & REQ_DISCARD)
3723                         num_stripes = min_t(u64, map->num_stripes,
3724                                             stripe_nr_end - stripe_nr_orig);
3725                 stripe_index = do_div(stripe_nr, map->num_stripes);
3726         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3727                 if (rw & (REQ_WRITE | REQ_DISCARD))
3728                         num_stripes = map->num_stripes;
3729                 else if (mirror_num)
3730                         stripe_index = mirror_num - 1;
3731                 else {
3732                         stripe_index = find_live_mirror(map, 0,
3733                                             map->num_stripes,
3734                                             current->pid % map->num_stripes);
3735                         mirror_num = stripe_index + 1;
3736                 }
3737
3738         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3739                 if (rw & (REQ_WRITE | REQ_DISCARD)) {
3740                         num_stripes = map->num_stripes;
3741                 } else if (mirror_num) {
3742                         stripe_index = mirror_num - 1;
3743                 } else {
3744                         mirror_num = 1;
3745                 }
3746
3747         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3748                 int factor = map->num_stripes / map->sub_stripes;
3749
3750                 stripe_index = do_div(stripe_nr, factor);
3751                 stripe_index *= map->sub_stripes;
3752
3753                 if (rw & REQ_WRITE)
3754                         num_stripes = map->sub_stripes;
3755                 else if (rw & REQ_DISCARD)
3756                         num_stripes = min_t(u64, map->sub_stripes *
3757                                             (stripe_nr_end - stripe_nr_orig),
3758                                             map->num_stripes);
3759                 else if (mirror_num)
3760                         stripe_index += mirror_num - 1;
3761                 else {
3762                         stripe_index = find_live_mirror(map, stripe_index,
3763                                               map->sub_stripes, stripe_index +
3764                                               current->pid % map->sub_stripes);
3765                         mirror_num = stripe_index + 1;
3766                 }
3767         } else {
3768                 /*
3769                  * after this do_div call, stripe_nr is the number of stripes
3770                  * on this device we have to walk to find the data, and
3771                  * stripe_index is the number of our device in the stripe array
3772                  */
3773                 stripe_index = do_div(stripe_nr, map->num_stripes);
3774                 mirror_num = stripe_index + 1;
3775         }
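
        /*
         * RAID10 example: with num_stripes = 4 and sub_stripes = 2,
         * factor = 2.  For stripe_nr = 5 the do_div() above leaves
         * stripe_nr = 2 with remainder 1, so stripe_index = 1 * 2 = 2:
         * the block lives at stripe 2 of the second mirror pair
         * (stripes 2 and 3).
         */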
3776         BUG_ON(stripe_index >= map->num_stripes);
3777
3778         if (rw & REQ_DISCARD) {
3779                 for (i = 0; i < num_stripes; i++) {
3780                         bbio->stripes[i].physical =
3781                                 map->stripes[stripe_index].physical +
3782                                 stripe_offset + stripe_nr * map->stripe_len;
3783                         bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3784
3785                         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3786                                 u64 stripes;
3787                                 u32 last_stripe = 0;
3788                                 int j;
3789
3790                                 div_u64_rem(stripe_nr_end - 1,
3791                                             map->num_stripes,
3792                                             &last_stripe);
3793
3794                                 for (j = 0; j < map->num_stripes; j++) {
3795                                         u32 test;
3796
3797                                         div_u64_rem(stripe_nr_end - 1 - j,
3798                                                     map->num_stripes, &test);
3799                                         if (test == stripe_index)
3800                                                 break;
3801                                 }
3802                                 stripes = stripe_nr_end - 1 - j;
3803                                 do_div(stripes, map->num_stripes);
3804                                 bbio->stripes[i].length = map->stripe_len *
3805                                         (stripes - stripe_nr + 1);
3806
3807                                 if (i == 0) {
3808                                         bbio->stripes[i].length -=
3809                                                 stripe_offset;
3810                                         stripe_offset = 0;
3811                                 }
3812                                 if (stripe_index == last_stripe)
3813                                         bbio->stripes[i].length -=
3814                                                 stripe_end_offset;
3815                         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3816                                 u64 stripes;
3817                                 int j;
3818                                 int factor = map->num_stripes /
3819                                              map->sub_stripes;
3820                                 u32 last_stripe = 0;
3821
3822                                 div_u64_rem(stripe_nr_end - 1,
3823                                             factor, &last_stripe);
3824                                 last_stripe *= map->sub_stripes;
3825
3826                                 for (j = 0; j < factor; j++) {
3827                                         u32 test;
3828
3829                                         div_u64_rem(stripe_nr_end - 1 - j,
3830                                                     factor, &test);
3831
3832                                         if (test ==
3833                                             stripe_index / map->sub_stripes)
3834                                                 break;
3835                                 }
3836                                 stripes = stripe_nr_end - 1 - j;
3837                                 do_div(stripes, factor);
3838                                 bbio->stripes[i].length = map->stripe_len *
3839                                         (stripes - stripe_nr + 1);
3840
3841                                 if (i < map->sub_stripes) {
3842                                         bbio->stripes[i].length -=
3843                                                 stripe_offset;
3844                                         if (i == map->sub_stripes - 1)
3845                                                 stripe_offset = 0;
3846                                 }
3847                                 if (stripe_index >= last_stripe &&
3848                                     stripe_index <= (last_stripe +
3849                                                      map->sub_stripes - 1)) {
3850                                         bbio->stripes[i].length -=
3851                                                 stripe_end_offset;
3852                                 }
3853                         } else
3854                                 bbio->stripes[i].length = *length;
3855
3856                         stripe_index++;
3857                         if (stripe_index == map->num_stripes) {
3858                                 /* This could only happen for RAID0/10 */
3859                                 stripe_index = 0;
3860                                 stripe_nr++;
3861                         }
3862                 }
3863         } else {
3864                 for (i = 0; i < num_stripes; i++) {
3865                         bbio->stripes[i].physical =
3866                                 map->stripes[stripe_index].physical +
3867                                 stripe_offset +
3868                                 stripe_nr * map->stripe_len;
3869                         bbio->stripes[i].dev =
3870                                 map->stripes[stripe_index].dev;
3871                         stripe_index++;
3872                 }
3873         }
3874         if (bbio_ret) {
3875                 *bbio_ret = bbio;
3876                 bbio->num_stripes = num_stripes;
3877                 bbio->max_errors = max_errors;
3878                 bbio->mirror_num = mirror_num;
3879         }
3880 out:
3881         free_extent_map(em);
3882         return 0;
3883 }
3884
3885 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3886                       u64 logical, u64 *length,
3887                       struct btrfs_bio **bbio_ret, int mirror_num)
3888 {
3889         return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3890                                  mirror_num);
3891 }
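
/*
 * A read-side lookup sketch (hypothetical caller; error handling
 * elided).  The caller owns the returned btrfs_bio and must kfree() it:
 *
 *	struct btrfs_bio *bbio = NULL;
 *	u64 length = 4096;
 *
 *	btrfs_map_block(map_tree, READ, logical, &length, &bbio, 0);
 *	physical = bbio->stripes[0].physical;
 *	dev = bbio->stripes[0].dev;
 *	kfree(bbio);
 */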
3892
3893 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3894                      u64 chunk_start, u64 physical, u64 devid,
3895                      u64 **logical, int *naddrs, int *stripe_len)
3896 {
3897         struct extent_map_tree *em_tree = &map_tree->map_tree;
3898         struct extent_map *em;
3899         struct map_lookup *map;
3900         u64 *buf;
3901         u64 bytenr;
3902         u64 length;
3903         u64 stripe_nr;
3904         int i, j, nr = 0;
3905
3906         read_lock(&em_tree->lock);
3907         em = lookup_extent_mapping(em_tree, chunk_start, 1);
3908         read_unlock(&em_tree->lock);
3909
3910         BUG_ON(!em || em->start != chunk_start);
3911         map = (struct map_lookup *)em->bdev;
3912
3913         length = em->len;
3914         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3915                 do_div(length, map->num_stripes / map->sub_stripes);
3916         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3917                 do_div(length, map->num_stripes);
3918
3919         buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
3920         BUG_ON(!buf);
3921
3922         for (i = 0; i < map->num_stripes; i++) {
3923                 if (devid && map->stripes[i].dev->devid != devid)
3924                         continue;
3925                 if (map->stripes[i].physical > physical ||
3926                     map->stripes[i].physical + length <= physical)
3927                         continue;
3928
3929                 stripe_nr = physical - map->stripes[i].physical;
3930                 do_div(stripe_nr, map->stripe_len);
3931
3932                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3933                         stripe_nr = stripe_nr * map->num_stripes + i;
3934                         do_div(stripe_nr, map->sub_stripes);
3935                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3936                         stripe_nr = stripe_nr * map->num_stripes + i;
3937                 }
3938                 bytenr = chunk_start + stripe_nr * map->stripe_len;
3939                 WARN_ON(nr >= map->num_stripes);
3940                 for (j = 0; j < nr; j++) {
3941                         if (buf[j] == bytenr)
3942                                 break;
3943                 }
3944                 if (j == nr) {
3945                         WARN_ON(nr >= map->num_stripes);
3946                         buf[nr++] = bytenr;
3947                 }
3948         }
3949
3950         *logical = buf;
3951         *naddrs = nr;
3952         *stripe_len = map->stripe_len;
3953
3954         free_extent_map(em);
3955         return 0;
3956 }
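
/*
 * Worked example: a two-device RAID0 chunk with stripe_len = 64KiB.
 * A physical address one stripe (64KiB) into device 1's extent gives
 * stripe_nr = 1, then stripe_nr * num_stripes + i = 1 * 2 + 1 = 3, so
 * the block's logical address is chunk_start + 3 * 64KiB.
 */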
3957
3958 static void btrfs_end_bio(struct bio *bio, int err)
3959 {
3960         struct btrfs_bio *bbio = bio->bi_private;
3961         int is_orig_bio = 0;
3962
3963         if (err)
3964                 atomic_inc(&bbio->error);
3965
3966         if (bio == bbio->orig_bio)
3967                 is_orig_bio = 1;
3968
3969         if (atomic_dec_and_test(&bbio->stripes_pending)) {
3970                 if (!is_orig_bio) {
3971                         bio_put(bio);
3972                         bio = bbio->orig_bio;
3973                 }
3974                 bio->bi_private = bbio->private;
3975                 bio->bi_end_io = bbio->end_io;
3976                 bio->bi_bdev = (struct block_device *)
3977                                         (unsigned long)bbio->mirror_num;
3978                 /* only send an error to the higher layers if it is
3979                  * beyond the tolerance of the multi-bio
3980                  */
3981                 if (atomic_read(&bbio->error) > bbio->max_errors) {
3982                         err = -EIO;
3983                 } else {
3984                         /*
3985                          * this bio is actually up to date, we didn't
3986                          * go over the max number of errors
3987                          */
3988                         set_bit(BIO_UPTODATE, &bio->bi_flags);
3989                         err = 0;
3990                 }
3991                 kfree(bbio);
3992
3993                 bio_endio(bio, err);
3994         } else if (!is_orig_bio) {
3995                 bio_put(bio);
3996         }
3997 }
3998
3999 struct async_sched {
4000         struct bio *bio;
4001         int rw;
4002         struct btrfs_fs_info *info;
4003         struct btrfs_work work;
4004 };
4005
4006 /*
4007  * see run_scheduled_bios for a description of why bios are collected for
4008  * async submit.
4009  *
4010  * This will add one bio to the pending list for a device and make sure
4011  * the work struct is scheduled.
4012  */
4013 static noinline int schedule_bio(struct btrfs_root *root,
4014                                  struct btrfs_device *device,
4015                                  int rw, struct bio *bio)
4016 {
4017         int should_queue = 1;
4018         struct btrfs_pending_bios *pending_bios;
4019
4020         /* don't bother with additional async steps for reads, right now */
4021         if (!(rw & REQ_WRITE)) {
4022                 bio_get(bio);
4023                 submit_bio(rw, bio);
4024                 bio_put(bio);
4025                 return 0;
4026         }
4027
4028         /*
4029          * nr_async_bios allows us to reliably return congestion to the
4030          * higher layers.  Otherwise, the async bio makes it appear we have
4031          * made progress against dirty pages when we've really just put it
4032          * on a queue for later
4033          */
4034         atomic_inc(&root->fs_info->nr_async_bios);
4035         WARN_ON(bio->bi_next);
4036         bio->bi_next = NULL;
4037         bio->bi_rw |= rw;
4038
4039         spin_lock(&device->io_lock);
4040         if (bio->bi_rw & REQ_SYNC)
4041                 pending_bios = &device->pending_sync_bios;
4042         else
4043                 pending_bios = &device->pending_bios;
4044
4045         if (pending_bios->tail)
4046                 pending_bios->tail->bi_next = bio;
4047
4048         pending_bios->tail = bio;
4049         if (!pending_bios->head)
4050                 pending_bios->head = bio;
4051         if (device->running_pending)
4052                 should_queue = 0;
4053
4054         spin_unlock(&device->io_lock);
4055
4056         if (should_queue)
4057                 btrfs_queue_worker(&root->fs_info->submit_workers,
4058                                    &device->work);
4059         return 0;
4060 }
4061
4062 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4063                   int mirror_num, int async_submit)
4064 {
4065         struct btrfs_mapping_tree *map_tree;
4066         struct btrfs_device *dev;
4067         struct bio *first_bio = bio;
4068         u64 logical = (u64)bio->bi_sector << 9;
4069         u64 length = 0;
4070         u64 map_length;
4071         int ret;
4072         int dev_nr = 0;
4073         int total_devs = 1;
4074         struct btrfs_bio *bbio = NULL;
4075
4076         length = bio->bi_size;
4077         map_tree = &root->fs_info->mapping_tree;
4078         map_length = length;
4079
4080         ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
4081                               mirror_num);
4082         BUG_ON(ret);
4083
4084         total_devs = bbio->num_stripes;
4085         if (map_length < length) {
4086                 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
4087                        "len %llu\n", (unsigned long long)logical,
4088                        (unsigned long long)length,
4089                        (unsigned long long)map_length);
4090                 BUG();
4091         }
4092
4093         bbio->orig_bio = first_bio;
4094         bbio->private = first_bio->bi_private;
4095         bbio->end_io = first_bio->bi_end_io;
4096         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4097
4098         while (dev_nr < total_devs) {
4099                 if (dev_nr < total_devs - 1) {
4100                         bio = bio_clone(first_bio, GFP_NOFS);
4101                         BUG_ON(!bio);
4102                 } else {
4103                         bio = first_bio;
4104                 }
4105                 bio->bi_private = bbio;
4106                 bio->bi_end_io = btrfs_end_bio;
4107                 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4108                 dev = bbio->stripes[dev_nr].dev;
4109                 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4110                         pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4111                                  "(%s id %llu), size=%u\n", rw,
4112                                  (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4113                                  dev->name, dev->devid, bio->bi_size);
4114                         bio->bi_bdev = dev->bdev;
4115                         if (async_submit)
4116                                 schedule_bio(root, dev, rw, bio);
4117                         else
4118                                 submit_bio(rw, bio);
4119                 } else {
4120                         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4121                         bio->bi_sector = logical >> 9;
4122                         bio_endio(bio, -EIO);
4123                 }
4124                 dev_nr++;
4125         }
4126         return 0;
4127 }
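
/*
 * For a RAID1 write, for example, total_devs is 2: the loop above clones
 * first_bio for stripe 0 and reuses first_bio itself for the final
 * stripe, so every copy shares the completion accounting in bbio and
 * btrfs_end_bio() ends the original bio once all stripes complete.
 */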
4128
4129 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
4130                                        u8 *uuid, u8 *fsid)
4131 {
4132         struct btrfs_device *device;
4133         struct btrfs_fs_devices *cur_devices;
4134
4135         cur_devices = root->fs_info->fs_devices;
4136         while (cur_devices) {
4137                 if (!fsid ||
4138                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
4139                         device = __find_device(&cur_devices->devices,
4140                                                devid, uuid);
4141                         if (device)
4142                                 return device;
4143                 }
4144                 cur_devices = cur_devices->seed;
4145         }
4146         return NULL;
4147 }
4148
4149 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
4150                                             u64 devid, u8 *dev_uuid)
4151 {
4152         struct btrfs_device *device;
4153         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
4154
4155         device = kzalloc(sizeof(*device), GFP_NOFS);
4156         if (!device)
4157                 return NULL;
4158         list_add(&device->dev_list,
4159                  &fs_devices->devices);
4160         device->dev_root = root->fs_info->dev_root;
4161         device->devid = devid;
4162         device->work.func = pending_bios_fn;
4163         device->fs_devices = fs_devices;
4164         device->missing = 1;
4165         fs_devices->num_devices++;
4166         fs_devices->missing_devices++;
4167         spin_lock_init(&device->io_lock);
4168         INIT_LIST_HEAD(&device->dev_alloc_list);
4169         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
4170         return device;
4171 }
4172
4173 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4174                           struct extent_buffer *leaf,
4175                           struct btrfs_chunk *chunk)
4176 {
4177         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4178         struct map_lookup *map;
4179         struct extent_map *em;
4180         u64 logical;
4181         u64 length;
4182         u64 devid;
4183         u8 uuid[BTRFS_UUID_SIZE];
4184         int num_stripes;
4185         int ret;
4186         int i;
4187
4188         logical = key->offset;
4189         length = btrfs_chunk_length(leaf, chunk);
4190
4191         read_lock(&map_tree->map_tree.lock);
4192         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
4193         read_unlock(&map_tree->map_tree.lock);
4194
4195         /* already mapped? */
4196         if (em && em->start <= logical && em->start + em->len > logical) {
4197                 free_extent_map(em);
4198                 return 0;
4199         } else if (em) {
4200                 free_extent_map(em);
4201         }
4202
4203         em = alloc_extent_map();
4204         if (!em)
4205                 return -ENOMEM;
4206         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
4207         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4208         if (!map) {
4209                 free_extent_map(em);
4210                 return -ENOMEM;
4211         }
4212
4213         em->bdev = (struct block_device *)map;
4214         em->start = logical;
4215         em->len = length;
4216         em->block_start = 0;
4217         em->block_len = em->len;
4218
4219         map->num_stripes = num_stripes;
4220         map->io_width = btrfs_chunk_io_width(leaf, chunk);
4221         map->io_align = btrfs_chunk_io_align(leaf, chunk);
4222         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
4223         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
4224         map->type = btrfs_chunk_type(leaf, chunk);
4225         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
4226         for (i = 0; i < num_stripes; i++) {
4227                 map->stripes[i].physical =
4228                         btrfs_stripe_offset_nr(leaf, chunk, i);
4229                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
4230                 read_extent_buffer(leaf, uuid, (unsigned long)
4231                                    btrfs_stripe_dev_uuid_nr(chunk, i),
4232                                    BTRFS_UUID_SIZE);
4233                 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
4234                                                         NULL);
4235                 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
4236                         kfree(map);
4237                         free_extent_map(em);
4238                         return -EIO;
4239                 }
4240                 if (!map->stripes[i].dev) {
4241                         map->stripes[i].dev =
4242                                 add_missing_dev(root, devid, uuid);
4243                         if (!map->stripes[i].dev) {
4244                                 kfree(map);
4245                                 free_extent_map(em);
4246                                 return -EIO;
4247                         }
4248                 }
4249                 map->stripes[i].dev->in_fs_metadata = 1;
4250         }
4251
4252         write_lock(&map_tree->map_tree.lock);
4253         ret = add_extent_mapping(&map_tree->map_tree, em);
4254         write_unlock(&map_tree->map_tree.lock);
4255         BUG_ON(ret);
4256         free_extent_map(em);
4257
4258         return 0;
4259 }
4260
4261 static int fill_device_from_item(struct extent_buffer *leaf,
4262                                  struct btrfs_dev_item *dev_item,
4263                                  struct btrfs_device *device)
4264 {
4265         unsigned long ptr;
4266
4267         device->devid = btrfs_device_id(leaf, dev_item);
4268         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
4269         device->total_bytes = device->disk_total_bytes;
4270         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
4271         device->type = btrfs_device_type(leaf, dev_item);
4272         device->io_align = btrfs_device_io_align(leaf, dev_item);
4273         device->io_width = btrfs_device_io_width(leaf, dev_item);
4274         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
4275
4276         ptr = (unsigned long)btrfs_device_uuid(dev_item);
4277         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
4278
4279         return 0;
4280 }
4281
4282 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
4283 {
4284         struct btrfs_fs_devices *fs_devices;
4285         int ret;
4286
4287         mutex_lock(&uuid_mutex);
4288
4289         fs_devices = root->fs_info->fs_devices->seed;
4290         while (fs_devices) {
4291                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
4292                         ret = 0;
4293                         goto out;
4294                 }
4295                 fs_devices = fs_devices->seed;
4296         }
4297
4298         fs_devices = find_fsid(fsid);
4299         if (!fs_devices) {
4300                 ret = -ENOENT;
4301                 goto out;
4302         }
4303
4304         fs_devices = clone_fs_devices(fs_devices);
4305         if (IS_ERR(fs_devices)) {
4306                 ret = PTR_ERR(fs_devices);
4307                 goto out;
4308         }
4309
4310         ret = __btrfs_open_devices(fs_devices, FMODE_READ,
4311                                    root->fs_info->bdev_holder);
4312         if (ret)
4313                 goto out;
4314
4315         if (!fs_devices->seeding) {
4316                 __btrfs_close_devices(fs_devices);
4317                 free_fs_devices(fs_devices);
4318                 ret = -EINVAL;
4319                 goto out;
4320         }
4321
4322         fs_devices->seed = root->fs_info->fs_devices->seed;
4323         root->fs_info->fs_devices->seed = fs_devices;
4324 out:
4325         mutex_unlock(&uuid_mutex);
4326         return ret;
4327 }
4328
4329 static int read_one_dev(struct btrfs_root *root,
4330                         struct extent_buffer *leaf,
4331                         struct btrfs_dev_item *dev_item)
4332 {
4333         struct btrfs_device *device;
4334         u64 devid;
4335         int ret;
4336         u8 fs_uuid[BTRFS_UUID_SIZE];
4337         u8 dev_uuid[BTRFS_UUID_SIZE];
4338
4339         devid = btrfs_device_id(leaf, dev_item);
4340         read_extent_buffer(leaf, dev_uuid,
4341                            (unsigned long)btrfs_device_uuid(dev_item),
4342                            BTRFS_UUID_SIZE);
4343         read_extent_buffer(leaf, fs_uuid,
4344                            (unsigned long)btrfs_device_fsid(dev_item),
4345                            BTRFS_UUID_SIZE);
4346
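             /*
              * A dev item whose fsid differs from ours describes a device
              * that belongs to a seed filesystem; make sure that seed's
              * devices are opened before going any further.
              */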
4347         if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
4348                 ret = open_seed_devices(root, fs_uuid);
4349                 if (ret && !btrfs_test_opt(root, DEGRADED))
4350                         return ret;
4351         }
4352
4353         device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
4354         if (!device || !device->bdev) {
4355                 if (!btrfs_test_opt(root, DEGRADED))
4356                         return -EIO;
4357
4358                 if (!device) {
4359                         printk(KERN_WARNING "warning: devid %llu missing\n",
4360                                (unsigned long long)devid);
4361                         device = add_missing_dev(root, devid, dev_uuid);
4362                         if (!device)
4363                                 return -ENOMEM;
4364                 } else if (!device->missing) {
4365                         /*
4366                          * This happens when a device that was properly set up
4367                          * in the device info lists suddenly goes bad:
4368                          * device->bdev is NULL, so we have to set
4369                          * device->missing to 1 here.
4370                          */
4371                         root->fs_info->fs_devices->missing_devices++;
4372                         device->missing = 1;
4373                 }
4374         }
4375
4376         if (device->fs_devices != root->fs_info->fs_devices) {
4377                 BUG_ON(device->writeable);
4378                 if (device->generation !=
4379                     btrfs_device_generation(leaf, dev_item))
4380                         return -EINVAL;
4381         }
4382
4383         fill_device_from_item(leaf, dev_item, device);
4384         device->dev_root = root->fs_info->dev_root;
4385         device->in_fs_metadata = 1;
4386         if (device->writeable) {
4387                 device->fs_devices->total_rw_bytes += device->total_bytes;
4388                 spin_lock(&root->fs_info->free_chunk_lock);
4389                 root->fs_info->free_chunk_space += device->total_bytes -
4390                         device->bytes_used;
4391                 spin_unlock(&root->fs_info->free_chunk_lock);
4392         }
4393         return 0;
4395 }
4396
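     /*
      * Parse the chunks stored in the superblock's sys_chunk_array.  These
      * are the system chunks that must be mapped before the chunk tree
      * itself can be read.
      */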
4397 int btrfs_read_sys_array(struct btrfs_root *root)
4398 {
4399         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
4400         struct extent_buffer *sb;
4401         struct btrfs_disk_key *disk_key;
4402         struct btrfs_chunk *chunk;
4403         u8 *ptr;
4404         unsigned long sb_ptr;
4405         int ret = 0;
4406         u32 num_stripes;
4407         u32 array_size;
4408         u32 len = 0;
4409         u32 cur;
4410         struct btrfs_key key;
4411
4412         sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
4413                                           BTRFS_SUPER_INFO_SIZE);
4414         if (!sb)
4415                 return -ENOMEM;
4416         btrfs_set_buffer_uptodate(sb);
4417         btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
4418
4419         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
4420         array_size = btrfs_super_sys_array_size(super_copy);
4421
4422         ptr = super_copy->sys_chunk_array;
4423         sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
4424         cur = 0;
4425
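             /*
              * Walk the array entry by entry: ptr is our position in the
              * superblock copy, sb_ptr the matching offset inside the
              * extent buffer and cur the number of bytes consumed so far.
              */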
4426         while (cur < array_size) {
4427                 disk_key = (struct btrfs_disk_key *)ptr;
4428                 btrfs_disk_key_to_cpu(&key, disk_key);
4429
4430                 len = sizeof(*disk_key);
                     ptr += len;
4431                 sb_ptr += len;
4432                 cur += len;
4433
4434                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
4435                         chunk = (struct btrfs_chunk *)sb_ptr;
4436                         ret = read_one_chunk(root, &key, sb, chunk);
4437                         if (ret)
4438                                 break;
4439                         num_stripes = btrfs_chunk_num_stripes(sb, chunk);
4440                         len = btrfs_chunk_item_size(num_stripes);
4441                 } else {
4442                         ret = -EIO;
4443                         break;
4444                 }
4445                 ptr += len;
4446                 sb_ptr += len;
4447                 cur += len;
4448         }
4449         free_extent_buffer(sb);
4450         return ret;
4451 }
4452
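     /*
      * Read every device item and chunk item out of the chunk tree and
      * build the in-memory device list and chunk mappings from them.
      */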
4453 int btrfs_read_chunk_tree(struct btrfs_root *root)
4454 {
4455         struct btrfs_path *path;
4456         struct extent_buffer *leaf;
4457         struct btrfs_key key;
4458         struct btrfs_key found_key;
4459         int ret;
4460         int slot;
4461
4462         root = root->fs_info->chunk_root;
4463
4464         path = btrfs_alloc_path();
4465         if (!path)
4466                 return -ENOMEM;
4467
4468         /* First we search for all of the device items and then we
4469          * read in all of the chunk items.  This way we can create chunk
4470          * mappings that reference all of the devices that are found.
4471          */
4472         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
4473         key.offset = 0;
4474         key.type = 0;
4475 again:
4476         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4477         if (ret < 0)
4478                 goto error;
4479         while (1) {
4480                 leaf = path->nodes[0];
4481                 slot = path->slots[0];
4482                 if (slot >= btrfs_header_nritems(leaf)) {
4483                         ret = btrfs_next_leaf(root, path);
4484                         if (ret == 0)
4485                                 continue;
4486                         if (ret < 0)
4487                                 goto error;
4488                         break;
4489                 }
4490                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4491                 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4492                         if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
4493                                 break;
4494                         if (found_key.type == BTRFS_DEV_ITEM_KEY) {
4495                                 struct btrfs_dev_item *dev_item;
4496                                 dev_item = btrfs_item_ptr(leaf, slot,
4497                                                   struct btrfs_dev_item);
4498                                 ret = read_one_dev(root, leaf, dev_item);
4499                                 if (ret)
4500                                         goto error;
4501                         }
4502                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
4503                         struct btrfs_chunk *chunk;
4504                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
4505                         ret = read_one_chunk(root, &found_key, leaf, chunk);
4506                         if (ret)
4507                                 goto error;
4508                 }
4509                 path->slots[0]++;
4510         }
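             /*
              * Once the device item pass is finished, restart the search
              * from the start of the tree to read the chunk items.
              */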
4511         if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4512                 key.objectid = 0;
4513                 btrfs_release_path(path);
4514                 goto again;
4515         }
4516         ret = 0;
4517 error:
4518         btrfs_free_path(path);
4519         return ret;
4520 }