Merge branch 'for-3.14/drivers' of git://git.kernel.dk/linux-block
author     Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 30 Jan 2014 19:40:10 +0000 (11:40 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 30 Jan 2014 19:40:10 +0000 (11:40 -0800)
Pull block IO driver changes from Jens Axboe:

 - bcache update from Kent Overstreet.

 - two bcache fixes from Nicholas Swenson.

 - cciss pci init error fix from Andrew.

 - underflow fix in the parallel IDE pg_write code from Dan Carpenter.
   I'm sure the 1 (or 0) users of that are now happy.

 - two PCI related fixes for sx8 from Jingoo Han.

 - floppy init fix for first block read from Jiri Kosina.

 - pktcdvd error return miss fix from Julia Lawall.

 - removal of the deprecated IRQF_DISABLED flag from the SEGA Dreamcast
   CD-ROM code from Michael Opdenacker.

 - comment typo fix for the loop driver from Olaf Hering.

 - potential oops fix for null_blk from Raghavendra K T.

 - two fixes from Sam Bradshaw (Micron) for the mtip32xx driver, fixing
   an OOM problem and a problem with handling security locked conditions

* 'for-3.14/drivers' of git://git.kernel.dk/linux-block: (47 commits)
  mg_disk: Spelling s/finised/finished/
  null_blk: Null pointer dereference problem in alloc_page_buffers
  mtip32xx: Correctly handle security locked condition
  mtip32xx: Make SGL container per-command to eliminate high order dma allocation
  drivers/block/loop.c: fix comment typo in loop_config_discard
  drivers/block/cciss.c:cciss_init_one(): use proper errnos
  drivers/block/paride/pg.c: underflow bug in pg_write()
  drivers/block/sx8.c: remove unnecessary pci_set_drvdata()
  drivers/block/sx8.c: use module_pci_driver()
  floppy: bail out in open() if drive is not responding to block0 read
  bcache: Fix auxiliary search trees for key size > cacheline size
  bcache: Don't return -EINTR when insert finished
  bcache: Improve bucket_prio() calculation
  bcache: Add bch_bkey_equal_header()
  bcache: update bch_bkey_try_merge
  bcache: Move insert_fixup() to btree_keys_ops
  bcache: Convert sorting to btree_keys
  bcache: Convert debug code to btree_keys
  bcache: Convert btree_iter to struct btree_keys
  bcache: Refactor bset_tree sysfs stats
  ...

39 files changed:
block/blk-settings.c
drivers/block/cciss.c
drivers/block/floppy.c
drivers/block/loop.c
drivers/block/mg_disk.c
drivers/block/mtip32xx/mtip32xx.c
drivers/block/mtip32xx/mtip32xx.h
drivers/block/null_blk.c
drivers/block/paride/pg.c
drivers/block/pktcdvd.c
drivers/block/sx8.c
drivers/cdrom/gdrom.c
drivers/md/bcache/Makefile
drivers/md/bcache/alloc.c
drivers/md/bcache/bcache.h
drivers/md/bcache/bset.c
drivers/md/bcache/bset.h
drivers/md/bcache/btree.c
drivers/md/bcache/btree.h
drivers/md/bcache/closure.c
drivers/md/bcache/closure.h
drivers/md/bcache/debug.c
drivers/md/bcache/debug.h
drivers/md/bcache/extents.c [new file with mode: 0644]
drivers/md/bcache/extents.h [new file with mode: 0644]
drivers/md/bcache/journal.c
drivers/md/bcache/journal.h
drivers/md/bcache/movinggc.c
drivers/md/bcache/request.c
drivers/md/bcache/request.h
drivers/md/bcache/super.c
drivers/md/bcache/sysfs.c
drivers/md/bcache/util.h
drivers/md/raid5.c
fs/buffer.c
include/linux/blkdev.h
include/trace/events/bcache.h
include/uapi/linux/bcache.h
include/uapi/linux/fd.h

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 05e826793e4e36b2e6c8de29802674767e3bd4d8..5d21239bc8599feb3fa12fa4906ba57aaf1c553f 100644
@@ -592,6 +592,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
                ret = -1;
        }
 
+       t->raid_partial_stripes_expensive =
+               max(t->raid_partial_stripes_expensive,
+                   b->raid_partial_stripes_expensive);
+
        /* Find lowest common alignment_offset */
        t->alignment_offset = lcm(t->alignment_offset, alignment)
                & (max(t->physical_block_size, t->io_min) - 1);
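
Stacking takes the max() of the two values, which for a 0/1 flag behaves as a
logical OR: if any bottom device (a raid5 member, say) reports partial stripes
as expensive, the stacked device inherits it. A throwaway userspace model of
that behaviour (illustrative only, not the kernel's struct queue_limits):

    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    int main(void)
    {
        unsigned char top = 0, bottom = 1;      /* 0/1 flags */

        top = MAX(top, bottom);                 /* acts as top |= bottom */
        printf("stacked raid_partial_stripes_expensive = %u\n", top);
        return 0;
    }
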
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index b35fc4f5237c3b44c7c51aaa0517e58a42a2ab81..036e8ab86c718057ae01c9f6bf3ba6c403a6b942 100644
@@ -5004,7 +5004,7 @@ reinit_after_soft_reset:
 
        i = alloc_cciss_hba(pdev);
        if (i < 0)
-               return -1;
+               return -ENOMEM;
 
        h = hba[i];
        h->pdev = pdev;
@@ -5205,7 +5205,7 @@ clean_no_release_regions:
         */
        pci_set_drvdata(pdev, NULL);
        free_hba(h);
-       return -1;
+       return -ENODEV;
 }
 
 static void cciss_shutdown(struct pci_dev *pdev)
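
For context, the bare -1 that used to be returned from the probe path is read
as -EPERM, which says nothing about what actually went wrong; the named errnos
do. A quick userspace illustration:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* -1 is -EPERM on Linux: misleading for a failed allocation/probe */
        printf("-1      -> %s\n", strerror(1));
        printf("-ENOMEM -> %s\n", strerror(ENOMEM));
        printf("-ENODEV -> %s\n", strerror(ENODEV));
        return 0;
    }
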
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 6b29c4422828a4dc5442bf19b1a2c719d5d2377c..2023043ce7c0e94b0e3c3618eb0787a9ad5ddb73 100644
@@ -3691,9 +3691,12 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
        if (!(mode & FMODE_NDELAY)) {
                if (mode & (FMODE_READ|FMODE_WRITE)) {
                        UDRS->last_checked = 0;
+                       clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
                        check_disk_change(bdev);
                        if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags))
                                goto out;
+                       if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags))
+                               goto out;
                }
                res = -EROFS;
                if ((mode & FMODE_WRITE) &&
@@ -3746,17 +3749,29 @@ static unsigned int floppy_check_events(struct gendisk *disk,
  * a disk in the drive, and whether that disk is writable.
  */
 
-static void floppy_rb0_complete(struct bio *bio, int err)
+struct rb0_cbdata {
+       int drive;
+       struct completion complete;
+};
+
+static void floppy_rb0_cb(struct bio *bio, int err)
 {
-       complete((struct completion *)bio->bi_private);
+       struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private;
+       int drive = cbdata->drive;
+
+       if (err) {
+               pr_info("floppy: error %d while reading block 0", err);
+               set_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
+       }
+       complete(&cbdata->complete);
 }
 
-static int __floppy_read_block_0(struct block_device *bdev)
+static int __floppy_read_block_0(struct block_device *bdev, int drive)
 {
        struct bio bio;
        struct bio_vec bio_vec;
-       struct completion complete;
        struct page *page;
+       struct rb0_cbdata cbdata;
        size_t size;
 
        page = alloc_page(GFP_NOIO);
@@ -3769,6 +3784,8 @@ static int __floppy_read_block_0(struct block_device *bdev)
        if (!size)
                size = 1024;
 
+       cbdata.drive = drive;
+
        bio_init(&bio);
        bio.bi_io_vec = &bio_vec;
        bio_vec.bv_page = page;
@@ -3779,13 +3796,14 @@ static int __floppy_read_block_0(struct block_device *bdev)
        bio.bi_bdev = bdev;
        bio.bi_iter.bi_sector = 0;
        bio.bi_flags = (1 << BIO_QUIET);
-       init_completion(&complete);
-       bio.bi_private = &complete;
-       bio.bi_end_io = floppy_rb0_complete;
+       bio.bi_private = &cbdata;
+       bio.bi_end_io = floppy_rb0_cb;
 
        submit_bio(READ, &bio);
        process_fd_request();
-       wait_for_completion(&complete);
+
+       init_completion(&cbdata.complete);
+       wait_for_completion(&cbdata.complete);
 
        __free_page(page);
 
@@ -3827,7 +3845,7 @@ static int floppy_revalidate(struct gendisk *disk)
                        UDRS->generation++;
                if (drive_no_geom(drive)) {
                        /* auto-sensing */
-                       res = __floppy_read_block_0(opened_bdev[drive]);
+                       res = __floppy_read_block_0(opened_bdev[drive], drive);
                } else {
                        if (cf)
                                poll_drive(false, FD_RAW_NEED_DISK);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 33fde3a3975954c793d0d2e9feb846e5e6a2bdc2..66e8c3b94ef35443f46bf67ea3065023da8b808d 100644
@@ -799,7 +799,7 @@ static void loop_config_discard(struct loop_device *lo)
 
        /*
         * We use punch hole to reclaim the free space used by the
-        * image a.k.a. discard. However we do support discard if
+        * image a.k.a. discard. However we do not support discard if
         * encryption is enabled, because it may give an attacker
         * useful information.
         */
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 7bc363f1ee82241227452594a78a1e7c5c2d3eca..eb59b124136690e217897dd6003473e09e5bd64a 100644
@@ -915,7 +915,7 @@ static int mg_probe(struct platform_device *plat_dev)
 
        /* disk reset */
        if (prv_data->dev_attr == MG_STORAGE_DEV) {
-               /* If POR seq. not yet finised, wait */
+               /* If POR seq. not yet finished, wait */
                err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT);
                if (err)
                        goto probe_err_3b;
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 52b2f2a714708a756b4adcee140e8ae2a91f5006..516026954be62d325a9e9d7c5541e5fbf51c7569 100644
 #include "mtip32xx.h"
 
 #define HW_CMD_SLOT_SZ         (MTIP_MAX_COMMAND_SLOTS * 32)
-#define HW_CMD_TBL_SZ          (AHCI_CMD_TBL_HDR_SZ + (MTIP_MAX_SG * 16))
-#define HW_CMD_TBL_AR_SZ       (HW_CMD_TBL_SZ * MTIP_MAX_COMMAND_SLOTS)
-#define HW_PORT_PRIV_DMA_SZ \
-               (HW_CMD_SLOT_SZ + HW_CMD_TBL_AR_SZ + AHCI_RX_FIS_SZ)
+
+/* DMA region containing RX Fis, Identify, RLE10, and SMART buffers */
+#define AHCI_RX_FIS_SZ          0x100
+#define AHCI_RX_FIS_OFFSET      0x0
+#define AHCI_IDFY_SZ            ATA_SECT_SIZE
+#define AHCI_IDFY_OFFSET        0x400
+#define AHCI_SECTBUF_SZ         ATA_SECT_SIZE
+#define AHCI_SECTBUF_OFFSET     0x800
+#define AHCI_SMARTBUF_SZ        ATA_SECT_SIZE
+#define AHCI_SMARTBUF_OFFSET    0xC00
+/* 0x100 + 0x200 + 0x200 + 0x200 is smaller than 4k but we pad it out */
+#define BLOCK_DMA_ALLOC_SZ      4096
+
+/* DMA region containing command table (should be 8192 bytes) */
+#define AHCI_CMD_SLOT_SZ        sizeof(struct mtip_cmd_hdr)
+#define AHCI_CMD_TBL_SZ         (MTIP_MAX_COMMAND_SLOTS * AHCI_CMD_SLOT_SZ)
+#define AHCI_CMD_TBL_OFFSET     0x0
+
+/* DMA region per command (contains header and SGL) */
+#define AHCI_CMD_TBL_HDR_SZ     0x80
+#define AHCI_CMD_TBL_HDR_OFFSET 0x0
+#define AHCI_CMD_TBL_SGL_SZ     (MTIP_MAX_SG * sizeof(struct mtip_cmd_sg))
+#define AHCI_CMD_TBL_SGL_OFFSET AHCI_CMD_TBL_HDR_SZ
+#define CMD_DMA_ALLOC_SZ        (AHCI_CMD_TBL_SGL_SZ + AHCI_CMD_TBL_HDR_SZ)
+
 
 #define HOST_CAP_NZDMA         (1 << 19)
 #define HOST_HSORG             0xFC
@@ -899,8 +920,9 @@ static void mtip_handle_tfe(struct driver_data *dd)
                        fail_reason = "thermal shutdown";
                }
                if (buf[288] == 0xBF) {
+                       set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag);
                        dev_info(&dd->pdev->dev,
-                               "Drive indicates rebuild has failed.\n");
+                               "Drive indicates rebuild has failed. Secure erase required.\n");
                        fail_all_ncq_cmds = 1;
                        fail_reason = "rebuild failed";
                }
@@ -1566,6 +1588,12 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
        }
 #endif
 
+       /* Check security locked state */
+       if (port->identify[128] & 0x4)
+               set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+       else
+               clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+
 #ifdef MTIP_TRIM /* Disabling TRIM support temporarily */
        /* Demux ID.DRAT & ID.RZAT to determine trim support */
        if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5))
@@ -1887,6 +1915,10 @@ static void mtip_dump_identify(struct mtip_port *port)
        strlcpy(cbuf, (char *)(port->identify+27), 41);
        dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf);
 
+       dev_info(&port->dd->pdev->dev, "Security: %04x %s\n",
+               port->identify[128],
+               port->identify[128] & 0x4 ? "(LOCKED)" : "");
+
        if (mtip_hw_get_capacity(port->dd, &sectors))
                dev_info(&port->dd->pdev->dev,
                        "Capacity: %llu sectors (%llu MB)\n",
@@ -3312,6 +3344,118 @@ st_out:
        return 0;
 }
 
+/*
+ * DMA region teardown
+ *
+ * @dd Pointer to driver_data structure
+ *
+ * return value
+ *      None
+ */
+static void mtip_dma_free(struct driver_data *dd)
+{
+       int i;
+       struct mtip_port *port = dd->port;
+
+       if (port->block1)
+               dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+                                       port->block1, port->block1_dma);
+
+       if (port->command_list) {
+               dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
+                               port->command_list, port->command_list_dma);
+       }
+
+       for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) {
+               if (port->commands[i].command)
+                       dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
+                               port->commands[i].command,
+                               port->commands[i].command_dma);
+       }
+}
+
+/*
+ * DMA region setup
+ *
+ * @dd Pointer to driver_data structure
+ *
+ * return value
+ *      -ENOMEM Not enough free DMA region space to initialize driver
+ */
+static int mtip_dma_alloc(struct driver_data *dd)
+{
+       struct mtip_port *port = dd->port;
+       int i, rv = 0;
+       u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
+
+       /* Allocate dma memory for RX Fis, Identify, and Sector Buffer */
+       port->block1 =
+               dmam_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+                                       &port->block1_dma, GFP_KERNEL);
+       if (!port->block1)
+               return -ENOMEM;
+       memset(port->block1, 0, BLOCK_DMA_ALLOC_SZ);
+
+       /* Allocate dma memory for command list */
+       port->command_list =
+               dmam_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
+                                       &port->command_list_dma, GFP_KERNEL);
+       if (!port->command_list) {
+               dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+                                       port->block1, port->block1_dma);
+               port->block1 = NULL;
+               port->block1_dma = 0;
+               return -ENOMEM;
+       }
+       memset(port->command_list, 0, AHCI_CMD_TBL_SZ);
+
+       /* Setup all pointers into first DMA region */
+       port->rxfis         = port->block1 + AHCI_RX_FIS_OFFSET;
+       port->rxfis_dma     = port->block1_dma + AHCI_RX_FIS_OFFSET;
+       port->identify      = port->block1 + AHCI_IDFY_OFFSET;
+       port->identify_dma  = port->block1_dma + AHCI_IDFY_OFFSET;
+       port->log_buf       = port->block1 + AHCI_SECTBUF_OFFSET;
+       port->log_buf_dma   = port->block1_dma + AHCI_SECTBUF_OFFSET;
+       port->smart_buf     = port->block1 + AHCI_SMARTBUF_OFFSET;
+       port->smart_buf_dma = port->block1_dma + AHCI_SMARTBUF_OFFSET;
+
+       /* Setup per command SGL DMA region */
+
+       /* Point the command headers at the command tables */
+       for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) {
+               port->commands[i].command =
+                       dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
+                               &port->commands[i].command_dma, GFP_KERNEL);
+               if (!port->commands[i].command) {
+                       rv = -ENOMEM;
+                       mtip_dma_free(dd);
+                       return rv;
+               }
+               memset(port->commands[i].command, 0, CMD_DMA_ALLOC_SZ);
+
+               port->commands[i].command_header = port->command_list +
+                                       (sizeof(struct mtip_cmd_hdr) * i);
+               port->commands[i].command_header_dma =
+                                       dd->port->command_list_dma +
+                                       (sizeof(struct mtip_cmd_hdr) * i);
+
+               if (host_cap_64)
+                       port->commands[i].command_header->ctbau =
+                               __force_bit2int cpu_to_le32(
+                               (port->commands[i].command_dma >> 16) >> 16);
+
+               port->commands[i].command_header->ctba =
+                               __force_bit2int cpu_to_le32(
+                               port->commands[i].command_dma & 0xFFFFFFFF);
+
+               sg_init_table(port->commands[i].sg, MTIP_MAX_SG);
+
+               /* Mark command as currently inactive */
+               atomic_set(&dd->port->commands[i].active, 0);
+       }
+       return 0;
+}
+
 /*
  * Called once for each card.
  *
@@ -3370,83 +3514,10 @@ static int mtip_hw_init(struct driver_data *dd)
        dd->port->mmio  = dd->mmio + PORT_OFFSET;
        dd->port->dd    = dd;
 
-       /* Allocate memory for the command list. */
-       dd->port->command_list =
-               dmam_alloc_coherent(&dd->pdev->dev,
-                       HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
-                       &dd->port->command_list_dma,
-                       GFP_KERNEL);
-       if (!dd->port->command_list) {
-               dev_err(&dd->pdev->dev,
-                       "Memory allocation: command list\n");
-               rv = -ENOMEM;
+       /* DMA allocations */
+       rv = mtip_dma_alloc(dd);
+       if (rv < 0)
                goto out1;
-       }
-
-       /* Clear the memory we have allocated. */
-       memset(dd->port->command_list,
-               0,
-               HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4));
-
-       /* Setup the addresse of the RX FIS. */
-       dd->port->rxfis     = dd->port->command_list + HW_CMD_SLOT_SZ;
-       dd->port->rxfis_dma = dd->port->command_list_dma + HW_CMD_SLOT_SZ;
-
-       /* Setup the address of the command tables. */
-       dd->port->command_table   = dd->port->rxfis + AHCI_RX_FIS_SZ;
-       dd->port->command_tbl_dma = dd->port->rxfis_dma + AHCI_RX_FIS_SZ;
-
-       /* Setup the address of the identify data. */
-       dd->port->identify     = dd->port->command_table +
-                                       HW_CMD_TBL_AR_SZ;
-       dd->port->identify_dma = dd->port->command_tbl_dma +
-                                       HW_CMD_TBL_AR_SZ;
-
-       /* Setup the address of the sector buffer - for some non-ncq cmds */
-       dd->port->sector_buffer = (void *) dd->port->identify + ATA_SECT_SIZE;
-       dd->port->sector_buffer_dma = dd->port->identify_dma + ATA_SECT_SIZE;
-
-       /* Setup the address of the log buf - for read log command */
-       dd->port->log_buf = (void *)dd->port->sector_buffer  + ATA_SECT_SIZE;
-       dd->port->log_buf_dma = dd->port->sector_buffer_dma + ATA_SECT_SIZE;
-
-       /* Setup the address of the smart buf - for smart read data command */
-       dd->port->smart_buf = (void *)dd->port->log_buf  + ATA_SECT_SIZE;
-       dd->port->smart_buf_dma = dd->port->log_buf_dma + ATA_SECT_SIZE;
-
-
-       /* Point the command headers at the command tables. */
-       for (i = 0; i < num_command_slots; i++) {
-               dd->port->commands[i].command_header =
-                                       dd->port->command_list +
-                                       (sizeof(struct mtip_cmd_hdr) * i);
-               dd->port->commands[i].command_header_dma =
-                                       dd->port->command_list_dma +
-                                       (sizeof(struct mtip_cmd_hdr) * i);
-
-               dd->port->commands[i].command =
-                       dd->port->command_table + (HW_CMD_TBL_SZ * i);
-               dd->port->commands[i].command_dma =
-                       dd->port->command_tbl_dma + (HW_CMD_TBL_SZ * i);
-
-               if (readl(dd->mmio + HOST_CAP) & HOST_CAP_64)
-                       dd->port->commands[i].command_header->ctbau =
-                       __force_bit2int cpu_to_le32(
-                       (dd->port->commands[i].command_dma >> 16) >> 16);
-               dd->port->commands[i].command_header->ctba =
-                       __force_bit2int cpu_to_le32(
-                       dd->port->commands[i].command_dma & 0xFFFFFFFF);
-
-               /*
-                * If this is not done, a bug is reported by the stock
-                * FC11 i386. Due to the fact that it has lots of kernel
-                * debugging enabled.
-                */
-               sg_init_table(dd->port->commands[i].sg, MTIP_MAX_SG);
-
-               /* Mark all commands as currently inactive.*/
-               atomic_set(&dd->port->commands[i].active, 0);
-       }
 
        /* Setup the pointers to the extended s_active and CI registers. */
        for (i = 0; i < dd->slot_groups; i++) {
@@ -3594,12 +3665,8 @@ out3:
 
 out2:
        mtip_deinit_port(dd->port);
+       mtip_dma_free(dd);
 
-       /* Free the command/command header memory. */
-       dmam_free_coherent(&dd->pdev->dev,
-                               HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
-                               dd->port->command_list,
-                               dd->port->command_list_dma);
 out1:
        /* Free the memory allocated for the for structure. */
        kfree(dd->port);
@@ -3622,7 +3689,8 @@ static int mtip_hw_exit(struct driver_data *dd)
         * saves its state.
         */
        if (!dd->sr) {
-               if (!test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))
+               if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) &&
+                   !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))
                        if (mtip_standby_immediate(dd->port))
                                dev_warn(&dd->pdev->dev,
                                        "STANDBY IMMEDIATE failed\n");
@@ -3641,11 +3709,9 @@ static int mtip_hw_exit(struct driver_data *dd)
        irq_set_affinity_hint(dd->pdev->irq, NULL);
        devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
 
-       /* Free the command/command header memory. */
-       dmam_free_coherent(&dd->pdev->dev,
-                       HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
-                       dd->port->command_list,
-                       dd->port->command_list_dma);
+       /* Free dma regions */
+       mtip_dma_free(dd);
+
        /* Free the memory allocated for the for structure. */
        kfree(dd->port);
        dd->port = NULL;
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index 9be7a1582ad3471a5400237b7db57b6a976cb2d9..b52e9a6d6aad6d602bf3a9d6f73c9b4c277c7e17 100644
@@ -69,7 +69,7 @@
  * Maximum number of scatter gather entries
  * a single command may have.
  */
-#define MTIP_MAX_SG            128
+#define MTIP_MAX_SG            504
 
 /*
  * Maximum number of slot groups (Command Issue & s_active registers)
@@ -92,7 +92,7 @@
 
 /* Driver name and version strings */
 #define MTIP_DRV_NAME          "mtip32xx"
-#define MTIP_DRV_VERSION       "1.2.6os3"
+#define MTIP_DRV_VERSION       "1.3.0"
 
 /* Maximum number of minor device numbers per device. */
 #define MTIP_MAX_MINORS                16
@@ -391,15 +391,13 @@ struct mtip_port {
         */
        dma_addr_t rxfis_dma;
        /*
-        * Pointer to the beginning of the command table memory as used
-        * by the driver.
+        * Pointer to the DMA region for RX Fis, Identify, RLE10, and SMART
         */
-       void *command_table;
+       void *block1;
        /*
-        * Pointer to the beginning of the command table memory as used
-        * by the DMA.
+        * DMA address of region for RX Fis, Identify, RLE10, and SMART
         */
-       dma_addr_t command_tbl_dma;
+       dma_addr_t block1_dma;
        /*
         * Pointer to the beginning of the identify data memory as used
         * by the driver.
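
With MTIP_MAX_SG raised to 504, one per-command DMA region comes out to exactly
two pages (matching the "should be 8192 bytes" comment in the new macros), while
a single coherent region covering all 32 command slots plus the FIS/identify
buffers would have needed a large high-order allocation - the OOM problem this
series avoids. A back-of-the-envelope check, assuming the 16-byte
struct mtip_cmd_sg entry implied by the old HW_CMD_TBL_SZ definition:

    #include <stdio.h>

    #define MTIP_MAX_COMMAND_SLOTS  32
    #define MTIP_MAX_SG             504
    #define SG_ENTRY_SZ             16      /* assumed sizeof(struct mtip_cmd_sg) */
    #define AHCI_CMD_TBL_HDR_SZ     0x80

    int main(void)
    {
        unsigned per_cmd = AHCI_CMD_TBL_HDR_SZ + MTIP_MAX_SG * SG_ENTRY_SZ;

        printf("per-command DMA region: %u bytes (%u pages)\n",
               per_cmd, per_cmd / 4096);
        printf("one region for all slots would be %u KiB\n",
               per_cmd * MTIP_MAX_COMMAND_SLOTS / 1024);
        return 0;
    }
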
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 83a598ebb65a4ab7699d1dcebe1b44b42ea8ae5a..3107282a9741f96665a2805b08217d3209c27bf7 100644
@@ -616,6 +616,11 @@ static int __init null_init(void)
                irqmode = NULL_IRQ_NONE;
        }
 #endif
+       if (bs > PAGE_SIZE) {
+               pr_warn("null_blk: invalid block size\n");
+               pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
+               bs = PAGE_SIZE;
+       }
 
        if (queue_mode == NULL_Q_MQ && use_per_node_hctx) {
                if (submit_queues < nr_online_nodes) {
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index 4a27b1de5fcb9fb0fef805fb51d0d2f1bc418a29..2ce3dfd7e6b9bafd65c670aa45bf3a8660578360 100644
@@ -581,7 +581,7 @@ static ssize_t pg_write(struct file *filp, const char __user *buf, size_t count,
 
        if (hdr.magic != PG_MAGIC)
                return -EINVAL;
-       if (hdr.dlen > PG_MAX_DATA)
+       if (hdr.dlen < 0 || hdr.dlen > PG_MAX_DATA)
                return -EINVAL;
        if ((count - hs) > PG_MAX_DATA)
                return -EINVAL;
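
The added "hdr.dlen < 0" test matters because dlen is a signed value taken from
the user-supplied header: a negative number sails past an upper-bound-only
comparison and then wraps to a huge length once it is used as a size. A
userspace sketch of the failure mode (the PG_MAX_DATA value is assumed here):

    #include <stdio.h>

    #define PG_MAX_DATA 32768               /* assumed, for illustration */

    int main(void)
    {
        int dlen = -1;                      /* user-controlled header field */

        if (!(dlen > PG_MAX_DATA))          /* old check: passes */
            printf("old check accepts dlen=%d\n", dlen);

        printf("as an unsigned size it becomes %zu\n", (size_t)dlen);

        if (dlen < 0 || dlen > PG_MAX_DATA) /* fixed check */
            printf("new check rejects it\n");
        return 0;
    }
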
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 3dda09a5ec4172bc7e57f6e18dff272ecd8deaba..a2af73db187b694c3112bf1c6d08e4070596cd10 100644
@@ -706,7 +706,9 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
                             WRITE : READ, __GFP_WAIT);
 
        if (cgc->buflen) {
-               if (blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, __GFP_WAIT))
+               ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
+                                     __GFP_WAIT);
+               if (ret)
                        goto out;
        }
 
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 3fb6ab4c8b4e9e96f9ba487cf801fdcd776f0c73..d5e2d12b9d9e329d77fb21560b8f21e5e98a11f4 100644
@@ -1744,20 +1744,6 @@ static void carm_remove_one (struct pci_dev *pdev)
        kfree(host);
        pci_release_regions(pdev);
        pci_disable_device(pdev);
-       pci_set_drvdata(pdev, NULL);
 }
 
-static int __init carm_init(void)
-{
-       return pci_register_driver(&carm_driver);
-}
-
-static void __exit carm_exit(void)
-{
-       pci_unregister_driver(&carm_driver);
-}
-
-module_init(carm_init);
-module_exit(carm_exit);
-
-
+module_pci_driver(carm_driver);
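
module_pci_driver() generates the same init/exit boilerplate that the patch
deletes by hand; roughly, the macro expands to something like the sketch below
(simplified, with function names derived from the driver variable):

    static int __init carm_driver_init(void)
    {
        return pci_register_driver(&carm_driver);
    }
    module_init(carm_driver_init);

    static void __exit carm_driver_exit(void)
    {
        pci_unregister_driver(&carm_driver);
    }
    module_exit(carm_driver_exit);
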
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 5980cb9af857891491baee6026efe16cb84694de..51e75ad964223e07268e0781caa5fa9230f43824 100644
@@ -561,11 +561,11 @@ static int gdrom_set_interrupt_handlers(void)
        int err;
 
        err = request_irq(HW_EVENT_GDROM_CMD, gdrom_command_interrupt,
-               IRQF_DISABLED, "gdrom_command", &gd);
+               0, "gdrom_command", &gd);
        if (err)
                return err;
        err = request_irq(HW_EVENT_GDROM_DMA, gdrom_dma_interrupt,
-               IRQF_DISABLED, "gdrom_dma", &gd);
+               0, "gdrom_dma", &gd);
        if (err)
                free_irq(HW_EVENT_GDROM_CMD, &gd);
        return err;
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 0e9c82523be678eb1604d18a33aa196b724b0ce7..c488b846f831b9605789779310c90380fa3eb039 100644
@@ -1,7 +1,8 @@
 
 obj-$(CONFIG_BCACHE)   += bcache.o
 
-bcache-y               := alloc.o btree.o bset.o io.o journal.o writeback.o\
-       movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o
+bcache-y               := alloc.o bset.o btree.o closure.o debug.o extents.o\
+       io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
+       util.o writeback.o
 
 CFLAGS_request.o       += -Iblock
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 4c9852d92b0a909d5b103510ae77d55f4bb05ad0..c0d37d0824439f2daff4c049ab82dc6b3c6e646e 100644
@@ -132,10 +132,16 @@ bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
 {
        BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
 
-       if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] &&
-           CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO)
-               return false;
+       if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
+               unsigned i;
+
+               for (i = 0; i < RESERVE_NONE; i++)
+                       if (!fifo_full(&ca->free[i]))
+                               goto add;
 
+               return false;
+       }
+add:
        b->prio = 0;
 
        if (can_inc_bucket_gen(b) &&
@@ -162,8 +168,21 @@ static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
        fifo_push(&ca->free_inc, b - ca->buckets);
 }
 
-#define bucket_prio(b)                         \
-       (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b))
+/*
+ * Determines what order we're going to reuse buckets, smallest bucket_prio()
+ * first: we also take into account the number of sectors of live data in that
+ * bucket, and in order for that multiply to make sense we have to scale
+ * bucket priorities.
+ *
+ * Thus, we scale the bucket priorities so that the bucket with the smallest
+ * prio is worth 1/8th of what INITIAL_PRIO is worth.
+ */
+
+#define bucket_prio(b)                                                 \
+({                                                                     \
+       unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8;     \
+                                                                       \
+       (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b);  \
+})
 
 #define bucket_max_cmp(l, r)   (bucket_prio(l) < bucket_prio(r))
 #define bucket_min_cmp(l, r)   (bucket_prio(l) > bucket_prio(r))
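
The rescaling keeps the amount of live data relevant even for the coldest
bucket: with the old macro, the bucket holding the set-wide minimum prio always
computed to 0 no matter how many live sectors it held. A small worked example
with assumed numbers:

    #include <stdio.h>

    #define INITIAL_PRIO 32768U

    int main(void)
    {
        unsigned min_prio = 1000;               /* assumed set-wide minimum */
        unsigned prio = 1000, sectors = 512;    /* the coldest bucket */

        unsigned old = (prio - min_prio) * sectors;
        unsigned scale = (INITIAL_PRIO - min_prio) / 8;
        unsigned new = (prio - min_prio + scale) * sectors;

        printf("old bucket_prio: %u, new bucket_prio: %u\n", old, new);
        return 0;
    }
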
@@ -304,6 +323,21 @@ do {                                                                       \
        __set_current_state(TASK_RUNNING);                              \
 } while (0)
 
+static int bch_allocator_push(struct cache *ca, long bucket)
+{
+       unsigned i;
+
+       /* Prios/gens are actually the most important reserve */
+       if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
+               return true;
+
+       for (i = 0; i < RESERVE_NR; i++)
+               if (fifo_push(&ca->free[i], bucket))
+                       return true;
+
+       return false;
+}
+
 static int bch_allocator_thread(void *arg)
 {
        struct cache *ca = arg;
@@ -336,9 +370,7 @@ static int bch_allocator_thread(void *arg)
                                mutex_lock(&ca->set->bucket_lock);
                        }
 
-                       allocator_wait(ca, !fifo_full(&ca->free));
-
-                       fifo_push(&ca->free, bucket);
+                       allocator_wait(ca, bch_allocator_push(ca, bucket));
                        wake_up(&ca->set->bucket_wait);
                }
 
@@ -365,34 +397,29 @@ static int bch_allocator_thread(void *arg)
        }
 }
 
-long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait)
+long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
 {
        DEFINE_WAIT(w);
        struct bucket *b;
        long r;
 
        /* fastpath */
-       if (fifo_used(&ca->free) > ca->watermark[watermark]) {
-               fifo_pop(&ca->free, r);
+       if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
+           fifo_pop(&ca->free[reserve], r))
                goto out;
-       }
 
        if (!wait)
                return -1;
 
-       while (1) {
-               if (fifo_used(&ca->free) > ca->watermark[watermark]) {
-                       fifo_pop(&ca->free, r);
-                       break;
-               }
-
+       do {
                prepare_to_wait(&ca->set->bucket_wait, &w,
                                TASK_UNINTERRUPTIBLE);
 
                mutex_unlock(&ca->set->bucket_lock);
                schedule();
                mutex_lock(&ca->set->bucket_lock);
-       }
+       } while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
+                !fifo_pop(&ca->free[reserve], r));
 
        finish_wait(&ca->set->bucket_wait, &w);
 out:
@@ -401,12 +428,14 @@ out:
        if (expensive_debug_checks(ca->set)) {
                size_t iter;
                long i;
+               unsigned j;
 
                for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
                        BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
 
-               fifo_for_each(i, &ca->free, iter)
-                       BUG_ON(i == r);
+               for (j = 0; j < RESERVE_NR; j++)
+                       fifo_for_each(i, &ca->free[j], iter)
+                               BUG_ON(i == r);
                fifo_for_each(i, &ca->free_inc, iter)
                        BUG_ON(i == r);
                fifo_for_each(i, &ca->unused, iter)
@@ -419,7 +448,7 @@ out:
 
        SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
 
-       if (watermark <= WATERMARK_METADATA) {
+       if (reserve <= RESERVE_PRIO) {
                SET_GC_MARK(b, GC_MARK_METADATA);
                SET_GC_MOVE(b, 0);
                b->prio = BTREE_PRIO;
@@ -445,7 +474,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
        }
 }
 
-int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
+int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
                           struct bkey *k, int n, bool wait)
 {
        int i;
@@ -459,7 +488,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
 
        for (i = 0; i < n; i++) {
                struct cache *ca = c->cache_by_alloc[i];
-               long b = bch_bucket_alloc(ca, watermark, wait);
+               long b = bch_bucket_alloc(ca, reserve, wait);
 
                if (b == -1)
                        goto err;
@@ -478,12 +507,12 @@ err:
        return -1;
 }
 
-int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
+int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
                         struct bkey *k, int n, bool wait)
 {
        int ret;
        mutex_lock(&c->bucket_lock);
-       ret = __bch_bucket_alloc_set(c, watermark, k, n, wait);
+       ret = __bch_bucket_alloc_set(c, reserve, k, n, wait);
        mutex_unlock(&c->bucket_lock);
        return ret;
 }
@@ -573,8 +602,8 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
 
        while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
                unsigned watermark = write_prio
-                       ? WATERMARK_MOVINGGC
-                       : WATERMARK_NONE;
+                       ? RESERVE_MOVINGGC
+                       : RESERVE_NONE;
 
                spin_unlock(&c->data_bucket_lock);
 
@@ -689,7 +718,7 @@ int bch_cache_allocator_init(struct cache *ca)
         * Then 8 for btree allocations
         * Then half for the moving garbage collector
         */
-
+#if 0
        ca->watermark[WATERMARK_PRIO] = 0;
 
        ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
@@ -699,6 +728,6 @@ int bch_cache_allocator_init(struct cache *ca)
 
        ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
                ca->watermark[WATERMARK_MOVINGGC];
-
+#endif
        return 0;
 }
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index dbdbca5a95910a421898cef06bcf902a43401c6a..0c707e4f4eafc32adcdb3fec5539b2dbe0998a7a 100644
 #include <linux/types.h>
 #include <linux/workqueue.h>
 
+#include "bset.h"
 #include "util.h"
 #include "closure.h"
 
@@ -309,7 +310,8 @@ struct cached_dev {
        struct cache_sb         sb;
        struct bio              sb_bio;
        struct bio_vec          sb_bv[1];
-       struct closure_with_waitlist sb_write;
+       struct closure          sb_write;
+       struct semaphore        sb_write_mutex;
 
        /* Refcount on the cache set. Always nonzero when we're caching. */
        atomic_t                count;
@@ -382,12 +384,12 @@ struct cached_dev {
        unsigned                writeback_rate_p_term_inverse;
 };
 
-enum alloc_watermarks {
-       WATERMARK_PRIO,
-       WATERMARK_METADATA,
-       WATERMARK_MOVINGGC,
-       WATERMARK_NONE,
-       WATERMARK_MAX
+enum alloc_reserve {
+       RESERVE_BTREE,
+       RESERVE_PRIO,
+       RESERVE_MOVINGGC,
+       RESERVE_NONE,
+       RESERVE_NR,
 };
 
 struct cache {
@@ -399,8 +401,6 @@ struct cache {
        struct kobject          kobj;
        struct block_device     *bdev;
 
-       unsigned                watermark[WATERMARK_MAX];
-
        struct task_struct      *alloc_thread;
 
        struct closure          prio;
@@ -429,7 +429,7 @@ struct cache {
         * because all the data they contained was overwritten), so we only
         * need to discard them before they can be moved to the free list.
         */
-       DECLARE_FIFO(long, free);
+       DECLARE_FIFO(long, free)[RESERVE_NR];
        DECLARE_FIFO(long, free_inc);
        DECLARE_FIFO(long, unused);
 
@@ -514,7 +514,8 @@ struct cache_set {
        uint64_t                cached_dev_sectors;
        struct closure          caching;
 
-       struct closure_with_waitlist sb_write;
+       struct closure          sb_write;
+       struct semaphore        sb_write_mutex;
 
        mempool_t               *search;
        mempool_t               *bio_meta;
@@ -629,13 +630,15 @@ struct cache_set {
 
 #ifdef CONFIG_BCACHE_DEBUG
        struct btree            *verify_data;
+       struct bset             *verify_ondisk;
        struct mutex            verify_lock;
 #endif
 
        unsigned                nr_uuids;
        struct uuid_entry       *uuids;
        BKEY_PADDED(uuid_bucket);
-       struct closure_with_waitlist uuid_write;
+       struct closure          uuid_write;
+       struct semaphore        uuid_write_mutex;
 
        /*
         * A btree node on disk could have too many bsets for an iterator to fit
@@ -643,13 +646,7 @@ struct cache_set {
         */
        mempool_t               *fill_iter;
 
-       /*
-        * btree_sort() is a merge sort and requires temporary space - single
-        * element mempool
-        */
-       struct mutex            sort_lock;
-       struct bset             *sort;
-       unsigned                sort_crit_factor;
+       struct bset_sort_state  sort;
 
        /* List of buckets we're currently writing data to */
        struct list_head        data_buckets;
@@ -665,7 +662,6 @@ struct cache_set {
        unsigned                congested_read_threshold_us;
        unsigned                congested_write_threshold_us;
 
-       struct time_stats       sort_time;
        struct time_stats       btree_gc_time;
        struct time_stats       btree_split_time;
        struct time_stats       btree_read_time;
@@ -683,9 +679,9 @@ struct cache_set {
        unsigned                error_decay;
 
        unsigned short          journal_delay_ms;
+       bool                    expensive_debug_checks;
        unsigned                verify:1;
        unsigned                key_merging_disabled:1;
-       unsigned                expensive_debug_checks:1;
        unsigned                gc_always_rewrite:1;
        unsigned                shrinker_disabled:1;
        unsigned                copy_gc_enabled:1;
@@ -707,13 +703,8 @@ struct bbio {
        struct bio              bio;
 };
 
-static inline unsigned local_clock_us(void)
-{
-       return local_clock() >> 10;
-}
-
 #define BTREE_PRIO             USHRT_MAX
-#define INITIAL_PRIO           32768
+#define INITIAL_PRIO           32768U
 
 #define btree_bytes(c)         ((c)->btree_pages * PAGE_SIZE)
 #define btree_blocks(b)                                                        \
@@ -726,21 +717,6 @@ static inline unsigned local_clock_us(void)
 #define bucket_bytes(c)                ((c)->sb.bucket_size << 9)
 #define block_bytes(c)         ((c)->sb.block_size << 9)
 
-#define __set_bytes(i, k)      (sizeof(*(i)) + (k) * sizeof(uint64_t))
-#define set_bytes(i)           __set_bytes(i, i->keys)
-
-#define __set_blocks(i, k, c)  DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
-#define set_blocks(i, c)       __set_blocks(i, (i)->keys, c)
-
-#define node(i, j)             ((struct bkey *) ((i)->d + (j)))
-#define end(i)                 node(i, (i)->keys)
-
-#define index(i, b)                                                    \
-       ((size_t) (((void *) i - (void *) (b)->sets[0].data) /          \
-                  block_bytes(b->c)))
-
-#define btree_data_space(b)    (PAGE_SIZE << (b)->page_order)
-
 #define prios_per_bucket(c)                            \
        ((bucket_bytes(c) - sizeof(struct prio_set)) /  \
         sizeof(struct bucket_disk))
@@ -783,20 +759,34 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
        return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
 }
 
-/* Btree key macros */
+static inline uint8_t gen_after(uint8_t a, uint8_t b)
+{
+       uint8_t r = a - b;
+       return r > 128U ? 0 : r;
+}
 
-static inline void bkey_init(struct bkey *k)
+static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
+                               unsigned i)
 {
-       *k = ZERO_KEY;
+       return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
 }
 
+static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
+                                unsigned i)
+{
+       return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
+}
+
+/* Btree key macros */
+
 /*
  * This is used for various on disk data structures - cache_sb, prio_set, bset,
  * jset: The checksum is _always_ the first 8 bytes of these structs
  */
 #define csum_set(i)                                                    \
        bch_crc64(((void *) (i)) + sizeof(uint64_t),                    \
-             ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t)))
+                 ((void *) bset_bkey_last(i)) -                        \
+                 (((void *) (i)) + sizeof(uint64_t)))
 
 /* Error handling macros */
 
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 7d388b8bb50e35cf154d7c082ea492a5de08235d..4f6b5940e609b4f53c248bf65762ef8f99c7258c 100644
  * Copyright 2012 Google, Inc.
  */
 
-#include "bcache.h"
-#include "btree.h"
-#include "debug.h"
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
 
+#include "util.h"
+#include "bset.h"
+
+#include <linux/console.h>
 #include <linux/random.h>
 #include <linux/prefetch.h>
 
+#ifdef CONFIG_BCACHE_DEBUG
+
+void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
+{
+       struct bkey *k, *next;
+
+       for (k = i->start; k < bset_bkey_last(i); k = next) {
+               next = bkey_next(k);
+
+               printk(KERN_ERR "block %u key %zi/%u: ", set,
+                      (uint64_t *) k - i->d, i->keys);
+
+               if (b->ops->key_dump)
+                       b->ops->key_dump(b, k);
+               else
+                       printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k));
+
+               if (next < bset_bkey_last(i) &&
+                   bkey_cmp(k, b->ops->is_extents ?
+                            &START_KEY(next) : next) > 0)
+                       printk(KERN_ERR "Key skipped backwards\n");
+       }
+}
+
+void bch_dump_bucket(struct btree_keys *b)
+{
+       unsigned i;
+
+       console_lock();
+       for (i = 0; i <= b->nsets; i++)
+               bch_dump_bset(b, b->set[i].data,
+                             bset_sector_offset(b, b->set[i].data));
+       console_unlock();
+}
+
+int __bch_count_data(struct btree_keys *b)
+{
+       unsigned ret = 0;
+       struct btree_iter iter;
+       struct bkey *k;
+
+       if (b->ops->is_extents)
+               for_each_key(b, k, &iter)
+                       ret += KEY_SIZE(k);
+       return ret;
+}
+
+void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
+{
+       va_list args;
+       struct bkey *k, *p = NULL;
+       struct btree_iter iter;
+       const char *err;
+
+       for_each_key(b, k, &iter) {
+               if (b->ops->is_extents) {
+                       err = "Keys out of order";
+                       if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
+                               goto bug;
+
+                       if (bch_ptr_invalid(b, k))
+                               continue;
+
+                       err =  "Overlapping keys";
+                       if (p && bkey_cmp(p, &START_KEY(k)) > 0)
+                               goto bug;
+               } else {
+                       if (bch_ptr_bad(b, k))
+                               continue;
+
+                       err = "Duplicate keys";
+                       if (p && !bkey_cmp(p, k))
+                               goto bug;
+               }
+               p = k;
+       }
+#if 0
+       err = "Key larger than btree node key";
+       if (p && bkey_cmp(p, &b->key) > 0)
+               goto bug;
+#endif
+       return;
+bug:
+       bch_dump_bucket(b);
+
+       va_start(args, fmt);
+       vprintk(fmt, args);
+       va_end(args);
+
+       panic("bch_check_keys error:  %s:\n", err);
+}
+
+static void bch_btree_iter_next_check(struct btree_iter *iter)
+{
+       struct bkey *k = iter->data->k, *next = bkey_next(k);
+
+       if (next < iter->data->end &&
+           bkey_cmp(k, iter->b->ops->is_extents ?
+                    &START_KEY(next) : next) > 0) {
+               bch_dump_bucket(iter->b);
+               panic("Key skipped backwards\n");
+       }
+}
+
+#else
+
+static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
+
+#endif
+
 /* Keylists */
 
-int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
+int __bch_keylist_realloc(struct keylist *l, unsigned u64s)
 {
        size_t oldsize = bch_keylist_nkeys(l);
-       size_t newsize = oldsize + 2 + nptrs;
+       size_t newsize = oldsize + u64s;
        uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
        uint64_t *new_keys;
 
-       /* The journalling code doesn't handle the case where the keys to insert
-        * is bigger than an empty write: If we just return -ENOMEM here,
-        * bio_insert() and bio_invalidate() will insert the keys created so far
-        * and finish the rest when the keylist is empty.
-        */
-       if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
-               return -ENOMEM;
-
        newsize = roundup_pow_of_two(newsize);
 
        if (newsize <= KEYLIST_INLINE ||
@@ -71,136 +175,6 @@ void bch_keylist_pop_front(struct keylist *l)
                bch_keylist_bytes(l));
 }
 
-/* Pointer validation */
-
-static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
-{
-       unsigned i;
-
-       for (i = 0; i < KEY_PTRS(k); i++)
-               if (ptr_available(c, k, i)) {
-                       struct cache *ca = PTR_CACHE(c, k, i);
-                       size_t bucket = PTR_BUCKET_NR(c, k, i);
-                       size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
-
-                       if (KEY_SIZE(k) + r > c->sb.bucket_size ||
-                           bucket <  ca->sb.first_bucket ||
-                           bucket >= ca->sb.nbuckets)
-                               return true;
-               }
-
-       return false;
-}
-
-bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
-{
-       char buf[80];
-
-       if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
-               goto bad;
-
-       if (__ptr_invalid(c, k))
-               goto bad;
-
-       return false;
-bad:
-       bch_bkey_to_text(buf, sizeof(buf), k);
-       cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
-       return true;
-}
-
-bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k)
-{
-       char buf[80];
-
-       if (!KEY_SIZE(k))
-               return true;
-
-       if (KEY_SIZE(k) > KEY_OFFSET(k))
-               goto bad;
-
-       if (__ptr_invalid(c, k))
-               goto bad;
-
-       return false;
-bad:
-       bch_bkey_to_text(buf, sizeof(buf), k);
-       cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
-       return true;
-}
-
-static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k,
-                                    unsigned ptr)
-{
-       struct bucket *g = PTR_BUCKET(b->c, k, ptr);
-       char buf[80];
-
-       if (mutex_trylock(&b->c->bucket_lock)) {
-               if (b->level) {
-                       if (KEY_DIRTY(k) ||
-                           g->prio != BTREE_PRIO ||
-                           (b->c->gc_mark_valid &&
-                            GC_MARK(g) != GC_MARK_METADATA))
-                               goto err;
-
-               } else {
-                       if (g->prio == BTREE_PRIO)
-                               goto err;
-
-                       if (KEY_DIRTY(k) &&
-                           b->c->gc_mark_valid &&
-                           GC_MARK(g) != GC_MARK_DIRTY)
-                               goto err;
-               }
-               mutex_unlock(&b->c->bucket_lock);
-       }
-
-       return false;
-err:
-       mutex_unlock(&b->c->bucket_lock);
-       bch_bkey_to_text(buf, sizeof(buf), k);
-       btree_bug(b,
-"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
-                 buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
-                 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
-       return true;
-}
-
-bool bch_ptr_bad(struct btree *b, const struct bkey *k)
-{
-       struct bucket *g;
-       unsigned i, stale;
-
-       if (!bkey_cmp(k, &ZERO_KEY) ||
-           !KEY_PTRS(k) ||
-           bch_ptr_invalid(b, k))
-               return true;
-
-       for (i = 0; i < KEY_PTRS(k); i++) {
-               if (!ptr_available(b->c, k, i))
-                       return true;
-
-               g = PTR_BUCKET(b->c, k, i);
-               stale = ptr_stale(b->c, k, i);
-
-               btree_bug_on(stale > 96, b,
-                            "key too stale: %i, need_gc %u",
-                            stale, b->c->need_gc);
-
-               btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
-                            b, "stale dirty pointer");
-
-               if (stale)
-                       return true;
-
-               if (expensive_debug_checks(b->c) &&
-                   ptr_bad_expensive_checks(b, k, i))
-                       return true;
-       }
-
-       return false;
-}
-
 /* Key/pointer manipulation */
 
 void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
@@ -255,56 +229,138 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k)
        return true;
 }
 
-static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
+/* Auxiliary search trees */
+
+/* 32 bits total: */
+#define BKEY_MID_BITS          3
+#define BKEY_EXPONENT_BITS     7
+#define BKEY_MANTISSA_BITS     (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS)
+#define BKEY_MANTISSA_MASK     ((1 << BKEY_MANTISSA_BITS) - 1)
+
+struct bkey_float {
+       unsigned        exponent:BKEY_EXPONENT_BITS;
+       unsigned        m:BKEY_MID_BITS;
+       unsigned        mantissa:BKEY_MANTISSA_BITS;
+} __packed;
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliary search tree - when we're done searching the bset_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE         128
+
+/* Space required for the btree node keys */
+static inline size_t btree_keys_bytes(struct btree_keys *b)
 {
-       return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
-               ~((uint64_t)1 << 63);
+       return PAGE_SIZE << b->page_order;
 }
 
-/* Tries to merge l and r: l should be lower than r
- * Returns true if we were able to merge. If we did merge, l will be the merged
- * key, r will be untouched.
- */
-bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r)
+static inline size_t btree_keys_cachelines(struct btree_keys *b)
 {
-       unsigned i;
+       return btree_keys_bytes(b) / BSET_CACHELINE;
+}
 
-       if (key_merging_disabled(b->c))
-               return false;
+/* Space required for the auxiliary search trees */
+static inline size_t bset_tree_bytes(struct btree_keys *b)
+{
+       return btree_keys_cachelines(b) * sizeof(struct bkey_float);
+}
 
-       if (KEY_PTRS(l) != KEY_PTRS(r) ||
-           KEY_DIRTY(l) != KEY_DIRTY(r) ||
-           bkey_cmp(l, &START_KEY(r)))
-               return false;
+/* Space required for the prev pointers */
+static inline size_t bset_prev_bytes(struct btree_keys *b)
+{
+       return btree_keys_cachelines(b) * sizeof(uint8_t);
+}
 
-       for (i = 0; i < KEY_PTRS(l); i++)
-               if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
-                   PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
-                       return false;
+/* Memory allocation */
 
-       /* Keys with no pointers aren't restricted to one bucket and could
-        * overflow KEY_SIZE
-        */
-       if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
-               SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
-               SET_KEY_SIZE(l, USHRT_MAX);
+void bch_btree_keys_free(struct btree_keys *b)
+{
+       struct bset_tree *t = b->set;
 
-               bch_cut_front(l, r);
-               return false;
-       }
+       if (bset_prev_bytes(b) < PAGE_SIZE)
+               kfree(t->prev);
+       else
+               free_pages((unsigned long) t->prev,
+                          get_order(bset_prev_bytes(b)));
 
-       if (KEY_CSUM(l)) {
-               if (KEY_CSUM(r))
-                       l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
-               else
-                       SET_KEY_CSUM(l, 0);
-       }
+       if (bset_tree_bytes(b) < PAGE_SIZE)
+               kfree(t->tree);
+       else
+               free_pages((unsigned long) t->tree,
+                          get_order(bset_tree_bytes(b)));
 
-       SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
-       SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
+       free_pages((unsigned long) t->data, b->page_order);
 
-       return true;
+       t->prev = NULL;
+       t->tree = NULL;
+       t->data = NULL;
+}
+EXPORT_SYMBOL(bch_btree_keys_free);
+
+int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp)
+{
+       struct bset_tree *t = b->set;
+
+       BUG_ON(t->data);
+
+       b->page_order = page_order;
+
+       t->data = (void *) __get_free_pages(gfp, b->page_order);
+       if (!t->data)
+               goto err;
+
+       t->tree = bset_tree_bytes(b) < PAGE_SIZE
+               ? kmalloc(bset_tree_bytes(b), gfp)
+               : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
+       if (!t->tree)
+               goto err;
+
+       t->prev = bset_prev_bytes(b) < PAGE_SIZE
+               ? kmalloc(bset_prev_bytes(b), gfp)
+               : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
+       if (!t->prev)
+               goto err;
+
+       return 0;
+err:
+       bch_btree_keys_free(b);
+       return -ENOMEM;
 }
+EXPORT_SYMBOL(bch_btree_keys_alloc);
+
+void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
+                        bool *expensive_debug_checks)
+{
+       unsigned i;
+
+       b->ops = ops;
+       b->expensive_debug_checks = expensive_debug_checks;
+       b->nsets = 0;
+       b->last_set_unwritten = 0;
+
+       /* XXX: shouldn't be needed */
+       for (i = 0; i < MAX_BSETS; i++)
+               b->set[i].size = 0;
+       /*
+        * Second loop starts at 1 because b->keys[0]->data is the memory we
+        * allocated
+        */
+       for (i = 1; i < MAX_BSETS; i++)
+               b->set[i].data = NULL;
+}
+EXPORT_SYMBOL(bch_btree_keys_init);
 
 /* Binary tree stuff for auxiliary search trees */
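
With BSET_CACHELINE at 128 and a packed 32-bit bkey_float, the auxiliary
structures stay small next to the keys they index. For an assumed 16 KiB btree
node (page_order 2), for instance:

    #include <stdio.h>

    #define BSET_CACHELINE  128

    int main(void)
    {
        size_t node_bytes = 4096UL << 2;        /* assumed 16 KiB node */
        size_t cachelines = node_bytes / BSET_CACHELINE;

        /* one 4-byte bkey_float and one uint8_t prev entry per cacheline */
        printf("bkey_float tree: %zu bytes\n", cachelines * 4);
        printf("prev array:      %zu bytes\n", cachelines * 1);
        return 0;
    }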
 
@@ -455,9 +511,11 @@ static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
        return ((void *) k - (void *) t->data) / BSET_CACHELINE;
 }
 
-static unsigned bkey_to_cacheline_offset(struct bkey *k)
+static unsigned bkey_to_cacheline_offset(struct bset_tree *t,
+                                        unsigned cacheline,
+                                        struct bkey *k)
 {
-       return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t);
+       return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0);
 }
 
 static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
@@ -504,7 +562,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
                : tree_to_prev_bkey(t, j >> ffs(j));
 
        struct bkey *r = is_power_of_2(j + 1)
-               ? node(t->data, t->data->keys - bkey_u64s(&t->end))
+               ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end))
                : tree_to_bkey(t, j >> (ffz(j) + 1));
 
        BUG_ON(m < l || m > r);
@@ -528,9 +586,9 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
                f->exponent = 127;
 }
 
-static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
+static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t)
 {
-       if (t != b->sets) {
+       if (t != b->set) {
                unsigned j = roundup(t[-1].size,
                                     64 / sizeof(struct bkey_float));
 
@@ -538,33 +596,54 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
                t->prev = t[-1].prev + j;
        }
 
-       while (t < b->sets + MAX_BSETS)
+       while (t < b->set + MAX_BSETS)
                t++->size = 0;
 }
 
-static void bset_build_unwritten_tree(struct btree *b)
+static void bch_bset_build_unwritten_tree(struct btree_keys *b)
 {
-       struct bset_tree *t = b->sets + b->nsets;
+       struct bset_tree *t = bset_tree_last(b);
+
+       BUG_ON(b->last_set_unwritten);
+       b->last_set_unwritten = 1;
 
        bset_alloc_tree(b, t);
 
-       if (t->tree != b->sets->tree + bset_tree_space(b)) {
-               t->prev[0] = bkey_to_cacheline_offset(t->data->start);
+       if (t->tree != b->set->tree + btree_keys_cachelines(b)) {
+               t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start);
                t->size = 1;
        }
 }
 
-static void bset_build_written_tree(struct btree *b)
+void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
+{
+       if (i != b->set->data) {
+               b->set[++b->nsets].data = i;
+               i->seq = b->set->data->seq;
+       } else
+               get_random_bytes(&i->seq, sizeof(uint64_t));
+
+       i->magic        = magic;
+       i->version      = 0;
+       i->keys         = 0;
+
+       bch_bset_build_unwritten_tree(b);
+}
+EXPORT_SYMBOL(bch_bset_init_next);
+
+void bch_bset_build_written_tree(struct btree_keys *b)
 {
-       struct bset_tree *t = b->sets + b->nsets;
-       struct bkey *k = t->data->start;
+       struct bset_tree *t = bset_tree_last(b);
+       struct bkey *prev = NULL, *k = t->data->start;
        unsigned j, cacheline = 1;
 
+       b->last_set_unwritten = 0;
+
        bset_alloc_tree(b, t);
 
        t->size = min_t(unsigned,
-                       bkey_to_cacheline(t, end(t->data)),
-                       b->sets->tree + bset_tree_space(b) - t->tree);
+                       bkey_to_cacheline(t, bset_bkey_last(t->data)),
+                       b->set->tree + btree_keys_cachelines(b) - t->tree);
 
        if (t->size < 2) {
                t->size = 0;
@@ -577,16 +656,14 @@ static void bset_build_written_tree(struct btree *b)
        for (j = inorder_next(0, t->size);
             j;
             j = inorder_next(j, t->size)) {
-               while (bkey_to_cacheline(t, k) != cacheline)
-                       k = bkey_next(k);
+               while (bkey_to_cacheline(t, k) < cacheline)
+                       prev = k, k = bkey_next(k);
 
-               t->prev[j] = bkey_u64s(k);
-               k = bkey_next(k);
-               cacheline++;
-               t->tree[j].m = bkey_to_cacheline_offset(k);
+               t->prev[j] = bkey_u64s(prev);
+               t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k);
        }
 
-       while (bkey_next(k) != end(t->data))
+       while (bkey_next(k) != bset_bkey_last(t->data))
                k = bkey_next(k);
 
        t->end = *k;
@@ -597,14 +674,17 @@ static void bset_build_written_tree(struct btree *b)
             j = inorder_next(j, t->size))
                make_bfloat(t, j);
 }
+EXPORT_SYMBOL(bch_bset_build_written_tree);
 
-void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k)
+/* Insert */
+
+void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k)
 {
        struct bset_tree *t;
        unsigned inorder, j = 1;
 
-       for (t = b->sets; t <= &b->sets[b->nsets]; t++)
-               if (k < end(t->data))
+       for (t = b->set; t <= bset_tree_last(b); t++)
+               if (k < bset_bkey_last(t->data))
                        goto found_set;
 
        BUG();
@@ -617,7 +697,7 @@ found_set:
        if (k == t->data->start)
                goto fix_left;
 
-       if (bkey_next(k) == end(t->data)) {
+       if (bkey_next(k) == bset_bkey_last(t->data)) {
                t->end = *k;
                goto fix_right;
        }
@@ -642,10 +722,12 @@ fix_right:        do {
                        j = j * 2 + 1;
                } while (j < t->size);
 }
+EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
 
-void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
+static void bch_bset_fix_lookup_table(struct btree_keys *b,
+                                     struct bset_tree *t,
+                                     struct bkey *k)
 {
-       struct bset_tree *t = &b->sets[b->nsets];
        unsigned shift = bkey_u64s(k);
        unsigned j = bkey_to_cacheline(t, k);
 
@@ -657,8 +739,8 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
         * lookup table for the first key that is strictly greater than k:
         * it's either k's cacheline or the next one
         */
-       if (j < t->size &&
-           table_to_bkey(t, j) <= k)
+       while (j < t->size &&
+              table_to_bkey(t, j) <= k)
                j++;
 
        /* Adjust all the lookup table entries, and find a new key for any that
@@ -673,54 +755,124 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
                        while (k < cacheline_to_bkey(t, j, 0))
                                k = bkey_next(k);
 
-                       t->prev[j] = bkey_to_cacheline_offset(k);
+                       t->prev[j] = bkey_to_cacheline_offset(t, j, k);
                }
        }
 
-       if (t->size == b->sets->tree + bset_tree_space(b) - t->tree)
+       if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree)
                return;
 
        /* Possibly add a new entry to the end of the lookup table */
 
        for (k = table_to_bkey(t, t->size - 1);
-            k != end(t->data);
+            k != bset_bkey_last(t->data);
             k = bkey_next(k))
                if (t->size == bkey_to_cacheline(t, k)) {
-                       t->prev[t->size] = bkey_to_cacheline_offset(k);
+                       t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k);
                        t->size++;
                }
 }
 
-void bch_bset_init_next(struct btree *b)
+/*
+ * Tries to merge l and r: l should be lower than r
+ * Returns true if we were able to merge. If we did merge, l will be the merged
+ * key, r will be untouched.
+ */
+bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
 {
-       struct bset *i = write_block(b);
+       if (!b->ops->key_merge)
+               return false;
 
-       if (i != b->sets[0].data) {
-               b->sets[++b->nsets].data = i;
-               i->seq = b->sets[0].data->seq;
-       } else
-               get_random_bytes(&i->seq, sizeof(uint64_t));
+       /*
+        * Generic header checks.
+        * Assumes left and right are in order, and that left ends exactly
+        * where right begins (the keys must be contiguous to merge).
+        */
+       if (!bch_bkey_equal_header(l, r) ||
+            bkey_cmp(l, &START_KEY(r)))
+               return false;
 
-       i->magic        = bset_magic(&b->c->sb);
-       i->version      = 0;
-       i->keys         = 0;
+       return b->ops->key_merge(b, l, r);
+}
+EXPORT_SYMBOL(bch_bkey_try_merge);
 
-       bset_build_unwritten_tree(b);
+void bch_bset_insert(struct btree_keys *b, struct bkey *where,
+                    struct bkey *insert)
+{
+       struct bset_tree *t = bset_tree_last(b);
+
+       BUG_ON(!b->last_set_unwritten);
+       BUG_ON(bset_byte_offset(b, t->data) +
+              __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) >
+              PAGE_SIZE << b->page_order);
+
+       memmove((uint64_t *) where + bkey_u64s(insert),
+               where,
+               (void *) bset_bkey_last(t->data) - (void *) where);
+
+       t->data->keys += bkey_u64s(insert);
+       bkey_copy(where, insert);
+       bch_bset_fix_lookup_table(b, t, where);
 }
+EXPORT_SYMBOL(bch_bset_insert);
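bch_bset_insert() opens a gap at the insert position with a single memmove() of everything up to the end of the set, copies the new key into the gap, and then patches the lookup table. The move-and-copy step on a plain int array, purely as an illustration:

#include <stdio.h>
#include <string.h>

int main(void)
{
	int keys[8] = { 1, 3, 5, 7 };
	unsigned nr = 4, pos = 2, i;	/* insert before keys[2] */
	int new_key = 4;

	/* shift the tail up one slot to open a gap at pos */
	memmove(&keys[pos + 1], &keys[pos], (nr - pos) * sizeof(keys[0]));
	keys[pos] = new_key;
	nr++;

	for (i = 0; i < nr; i++)
		printf("%d ", keys[i]);
	printf("\n");			/* 1 3 4 5 7 */
	return 0;
}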
+
+unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
+                             struct bkey *replace_key)
+{
+       unsigned status = BTREE_INSERT_STATUS_NO_INSERT;
+       struct bset *i = bset_tree_last(b)->data;
+       struct bkey *m, *prev = NULL;
+       struct btree_iter iter;
+
+       BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
+
+       m = bch_btree_iter_init(b, &iter, b->ops->is_extents
+                               ? PRECEDING_KEY(&START_KEY(k))
+                               : PRECEDING_KEY(k));
+
+       if (b->ops->insert_fixup(b, k, &iter, replace_key))
+               return status;
+
+       status = BTREE_INSERT_STATUS_INSERT;
+
+       while (m != bset_bkey_last(i) &&
+              bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0)
+               prev = m, m = bkey_next(m);
+
+       /* prev is in the tree, if we merge we're done */
+       status = BTREE_INSERT_STATUS_BACK_MERGE;
+       if (prev &&
+           bch_bkey_try_merge(b, prev, k))
+               goto merged;
+#if 0
+       status = BTREE_INSERT_STATUS_OVERWROTE;
+       if (m != bset_bkey_last(i) &&
+           KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
+               goto copy;
+#endif
+       status = BTREE_INSERT_STATUS_FRONT_MERGE;
+       if (m != bset_bkey_last(i) &&
+           bch_bkey_try_merge(b, k, m))
+               goto copy;
+
+       bch_bset_insert(b, m, k);
+copy:  bkey_copy(m, k);
+merged:
+       return status;
+}
+EXPORT_SYMBOL(bch_btree_insert_key);
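bch_btree_insert_key() walks forward to the insertion point and then tries a back merge with the preceding key and a front merge with the following one; for extents, merging just means one key absorbing a neighbour that starts exactly where it ends. A rough standalone sketch of that check with an invented offset/size struct (bcache's KEY_OFFSET is likewise the end of the extent):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_extent {
	uint64_t end;	/* offset of the first byte after the extent */
	uint64_t size;
};

/* merge r into l if l stops exactly where r starts; r is left untouched */
static bool toy_try_merge(struct toy_extent *l, const struct toy_extent *r)
{
	if (l->end != r->end - r->size)
		return false;

	l->end	+= r->size;
	l->size	+= r->size;
	return true;
}

int main(void)
{
	struct toy_extent l = { .end = 100, .size = 20 };	/* [80, 100) */
	struct toy_extent r = { .end = 130, .size = 30 };	/* [100, 130) */

	if (toy_try_merge(&l, &r))
		printf("merged: [%llu, %llu)\n",
		       (unsigned long long)(l.end - l.size),
		       (unsigned long long)l.end);
	return 0;
}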
+
+/* Lookup */
 
 struct bset_search_iter {
        struct bkey *l, *r;
 };
 
-static struct bset_search_iter bset_search_write_set(struct btree *b,
-                                                    struct bset_tree *t,
+static struct bset_search_iter bset_search_write_set(struct bset_tree *t,
                                                     const struct bkey *search)
 {
        unsigned li = 0, ri = t->size;
 
-       BUG_ON(!b->nsets &&
-              t->size < bkey_to_cacheline(t, end(t->data)));
-
        while (li + 1 != ri) {
                unsigned m = (li + ri) >> 1;
 
@@ -732,12 +884,11 @@ static struct bset_search_iter bset_search_write_set(struct btree *b,
 
        return (struct bset_search_iter) {
                table_to_bkey(t, li),
-               ri < t->size ? table_to_bkey(t, ri) : end(t->data)
+               ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data)
        };
 }
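bset_search_write_set() binary-searches the t->prev[] lookup table to find the cacheline that must contain the answer, and __bch_bset_search() finishes with a linear scan between the two table entries it gets back. A self-contained sketch of the same two-level search over plain ints, with invented names and a made-up "cacheline" of four keys:

#include <stdio.h>

#define KEYS_PER_LINE 4

static const int keys[] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37 };
static const unsigned nkeys = sizeof(keys) / sizeof(keys[0]);

/* lookup table: index of the first key in each "cacheline" */
static unsigned table[sizeof(keys) / sizeof(keys[0]) / KEYS_PER_LINE + 1];
static unsigned table_size;

static void build_table(void)
{
	unsigned i;

	for (i = 0; i < nkeys; i += KEYS_PER_LINE)
		table[table_size++] = i;
}

/* index of the first key strictly greater than search_key */
static unsigned toy_search(int search_key)
{
	unsigned li = 0, ri = table_size, i;

	/* binary search over cachelines */
	while (li + 1 != ri) {
		unsigned m = (li + ri) >> 1;

		if (keys[table[m]] > search_key)
			ri = m;
		else
			li = m;
	}

	/* linear scan from the chosen cacheline */
	for (i = table[li]; i < nkeys; i++)
		if (keys[i] > search_key)
			return i;

	return nkeys;
}

int main(void)
{
	build_table();
	printf("first key > 10 is %d\n", keys[toy_search(10)]);	/* 11 */
	return 0;
}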
 
-static struct bset_search_iter bset_search_tree(struct btree *b,
-                                               struct bset_tree *t,
+static struct bset_search_iter bset_search_tree(struct bset_tree *t,
                                                const struct bkey *search)
 {
        struct bkey *l, *r;
@@ -784,7 +935,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b,
                        f = &t->tree[inorder_next(j, t->size)];
                        r = cacheline_to_bkey(t, inorder, f->m);
                } else
-                       r = end(t->data);
+                       r = bset_bkey_last(t->data);
        } else {
                r = cacheline_to_bkey(t, inorder, f->m);
 
@@ -798,7 +949,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b,
        return (struct bset_search_iter) {l, r};
 }
 
-struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
+struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
                               const struct bkey *search)
 {
        struct bset_search_iter i;
@@ -820,7 +971,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
 
        if (unlikely(!t->size)) {
                i.l = t->data->start;
-               i.r = end(t->data);
+               i.r = bset_bkey_last(t->data);
        } else if (bset_written(b, t)) {
                /*
                 * Each node in the auxiliary search tree covers a certain range
@@ -830,23 +981,27 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
                 */
 
                if (unlikely(bkey_cmp(search, &t->end) >= 0))
-                       return end(t->data);
+                       return bset_bkey_last(t->data);
 
                if (unlikely(bkey_cmp(search, t->data->start) < 0))
                        return t->data->start;
 
-               i = bset_search_tree(b, t, search);
-       } else
-               i = bset_search_write_set(b, t, search);
+               i = bset_search_tree(t, search);
+       } else {
+               BUG_ON(!b->nsets &&
+                      t->size < bkey_to_cacheline(t, bset_bkey_last(t->data)));
 
-       if (expensive_debug_checks(b->c)) {
+               i = bset_search_write_set(t, search);
+       }
+
+       if (btree_keys_expensive_checks(b)) {
                BUG_ON(bset_written(b, t) &&
                       i.l != t->data->start &&
                       bkey_cmp(tree_to_prev_bkey(t,
                          inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
                                search) > 0);
 
-               BUG_ON(i.r != end(t->data) &&
+               BUG_ON(i.r != bset_bkey_last(t->data) &&
                       bkey_cmp(i.r, search) <= 0);
        }
 
@@ -856,22 +1011,17 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
 
        return i.l;
 }
+EXPORT_SYMBOL(__bch_bset_search);
 
 /* Btree iterator */
 
-/*
- * Returns true if l > r - unless l == r, in which case returns true if l is
- * older than r.
- *
- * Necessary for btree_sort_fixup() - if there are multiple keys that compare
- * equal in different sets, we have to process them newest to oldest.
- */
+typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
+                                struct btree_iter_set);
+
 static inline bool btree_iter_cmp(struct btree_iter_set l,
                                  struct btree_iter_set r)
 {
-       int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
-
-       return c ? c > 0 : l.k < r.k;
+       return bkey_cmp(l.k, r.k) > 0;
 }
 
 static inline bool btree_iter_end(struct btree_iter *iter)
@@ -888,8 +1038,10 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
                                 btree_iter_cmp));
 }
 
-struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
-                                  struct bkey *search, struct bset_tree *start)
+static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
+                                         struct btree_iter *iter,
+                                         struct bkey *search,
+                                         struct bset_tree *start)
 {
        struct bkey *ret = NULL;
        iter->size = ARRAY_SIZE(iter->data);
@@ -899,15 +1051,24 @@ struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
        iter->b = b;
 #endif
 
-       for (; start <= &b->sets[b->nsets]; start++) {
+       for (; start <= bset_tree_last(b); start++) {
                ret = bch_bset_search(b, start, search);
-               bch_btree_iter_push(iter, ret, end(start->data));
+               bch_btree_iter_push(iter, ret, bset_bkey_last(start->data));
        }
 
        return ret;
 }
 
-struct bkey *bch_btree_iter_next(struct btree_iter *iter)
+struct bkey *bch_btree_iter_init(struct btree_keys *b,
+                                struct btree_iter *iter,
+                                struct bkey *search)
+{
+       return __bch_btree_iter_init(b, iter, search, b->set);
+}
+EXPORT_SYMBOL(bch_btree_iter_init);
+
+static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
+                                                btree_iter_cmp_fn *cmp)
 {
        struct btree_iter_set unused;
        struct bkey *ret = NULL;
@@ -924,16 +1085,23 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter)
                }
 
                if (iter->data->k == iter->data->end)
-                       heap_pop(iter, unused, btree_iter_cmp);
+                       heap_pop(iter, unused, cmp);
                else
-                       heap_sift(iter, 0, btree_iter_cmp);
+                       heap_sift(iter, 0, cmp);
        }
 
        return ret;
 }
 
+struct bkey *bch_btree_iter_next(struct btree_iter *iter)
+{
+       return __bch_btree_iter_next(iter, btree_iter_cmp);
+}
+EXPORT_SYMBOL(bch_btree_iter_next);
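bch_btree_iter_next() treats the iterator as a small binary heap with one cursor per sorted set: return the smallest current key, advance that cursor, drop it when it is exhausted, and re-sift the heap. A self-contained sketch of the same multiway merge over plain int arrays (all names invented):

#include <stdio.h>

#define MAX_SETS 4

struct toy_cursor {
	const int *k, *end;
};

struct toy_iter {
	unsigned used;
	struct toy_cursor data[MAX_SETS];
};

/* true if l's current key sorts after r's, i.e. l belongs below r */
static int toy_cmp(struct toy_cursor l, struct toy_cursor r)
{
	return *l.k > *r.k;
}

static void toy_sift(struct toy_iter *iter, unsigned i)
{
	for (;;) {
		unsigned l = i * 2 + 1, r = i * 2 + 2, smallest = i;
		struct toy_cursor tmp;

		if (l < iter->used && toy_cmp(iter->data[smallest], iter->data[l]))
			smallest = l;
		if (r < iter->used && toy_cmp(iter->data[smallest], iter->data[r]))
			smallest = r;
		if (smallest == i)
			break;

		tmp = iter->data[i];
		iter->data[i] = iter->data[smallest];
		iter->data[smallest] = tmp;
		i = smallest;
	}
}

static void toy_push(struct toy_iter *iter, const int *start, unsigned n)
{
	int i;

	iter->data[iter->used].k = start;
	iter->data[iter->used].end = start + n;
	iter->used++;

	for (i = (int) iter->used / 2 - 1; i >= 0; --i)
		toy_sift(iter, i);
}

/* next key in global sorted order, or NULL when every set is exhausted */
static const int *toy_next(struct toy_iter *iter)
{
	const int *ret;

	if (!iter->used)
		return NULL;

	ret = iter->data[0].k++;

	if (iter->data[0].k == iter->data[0].end)
		iter->data[0] = iter->data[--iter->used];

	toy_sift(iter, 0);
	return ret;
}

int main(void)
{
	static const int a[] = { 1, 4, 9 }, b[] = { 2, 3, 10 }, c[] = { 5 };
	struct toy_iter iter = { 0 };
	const int *k;

	toy_push(&iter, a, 3);
	toy_push(&iter, b, 3);
	toy_push(&iter, c, 1);

	while ((k = toy_next(&iter)))
		printf("%d ", *k);
	printf("\n");	/* 1 2 3 4 5 9 10 */
	return 0;
}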
+
 struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
-                                       struct btree *b, ptr_filter_fn fn)
+                                       struct btree_keys *b, ptr_filter_fn fn)
 {
        struct bkey *ret;
 
@@ -946,70 +1114,58 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
 
 /* Mergesort */
 
-static void sort_key_next(struct btree_iter *iter,
-                         struct btree_iter_set *i)
+void bch_bset_sort_state_free(struct bset_sort_state *state)
 {
-       i->k = bkey_next(i->k);
-
-       if (i->k == i->end)
-               *i = iter->data[--iter->used];
+       if (state->pool)
+               mempool_destroy(state->pool);
 }
 
-static void btree_sort_fixup(struct btree_iter *iter)
+int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
 {
-       while (iter->used > 1) {
-               struct btree_iter_set *top = iter->data, *i = top + 1;
+       spin_lock_init(&state->time.lock);
 
-               if (iter->used > 2 &&
-                   btree_iter_cmp(i[0], i[1]))
-                       i++;
+       state->page_order = page_order;
+       state->crit_factor = int_sqrt(1 << page_order);
 
-               if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
-                       break;
-
-               if (!KEY_SIZE(i->k)) {
-                       sort_key_next(iter, i);
-                       heap_sift(iter, i - top, btree_iter_cmp);
-                       continue;
-               }
-
-               if (top->k > i->k) {
-                       if (bkey_cmp(top->k, i->k) >= 0)
-                               sort_key_next(iter, i);
-                       else
-                               bch_cut_front(top->k, i->k);
+       state->pool = mempool_create_page_pool(1, page_order);
+       if (!state->pool)
+               return -ENOMEM;
 
-                       heap_sift(iter, i - top, btree_iter_cmp);
-               } else {
-                       /* can't happen because of comparison func */
-                       BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
-                       bch_cut_back(&START_KEY(i->k), top->k);
-               }
-       }
+       return 0;
 }
+EXPORT_SYMBOL(bch_bset_sort_state_init);
 
-static void btree_mergesort(struct btree *b, struct bset *out,
+static void btree_mergesort(struct btree_keys *b, struct bset *out,
                            struct btree_iter *iter,
                            bool fixup, bool remove_stale)
 {
+       int i;
        struct bkey *k, *last = NULL;
-       bool (*bad)(struct btree *, const struct bkey *) = remove_stale
+       BKEY_PADDED(k) tmp;
+       bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
                ? bch_ptr_bad
                : bch_ptr_invalid;
 
+       /* Heapify the iterator, using our comparison function */
+       for (i = iter->used / 2 - 1; i >= 0; --i)
+               heap_sift(iter, i, b->ops->sort_cmp);
+
        while (!btree_iter_end(iter)) {
-               if (fixup && !b->level)
-                       btree_sort_fixup(iter);
+               if (b->ops->sort_fixup && fixup)
+                       k = b->ops->sort_fixup(iter, &tmp.k);
+               else
+                       k = NULL;
+
+               if (!k)
+                       k = __bch_btree_iter_next(iter, b->ops->sort_cmp);
 
-               k = bch_btree_iter_next(iter);
                if (bad(b, k))
                        continue;
 
                if (!last) {
                        last = out->start;
                        bkey_copy(last, k);
-               } else if (b->level ||
-                          !bch_bkey_try_merge(b, last, k)) {
+               } else if (!bch_bkey_try_merge(b, last, k)) {
                        last = bkey_next(last);
                        bkey_copy(last, k);
                }
@@ -1020,27 +1176,27 @@ static void btree_mergesort(struct btree *b, struct bset *out,
        pr_debug("sorted %i keys", out->keys);
 }
 
-static void __btree_sort(struct btree *b, struct btree_iter *iter,
-                        unsigned start, unsigned order, bool fixup)
+static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
+                        unsigned start, unsigned order, bool fixup,
+                        struct bset_sort_state *state)
 {
        uint64_t start_time;
-       bool remove_stale = !b->written;
+       bool used_mempool = false;
        struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
                                                     order);
        if (!out) {
-               mutex_lock(&b->c->sort_lock);
-               out = b->c->sort;
-               order = ilog2(bucket_pages(b->c));
+               BUG_ON(order > state->page_order);
+
+               out = page_address(mempool_alloc(state->pool, GFP_NOIO));
+               used_mempool = true;
+               order = state->page_order;
        }
 
        start_time = local_clock();
 
-       btree_mergesort(b, out, iter, fixup, remove_stale);
+       btree_mergesort(b, out, iter, fixup, false);
        b->nsets = start;
 
-       if (!fixup && !start && b->written)
-               bch_btree_verify(b, out);
-
        if (!start && order == b->page_order) {
                /*
                 * Our temporary buffer is the same size as the btree node's
@@ -1048,84 +1204,76 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
                 * memcpy()
                 */
 
-               out->magic      = bset_magic(&b->c->sb);
-               out->seq        = b->sets[0].data->seq;
-               out->version    = b->sets[0].data->version;
-               swap(out, b->sets[0].data);
-
-               if (b->c->sort == b->sets[0].data)
-                       b->c->sort = out;
+               out->magic      = b->set->data->magic;
+               out->seq        = b->set->data->seq;
+               out->version    = b->set->data->version;
+               swap(out, b->set->data);
        } else {
-               b->sets[start].data->keys = out->keys;
-               memcpy(b->sets[start].data->start, out->start,
-                      (void *) end(out) - (void *) out->start);
+               b->set[start].data->keys = out->keys;
+               memcpy(b->set[start].data->start, out->start,
+                      (void *) bset_bkey_last(out) - (void *) out->start);
        }
 
-       if (out == b->c->sort)
-               mutex_unlock(&b->c->sort_lock);
+       if (used_mempool)
+               mempool_free(virt_to_page(out), state->pool);
        else
                free_pages((unsigned long) out, order);
 
-       if (b->written)
-               bset_build_written_tree(b);
+       bch_bset_build_written_tree(b);
 
        if (!start)
-               bch_time_stats_update(&b->c->sort_time, start_time);
+               bch_time_stats_update(&state->time, start_time);
 }
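__btree_sort() first tries an opportunistic page allocation and only falls back to the mempool sized up in bch_bset_sort_state_init(), so a sort can always make progress under memory pressure. A loose userspace analogue of that fallback, with a static reserve buffer standing in for the mempool (invented names):

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#define RESERVE_BYTES 4096

/* one buffer that is guaranteed to exist, like the pre-allocated mempool */
static char reserve[RESERVE_BYTES];
static bool reserve_in_use;

static void *sort_buf_get(size_t bytes, bool *used_reserve)
{
	void *buf = malloc(bytes);

	if (buf) {
		*used_reserve = false;
		return buf;
	}

	/* fall back to the reserve; never ask it for more than it holds */
	if (bytes > RESERVE_BYTES || reserve_in_use)
		return NULL;

	reserve_in_use = true;
	*used_reserve = true;
	return reserve;
}

static void sort_buf_put(void *buf, bool used_reserve)
{
	if (used_reserve)
		reserve_in_use = false;
	else
		free(buf);
}

int main(void)
{
	bool used_reserve;
	void *buf = sort_buf_get(1024, &used_reserve);

	if (buf) {
		memset(buf, 0, 1024);
		sort_buf_put(buf, used_reserve);
	}
	return 0;
}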
 
-void bch_btree_sort_partial(struct btree *b, unsigned start)
+void bch_btree_sort_partial(struct btree_keys *b, unsigned start,
+                           struct bset_sort_state *state)
 {
        size_t order = b->page_order, keys = 0;
        struct btree_iter iter;
        int oldsize = bch_count_data(b);
 
-       __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]);
-
-       BUG_ON(b->sets[b->nsets].data == write_block(b) &&
-              (b->sets[b->nsets].size || b->nsets));
-
+       __bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
 
        if (start) {
                unsigned i;
 
                for (i = start; i <= b->nsets; i++)
-                       keys += b->sets[i].data->keys;
+                       keys += b->set[i].data->keys;
 
-               order = roundup_pow_of_two(__set_bytes(b->sets->data,
-                                                      keys)) / PAGE_SIZE;
-               if (order)
-                       order = ilog2(order);
+               order = get_order(__set_bytes(b->set->data, keys));
        }
 
-       __btree_sort(b, &iter, start, order, false);
+       __btree_sort(b, &iter, start, order, false, state);
 
-       EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize);
+       EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
 }
+EXPORT_SYMBOL(bch_btree_sort_partial);
 
-void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter)
+void bch_btree_sort_and_fix_extents(struct btree_keys *b,
+                                   struct btree_iter *iter,
+                                   struct bset_sort_state *state)
 {
-       BUG_ON(!b->written);
-       __btree_sort(b, iter, 0, b->page_order, true);
+       __btree_sort(b, iter, 0, b->page_order, true, state);
 }
 
-void bch_btree_sort_into(struct btree *b, struct btree *new)
+void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
+                        struct bset_sort_state *state)
 {
        uint64_t start_time = local_clock();
 
        struct btree_iter iter;
        bch_btree_iter_init(b, &iter, NULL);
 
-       btree_mergesort(b, new->sets->data, &iter, false, true);
+       btree_mergesort(b, new->set->data, &iter, false, true);
 
-       bch_time_stats_update(&b->c->sort_time, start_time);
+       bch_time_stats_update(&state->time, start_time);
 
-       bkey_copy_key(&new->key, &b->key);
-       new->sets->size = 0;
+       new->set->size = 0; /* XXX: why? */
 }
 
 #define SORT_CRIT      (4096 / sizeof(uint64_t))
 
-void bch_btree_sort_lazy(struct btree *b)
+void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state)
 {
        unsigned crit = SORT_CRIT;
        int i;
@@ -1134,50 +1282,32 @@ void bch_btree_sort_lazy(struct btree *b)
        if (!b->nsets)
                goto out;
 
-       /* If not a leaf node, always sort */
-       if (b->level) {
-               bch_btree_sort(b);
-               return;
-       }
-
        for (i = b->nsets - 1; i >= 0; --i) {
-               crit *= b->c->sort_crit_factor;
+               crit *= state->crit_factor;
 
-               if (b->sets[i].data->keys < crit) {
-                       bch_btree_sort_partial(b, i);
+               if (b->set[i].data->keys < crit) {
+                       bch_btree_sort_partial(b, i, state);
                        return;
                }
        }
 
        /* Sort if we'd overflow */
        if (b->nsets + 1 == MAX_BSETS) {
-               bch_btree_sort(b);
+               bch_btree_sort(b, state);
                return;
        }
 
 out:
-       bset_build_written_tree(b);
+       bch_bset_build_written_tree(b);
 }
+EXPORT_SYMBOL(bch_btree_sort_lazy);
 
-/* Sysfs stuff */
-
-struct bset_stats {
-       struct btree_op op;
-       size_t nodes;
-       size_t sets_written, sets_unwritten;
-       size_t bytes_written, bytes_unwritten;
-       size_t floats, failed;
-};
-
-static int btree_bset_stats(struct btree_op *op, struct btree *b)
+void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
 {
-       struct bset_stats *stats = container_of(op, struct bset_stats, op);
        unsigned i;
 
-       stats->nodes++;
-
        for (i = 0; i <= b->nsets; i++) {
-               struct bset_tree *t = &b->sets[i];
+               struct bset_tree *t = &b->set[i];
                size_t bytes = t->data->keys * sizeof(uint64_t);
                size_t j;
 
@@ -1195,32 +1325,4 @@ static int btree_bset_stats(struct btree_op *op, struct btree *b)
                        stats->bytes_unwritten += bytes;
                }
        }
-
-       return MAP_CONTINUE;
-}
-
-int bch_bset_print_stats(struct cache_set *c, char *buf)
-{
-       struct bset_stats t;
-       int ret;
-
-       memset(&t, 0, sizeof(struct bset_stats));
-       bch_btree_op_init(&t.op, -1);
-
-       ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats);
-       if (ret < 0)
-               return ret;
-
-       return snprintf(buf, PAGE_SIZE,
-                       "btree nodes:           %zu\n"
-                       "written sets:          %zu\n"
-                       "unwritten sets:                %zu\n"
-                       "written key bytes:     %zu\n"
-                       "unwritten key bytes:   %zu\n"
-                       "floats:                        %zu\n"
-                       "failed:                        %zu\n",
-                       t.nodes,
-                       t.sets_written, t.sets_unwritten,
-                       t.bytes_written, t.bytes_unwritten,
-                       t.floats, t.failed);
 }
index 1d3c24f9fa0e95fbb20c03fcb825559b0736de1b..003260f4ddf6e725417956531e64c9cb33022ae4 100644 (file)
@@ -1,7 +1,11 @@
 #ifndef _BCACHE_BSET_H
 #define _BCACHE_BSET_H
 
-#include <linux/slab.h>
+#include <linux/bcache.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "util.h" /* for time_stats */
 
 /*
  * BKEYS:
  * first key in that range of bytes again.
  */
 
-/* Btree key comparison/iteration */
+struct btree_keys;
+struct btree_iter;
+struct btree_iter_set;
+struct bkey_float;
 
 #define MAX_BSETS              4U
 
-struct btree_iter {
-       size_t size, used;
-#ifdef CONFIG_BCACHE_DEBUG
-       struct btree *b;
-#endif
-       struct btree_iter_set {
-               struct bkey *k, *end;
-       } data[MAX_BSETS];
-};
-
 struct bset_tree {
        /*
         * We construct a binary tree in an array as if the array
@@ -165,14 +162,14 @@ struct bset_tree {
         */
 
        /* size of the binary tree and prev array */
-       unsigned        size;
+       unsigned                size;
 
        /* function of size - precalculated for to_inorder() */
-       unsigned        extra;
+       unsigned                extra;
 
        /* copy of the last key in the set */
-       struct bkey     end;
-       struct bkey_float *tree;
+       struct bkey             end;
+       struct bkey_float       *tree;
 
        /*
         * The nodes in the bset tree point to specific keys - this
@@ -182,12 +179,219 @@ struct bset_tree {
         * to keep bkey_float to 4 bytes and prev isn't used in the fast
         * path.
         */
-       uint8_t         *prev;
+       uint8_t                 *prev;
 
        /* The actual btree node, with pointers to each sorted set */
-       struct bset     *data;
+       struct bset             *data;
+};
+
+struct btree_keys_ops {
+       bool            (*sort_cmp)(struct btree_iter_set,
+                                   struct btree_iter_set);
+       struct bkey     *(*sort_fixup)(struct btree_iter *, struct bkey *);
+       bool            (*insert_fixup)(struct btree_keys *, struct bkey *,
+                                       struct btree_iter *, struct bkey *);
+       bool            (*key_invalid)(struct btree_keys *,
+                                      const struct bkey *);
+       bool            (*key_bad)(struct btree_keys *, const struct bkey *);
+       bool            (*key_merge)(struct btree_keys *,
+                                    struct bkey *, struct bkey *);
+       void            (*key_to_text)(char *, size_t, const struct bkey *);
+       void            (*key_dump)(struct btree_keys *, const struct bkey *);
+
+       /*
+        * Only used for deciding whether to use START_KEY(k) or just the key
+        * itself in a couple places
+        */
+       bool            is_extents;
+};
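struct btree_keys_ops is the small vtable through which bset.c stays ignorant of whether it is operating on interior-node pointers or extent keys: callers fill it in once and every sort comparison, fixup, validity check and merge dispatches through it. A minimal standalone sketch of the pattern with invented ops:

#include <stdbool.h>
#include <stdio.h>

struct thing;

struct thing_ops {
	bool (*validate)(const struct thing *);
	void (*print)(const struct thing *);
};

struct thing {
	const struct thing_ops *ops;
	int value;
};

static bool positive_validate(const struct thing *t)
{
	return t->value > 0;
}

static void plain_print(const struct thing *t)
{
	printf("value = %d\n", t->value);
}

static const struct thing_ops positive_ops = {
	.validate	= positive_validate,
	.print		= plain_print,
};

int main(void)
{
	struct thing t = { .ops = &positive_ops, .value = 42 };

	/* callers only ever go through the ops table */
	if (t.ops->validate(&t))
		t.ops->print(&t);
	return 0;
}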
+
+struct btree_keys {
+       const struct btree_keys_ops     *ops;
+       uint8_t                 page_order;
+       uint8_t                 nsets;
+       unsigned                last_set_unwritten:1;
+       bool                    *expensive_debug_checks;
+
+       /*
+        * Sets of sorted keys - the real btree node - plus a binary search tree
+        *
+        * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
+        * to the memory we have allocated for this btree node. Additionally,
+        * set[0]->data points to the entire btree node as it exists on disk.
+        */
+       struct bset_tree        set[MAX_BSETS];
+};
+
+static inline struct bset_tree *bset_tree_last(struct btree_keys *b)
+{
+       return b->set + b->nsets;
+}
+
+static inline bool bset_written(struct btree_keys *b, struct bset_tree *t)
+{
+       return t <= b->set + b->nsets - b->last_set_unwritten;
+}
+
+static inline bool bkey_written(struct btree_keys *b, struct bkey *k)
+{
+       return !b->last_set_unwritten || k < b->set[b->nsets].data->start;
+}
+
+static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i)
+{
+       return ((size_t) i) - ((size_t) b->set->data);
+}
+
+static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i)
+{
+       return bset_byte_offset(b, i) >> 9;
+}
+
+#define __set_bytes(i, k)      (sizeof(*(i)) + (k) * sizeof(uint64_t))
+#define set_bytes(i)           __set_bytes(i, i->keys)
+
+#define __set_blocks(i, k, block_bytes)                                \
+       DIV_ROUND_UP(__set_bytes(i, k), block_bytes)
+#define set_blocks(i, block_bytes)                             \
+       __set_blocks(i, (i)->keys, block_bytes)
+
+static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b)
+{
+       struct bset_tree *t = bset_tree_last(b);
+
+       BUG_ON((PAGE_SIZE << b->page_order) <
+              (bset_byte_offset(b, t->data) + set_bytes(t->data)));
+
+       if (!b->last_set_unwritten)
+               return 0;
+
+       return ((PAGE_SIZE << b->page_order) -
+               (bset_byte_offset(b, t->data) + set_bytes(t->data))) /
+               sizeof(u64);
+}
+
+static inline struct bset *bset_next_set(struct btree_keys *b,
+                                        unsigned block_bytes)
+{
+       struct bset *i = bset_tree_last(b)->data;
+
+       return ((void *) i) + roundup(set_bytes(i), block_bytes);
+}
+
+void bch_btree_keys_free(struct btree_keys *);
+int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t);
+void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *,
+                        bool *);
+
+void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t);
+void bch_bset_build_written_tree(struct btree_keys *);
+void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *);
+bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *);
+void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *);
+unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *,
+                             struct bkey *);
+
+enum {
+       BTREE_INSERT_STATUS_NO_INSERT = 0,
+       BTREE_INSERT_STATUS_INSERT,
+       BTREE_INSERT_STATUS_BACK_MERGE,
+       BTREE_INSERT_STATUS_OVERWROTE,
+       BTREE_INSERT_STATUS_FRONT_MERGE,
 };
 
+/* Btree key iteration */
+
+struct btree_iter {
+       size_t size, used;
+#ifdef CONFIG_BCACHE_DEBUG
+       struct btree_keys *b;
+#endif
+       struct btree_iter_set {
+               struct bkey *k, *end;
+       } data[MAX_BSETS];
+};
+
+typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *);
+
+struct bkey *bch_btree_iter_next(struct btree_iter *);
+struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
+                                       struct btree_keys *, ptr_filter_fn);
+
+void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
+struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *,
+                                struct bkey *);
+
+struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *,
+                              const struct bkey *);
+
+/*
+ * Returns the first key that is strictly greater than search
+ */
+static inline struct bkey *bch_bset_search(struct btree_keys *b,
+                                          struct bset_tree *t,
+                                          const struct bkey *search)
+{
+       return search ? __bch_bset_search(b, t, search) : t->data->start;
+}
+
+#define for_each_key_filter(b, k, iter, filter)                                \
+       for (bch_btree_iter_init((b), (iter), NULL);                    \
+            ((k) = bch_btree_iter_next_filter((iter), (b), filter));)
+
+#define for_each_key(b, k, iter)                                       \
+       for (bch_btree_iter_init((b), (iter), NULL);                    \
+            ((k) = bch_btree_iter_next(iter));)
+
+/* Sorting */
+
+struct bset_sort_state {
+       mempool_t               *pool;
+
+       unsigned                page_order;
+       unsigned                crit_factor;
+
+       struct time_stats       time;
+};
+
+void bch_bset_sort_state_free(struct bset_sort_state *);
+int bch_bset_sort_state_init(struct bset_sort_state *, unsigned);
+void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *);
+void bch_btree_sort_into(struct btree_keys *, struct btree_keys *,
+                        struct bset_sort_state *);
+void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *,
+                                   struct bset_sort_state *);
+void bch_btree_sort_partial(struct btree_keys *, unsigned,
+                           struct bset_sort_state *);
+
+static inline void bch_btree_sort(struct btree_keys *b,
+                                 struct bset_sort_state *state)
+{
+       bch_btree_sort_partial(b, 0, state);
+}
+
+struct bset_stats {
+       size_t sets_written, sets_unwritten;
+       size_t bytes_written, bytes_unwritten;
+       size_t floats, failed;
+};
+
+void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *);
+
+/* Bkey utility code */
+
+#define bset_bkey_last(i)      bkey_idx((struct bkey *) (i)->d, (i)->keys)
+
+static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx)
+{
+       return bkey_idx(i->start, idx);
+}
+
+static inline void bkey_init(struct bkey *k)
+{
+       *k = ZERO_KEY;
+}
+
 static __always_inline int64_t bkey_cmp(const struct bkey *l,
                                        const struct bkey *r)
 {
@@ -196,6 +400,62 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l,
                : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
 }
 
+void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
+                             unsigned);
+bool __bch_cut_front(const struct bkey *, struct bkey *);
+bool __bch_cut_back(const struct bkey *, struct bkey *);
+
+static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
+{
+       BUG_ON(bkey_cmp(where, k) > 0);
+       return __bch_cut_front(where, k);
+}
+
+static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
+{
+       BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
+       return __bch_cut_back(where, k);
+}
+
+#define PRECEDING_KEY(_k)                                      \
+({                                                             \
+       struct bkey *_ret = NULL;                               \
+                                                               \
+       if (KEY_INODE(_k) || KEY_OFFSET(_k)) {                  \
+               _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0);  \
+                                                               \
+               if (!_ret->low)                                 \
+                       _ret->high--;                           \
+               _ret->low--;                                    \
+       }                                                       \
+                                                               \
+       _ret;                                                   \
+})
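PRECEDING_KEY() builds the key immediately before _k by treating the key's high and low words as one 128-bit value and decrementing it, borrowing from the high word when the low word underflows. The arithmetic on its own, as a tiny standalone sketch:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* decrement a 128-bit value stored as two 64-bit words */
static void dec128(uint64_t *high, uint64_t *low)
{
	if (!*low)		/* borrow on underflow of the low word */
		(*high)--;
	(*low)--;
}

int main(void)
{
	uint64_t high = 1, low = 0;

	dec128(&high, &low);
	/* prints 0 18446744073709551615 */
	printf("%" PRIu64 " %" PRIu64 "\n", high, low);
	return 0;
}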
+
+static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
+{
+       return b->ops->key_invalid(b, k);
+}
+
+static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k)
+{
+       return b->ops->key_bad(b, k);
+}
+
+static inline void bch_bkey_to_text(struct btree_keys *b, char *buf,
+                                   size_t size, const struct bkey *k)
+{
+       return b->ops->key_to_text(buf, size, k);
+}
+
+static inline bool bch_bkey_equal_header(const struct bkey *l,
+                                        const struct bkey *r)
+{
+       return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
+               KEY_PTRS(l) == KEY_PTRS(r) &&
+               KEY_CSUM(l) == KEY_CSUM(r));
+}
+
 /* Keylists */
 
 struct keylist {
@@ -257,136 +517,44 @@ static inline size_t bch_keylist_bytes(struct keylist *l)
 
 struct bkey *bch_keylist_pop(struct keylist *);
 void bch_keylist_pop_front(struct keylist *);
-int bch_keylist_realloc(struct keylist *, int, struct cache_set *);
-
-void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
-                             unsigned);
-bool __bch_cut_front(const struct bkey *, struct bkey *);
-bool __bch_cut_back(const struct bkey *, struct bkey *);
+int __bch_keylist_realloc(struct keylist *, unsigned);
 
-static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
-{
-       BUG_ON(bkey_cmp(where, k) > 0);
-       return __bch_cut_front(where, k);
-}
+/* Debug stuff */
 
-static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
-{
-       BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
-       return __bch_cut_back(where, k);
-}
-
-const char *bch_ptr_status(struct cache_set *, const struct bkey *);
-bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
-bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *);
-
-bool bch_ptr_bad(struct btree *, const struct bkey *);
-
-static inline uint8_t gen_after(uint8_t a, uint8_t b)
-{
-       uint8_t r = a - b;
-       return r > 128U ? 0 : r;
-}
-
-static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
-                               unsigned i)
-{
-       return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
-}
-
-static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
-                                unsigned i)
-{
-       return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
-}
-
-
-typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
-
-struct bkey *bch_btree_iter_next(struct btree_iter *);
-struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
-                                       struct btree *, ptr_filter_fn);
-
-void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
-struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *,
-                                  struct bkey *, struct bset_tree *);
-
-/* 32 bits total: */
-#define BKEY_MID_BITS          3
-#define BKEY_EXPONENT_BITS     7
-#define BKEY_MANTISSA_BITS     22
-#define BKEY_MANTISSA_MASK     ((1 << BKEY_MANTISSA_BITS) - 1)
-
-struct bkey_float {
-       unsigned        exponent:BKEY_EXPONENT_BITS;
-       unsigned        m:BKEY_MID_BITS;
-       unsigned        mantissa:BKEY_MANTISSA_BITS;
-} __packed;
-
-/*
- * BSET_CACHELINE was originally intended to match the hardware cacheline size -
- * it used to be 64, but I realized the lookup code would touch slightly less
- * memory if it was 128.
- *
- * It definites the number of bytes (in struct bset) per struct bkey_float in
- * the auxiliar search tree - when we're done searching the bset_float tree we
- * have this many bytes left that we do a linear search over.
- *
- * Since (after level 5) every level of the bset_tree is on a new cacheline,
- * we're touching one fewer cacheline in the bset tree in exchange for one more
- * cacheline in the linear search - but the linear search might stop before it
- * gets to the second cacheline.
- */
-
-#define BSET_CACHELINE         128
-#define bset_tree_space(b)     (btree_data_space(b) / BSET_CACHELINE)
+#ifdef CONFIG_BCACHE_DEBUG
 
-#define bset_tree_bytes(b)     (bset_tree_space(b) * sizeof(struct bkey_float))
-#define bset_prev_bytes(b)     (bset_tree_space(b) * sizeof(uint8_t))
+int __bch_count_data(struct btree_keys *);
+void __bch_check_keys(struct btree_keys *, const char *, ...);
+void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
+void bch_dump_bucket(struct btree_keys *);
 
-void bch_bset_init_next(struct btree *);
+#else
 
-void bch_bset_fix_invalidated_key(struct btree *, struct bkey *);
-void bch_bset_fix_lookup_table(struct btree *, struct bkey *);
+static inline int __bch_count_data(struct btree_keys *b) { return -1; }
+static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
+static inline void bch_dump_bucket(struct btree_keys *b) {}
+void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
 
-struct bkey *__bch_bset_search(struct btree *, struct bset_tree *,
-                          const struct bkey *);
+#endif
 
-/*
- * Returns the first key that is strictly greater than search
- */
-static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
-                                          const struct bkey *search)
+static inline bool btree_keys_expensive_checks(struct btree_keys *b)
 {
-       return search ? __bch_bset_search(b, t, search) : t->data->start;
+#ifdef CONFIG_BCACHE_DEBUG
+       return *b->expensive_debug_checks;
+#else
+       return false;
+#endif
 }
 
-#define PRECEDING_KEY(_k)                                      \
-({                                                             \
-       struct bkey *_ret = NULL;                               \
-                                                               \
-       if (KEY_INODE(_k) || KEY_OFFSET(_k)) {                  \
-               _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0);  \
-                                                               \
-               if (!_ret->low)                                 \
-                       _ret->high--;                           \
-               _ret->low--;                                    \
-       }                                                       \
-                                                               \
-       _ret;                                                   \
-})
-
-bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
-void bch_btree_sort_lazy(struct btree *);
-void bch_btree_sort_into(struct btree *, struct btree *);
-void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *);
-void bch_btree_sort_partial(struct btree *, unsigned);
-
-static inline void bch_btree_sort(struct btree *b)
+static inline int bch_count_data(struct btree_keys *b)
 {
-       bch_btree_sort_partial(b, 0);
+       return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1;
 }
 
-int bch_bset_print_stats(struct cache_set *, char *);
+#define bch_check_keys(b, ...)                                         \
+do {                                                                   \
+       if (btree_keys_expensive_checks(b))                             \
+               __bch_check_keys(b, __VA_ARGS__);                       \
+} while (0)
 
 #endif
index 946ecd3b048b0ae1c9bd47ab4c42572f7919b838..98cc0a810a366a466253d250baba5c2564e9fab7 100644 (file)
@@ -23,7 +23,7 @@
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
-#include "writeback.h"
+#include "extents.h"
 
 #include <linux/slab.h>
 #include <linux/bitops.h>
  * Test module load/unload
  */
 
-enum {
-       BTREE_INSERT_STATUS_INSERT,
-       BTREE_INSERT_STATUS_BACK_MERGE,
-       BTREE_INSERT_STATUS_OVERWROTE,
-       BTREE_INSERT_STATUS_FRONT_MERGE,
-};
-
 #define MAX_NEED_GC            64
 #define MAX_SAVE_PRIO          72
 
@@ -106,14 +99,6 @@ enum {
 
 static struct workqueue_struct *btree_io_wq;
 
-static inline bool should_split(struct btree *b)
-{
-       struct bset *i = write_block(b);
-       return b->written >= btree_blocks(b) ||
-               (b->written + __set_blocks(i, i->keys + 15, b->c)
-                > btree_blocks(b));
-}
-
 #define insert_lock(s, b)      ((b)->level <= (s)->lock)
 
 /*
@@ -167,6 +152,8 @@ static inline bool should_split(struct btree *b)
                        _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);   \
                }                                                       \
                rw_unlock(_w, _b);                                      \
+               if (_r == -EINTR)                                       \
+                       schedule();                                     \
                bch_cannibalize_unlock(c);                              \
                if (_r == -ENOSPC) {                                    \
                        wait_event((c)->try_wait,                       \
@@ -175,9 +162,15 @@ static inline bool should_split(struct btree *b)
                }                                                       \
        } while (_r == -EINTR);                                         \
                                                                        \
+       finish_wait(&(c)->bucket_wait, &(op)->wait);                    \
        _r;                                                             \
 })
 
+static inline struct bset *write_block(struct btree *b)
+{
+       return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
+}
+
 /* Btree key manipulation */
 
 void bkey_put(struct cache_set *c, struct bkey *k)
@@ -194,16 +187,16 @@ void bkey_put(struct cache_set *c, struct bkey *k)
 static uint64_t btree_csum_set(struct btree *b, struct bset *i)
 {
        uint64_t crc = b->key.ptr[0];
-       void *data = (void *) i + 8, *end = end(i);
+       void *data = (void *) i + 8, *end = bset_bkey_last(i);
 
        crc = bch_crc64_update(crc, data, end - data);
        return crc ^ 0xffffffffffffffffULL;
 }
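btree_csum_set() checksums the bset from byte 8 (the first 8 bytes hold the stored checksum itself) through the last key, seeding the CRC with the node's bucket pointer. The skip-your-own-field shape, sketched with a trivial additive checksum standing in for bch_crc64_update() (illustrative only):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct toy_node {
	uint64_t csum;		/* stored checksum; excluded from the sum */
	uint64_t payload[4];
};

static uint64_t toy_checksum(const void *data, size_t len, uint64_t seed)
{
	const uint8_t *p = data;
	uint64_t sum = seed;

	while (len--)
		sum = sum * 31 + *p++;
	return sum;
}

int main(void)
{
	struct toy_node n = { .payload = { 1, 2, 3, 4 } };
	const void *data = (const char *)&n + 8;	/* skip the csum field */

	n.csum = toy_checksum(data, sizeof(n) - 8, 0x1234);
	printf("%llu\n", (unsigned long long)n.csum);
	return 0;
}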
 
-static void bch_btree_node_read_done(struct btree *b)
+void bch_btree_node_read_done(struct btree *b)
 {
        const char *err = "bad btree header";
-       struct bset *i = b->sets[0].data;
+       struct bset *i = btree_bset_first(b);
        struct btree_iter *iter;
 
        iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
@@ -211,21 +204,22 @@ static void bch_btree_node_read_done(struct btree *b)
        iter->used = 0;
 
 #ifdef CONFIG_BCACHE_DEBUG
-       iter->b = b;
+       iter->b = &b->keys;
 #endif
 
        if (!i->seq)
                goto err;
 
        for (;
-            b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
+            b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
             i = write_block(b)) {
                err = "unsupported bset version";
                if (i->version > BCACHE_BSET_VERSION)
                        goto err;
 
                err = "bad btree header";
-               if (b->written + set_blocks(i, b->c) > btree_blocks(b))
+               if (b->written + set_blocks(i, block_bytes(b->c)) >
+                   btree_blocks(b))
                        goto err;
 
                err = "bad magic";
@@ -245,39 +239,40 @@ static void bch_btree_node_read_done(struct btree *b)
                }
 
                err = "empty set";
-               if (i != b->sets[0].data && !i->keys)
+               if (i != b->keys.set[0].data && !i->keys)
                        goto err;
 
-               bch_btree_iter_push(iter, i->start, end(i));
+               bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
 
-               b->written += set_blocks(i, b->c);
+               b->written += set_blocks(i, block_bytes(b->c));
        }
 
        err = "corrupted btree";
        for (i = write_block(b);
-            index(i, b) < btree_blocks(b);
+            bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
             i = ((void *) i) + block_bytes(b->c))
-               if (i->seq == b->sets[0].data->seq)
+               if (i->seq == b->keys.set[0].data->seq)
                        goto err;
 
-       bch_btree_sort_and_fix_extents(b, iter);
+       bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
 
-       i = b->sets[0].data;
+       i = b->keys.set[0].data;
        err = "short btree key";
-       if (b->sets[0].size &&
-           bkey_cmp(&b->key, &b->sets[0].end) < 0)
+       if (b->keys.set[0].size &&
+           bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
                goto err;
 
        if (b->written < btree_blocks(b))
-               bch_bset_init_next(b);
+               bch_bset_init_next(&b->keys, write_block(b),
+                                  bset_magic(&b->c->sb));
 out:
        mempool_free(iter, b->c->fill_iter);
        return;
 err:
        set_btree_node_io_error(b);
-       bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
+       bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
                            err, PTR_BUCKET_NR(b->c, &b->key, 0),
-                           index(i, b), i->keys);
+                           bset_block_offset(b, i), i->keys);
        goto out;
 }
 
@@ -287,7 +282,7 @@ static void btree_node_read_endio(struct bio *bio, int error)
        closure_put(cl);
 }
 
-void bch_btree_node_read(struct btree *b)
+static void bch_btree_node_read(struct btree *b)
 {
        uint64_t start_time = local_clock();
        struct closure cl;
@@ -303,7 +298,7 @@ void bch_btree_node_read(struct btree *b)
        bio->bi_end_io  = btree_node_read_endio;
        bio->bi_private = &cl;
 
-       bch_bio_map(bio, b->sets[0].data);
+       bch_bio_map(bio, b->keys.set[0].data);
 
        bch_submit_bbio(bio, b->c, &b->key, 0);
        closure_sync(&cl);
@@ -340,9 +335,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)
        w->journal      = NULL;
 }
 
+static void btree_node_write_unlock(struct closure *cl)
+{
+       struct btree *b = container_of(cl, struct btree, io);
+
+       up(&b->io_mutex);
+}
+
 static void __btree_node_write_done(struct closure *cl)
 {
-       struct btree *b = container_of(cl, struct btree, io.cl);
+       struct btree *b = container_of(cl, struct btree, io);
        struct btree_write *w = btree_prev_write(b);
 
        bch_bbio_free(b->bio, b->c);
@@ -353,12 +355,12 @@ static void __btree_node_write_done(struct closure *cl)
                queue_delayed_work(btree_io_wq, &b->work,
                                   msecs_to_jiffies(30000));
 
-       closure_return(cl);
+       closure_return_with_destructor(cl, btree_node_write_unlock);
 }
 
 static void btree_node_write_done(struct closure *cl)
 {
-       struct btree *b = container_of(cl, struct btree, io.cl);
+       struct btree *b = container_of(cl, struct btree, io);
        struct bio_vec *bv;
        int n;
 
@@ -371,7 +373,7 @@ static void btree_node_write_done(struct closure *cl)
 static void btree_node_write_endio(struct bio *bio, int error)
 {
        struct closure *cl = bio->bi_private;
-       struct btree *b = container_of(cl, struct btree, io.cl);
+       struct btree *b = container_of(cl, struct btree, io);
 
        if (error)
                set_btree_node_io_error(b);
@@ -382,8 +384,8 @@ static void btree_node_write_endio(struct bio *bio, int error)
 
 static void do_btree_node_write(struct btree *b)
 {
-       struct closure *cl = &b->io.cl;
-       struct bset *i = b->sets[b->nsets].data;
+       struct closure *cl = &b->io;
+       struct bset *i = btree_bset_last(b);
        BKEY_PADDED(key) k;
 
        i->version      = BCACHE_BSET_VERSION;
@@ -395,7 +397,7 @@ static void do_btree_node_write(struct btree *b)
        b->bio->bi_end_io       = btree_node_write_endio;
        b->bio->bi_private      = cl;
        b->bio->bi_rw           = REQ_META|WRITE_SYNC|REQ_FUA;
-       b->bio->bi_iter.bi_size = set_blocks(i, b->c) * block_bytes(b->c);
+       b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
        bch_bio_map(b->bio, i);
 
        /*
@@ -414,7 +416,8 @@ static void do_btree_node_write(struct btree *b)
         */
 
        bkey_copy(&k.key, &b->key);
-       SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
+       SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
+                      bset_sector_offset(&b->keys, i));
 
        if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
                int j;
@@ -435,40 +438,54 @@ static void do_btree_node_write(struct btree *b)
                bch_submit_bbio(b->bio, b->c, &k.key, 0);
 
                closure_sync(cl);
-               __btree_node_write_done(cl);
+               continue_at_nobarrier(cl, __btree_node_write_done, NULL);
        }
 }
 
 void bch_btree_node_write(struct btree *b, struct closure *parent)
 {
-       struct bset *i = b->sets[b->nsets].data;
+       struct bset *i = btree_bset_last(b);
 
        trace_bcache_btree_write(b);
 
        BUG_ON(current->bio_list);
        BUG_ON(b->written >= btree_blocks(b));
        BUG_ON(b->written && !i->keys);
-       BUG_ON(b->sets->data->seq != i->seq);
-       bch_check_keys(b, "writing");
+       BUG_ON(btree_bset_first(b)->seq != i->seq);
+       bch_check_keys(&b->keys, "writing");
 
        cancel_delayed_work(&b->work);
 
        /* If caller isn't waiting for write, parent refcount is cache set */
-       closure_lock(&b->io, parent ?: &b->c->cl);
+       down(&b->io_mutex);
+       closure_init(&b->io, parent ?: &b->c->cl);
 
        clear_bit(BTREE_NODE_dirty,      &b->flags);
        change_bit(BTREE_NODE_write_idx, &b->flags);
 
        do_btree_node_write(b);
 
-       b->written += set_blocks(i, b->c);
-       atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
+       atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
                        &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
 
-       bch_btree_sort_lazy(b);
+       b->written += set_blocks(i, block_bytes(b->c));
+
+       /* If not a leaf node, always sort */
+       if (b->level && b->keys.nsets)
+               bch_btree_sort(&b->keys, &b->c->sort);
+       else
+               bch_btree_sort_lazy(&b->keys, &b->c->sort);
+
+       /*
+        * do verify if there was more than one set initially (i.e. we did a
+        * sort) and we sorted down to a single set:
+        */
+       if (i != b->keys.set->data && !b->keys.nsets)
+               bch_btree_verify(b);
 
        if (b->written < btree_blocks(b))
-               bch_bset_init_next(b);
+               bch_bset_init_next(&b->keys, write_block(b),
+                                  bset_magic(&b->c->sb));
 }
 
 static void bch_btree_node_write_sync(struct btree *b)
@@ -493,7 +510,7 @@ static void btree_node_write_work(struct work_struct *w)
 
 static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
 {
-       struct bset *i = b->sets[b->nsets].data;
+       struct bset *i = btree_bset_last(b);
        struct btree_write *w = btree_current_write(b);
 
        BUG_ON(!b->written);
@@ -528,24 +545,6 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
  * mca -> memory cache
  */
 
-static void mca_reinit(struct btree *b)
-{
-       unsigned i;
-
-       b->flags        = 0;
-       b->written      = 0;
-       b->nsets        = 0;
-
-       for (i = 0; i < MAX_BSETS; i++)
-               b->sets[i].size = 0;
-       /*
-        * Second loop starts at 1 because b->sets[0]->data is the memory we
-        * allocated
-        */
-       for (i = 1; i < MAX_BSETS; i++)
-               b->sets[i].data = NULL;
-}
-
 #define mca_reserve(c) (((c->root && c->root->level)           \
                          ? c->root->level : 1) * 8 + 16)
 #define mca_can_free(c)                                                \
@@ -553,28 +552,12 @@ static void mca_reinit(struct btree *b)
 
 static void mca_data_free(struct btree *b)
 {
-       struct bset_tree *t = b->sets;
-       BUG_ON(!closure_is_unlocked(&b->io.cl));
+       BUG_ON(b->io_mutex.count != 1);
 
-       if (bset_prev_bytes(b) < PAGE_SIZE)
-               kfree(t->prev);
-       else
-               free_pages((unsigned long) t->prev,
-                          get_order(bset_prev_bytes(b)));
+       bch_btree_keys_free(&b->keys);
 
-       if (bset_tree_bytes(b) < PAGE_SIZE)
-               kfree(t->tree);
-       else
-               free_pages((unsigned long) t->tree,
-                          get_order(bset_tree_bytes(b)));
-
-       free_pages((unsigned long) t->data, b->page_order);
-
-       t->prev = NULL;
-       t->tree = NULL;
-       t->data = NULL;
-       list_move(&b->list, &b->c->btree_cache_freed);
        b->c->bucket_cache_used--;
+       list_move(&b->list, &b->c->btree_cache_freed);
 }
 
 static void mca_bucket_free(struct btree *b)
@@ -593,34 +576,16 @@ static unsigned btree_order(struct bkey *k)
 
 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
 {
-       struct bset_tree *t = b->sets;
-       BUG_ON(t->data);
-
-       b->page_order = max_t(unsigned,
-                             ilog2(b->c->btree_pages),
-                             btree_order(k));
-
-       t->data = (void *) __get_free_pages(gfp, b->page_order);
-       if (!t->data)
-               goto err;
-
-       t->tree = bset_tree_bytes(b) < PAGE_SIZE
-               ? kmalloc(bset_tree_bytes(b), gfp)
-               : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
-       if (!t->tree)
-               goto err;
-
-       t->prev = bset_prev_bytes(b) < PAGE_SIZE
-               ? kmalloc(bset_prev_bytes(b), gfp)
-               : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
-       if (!t->prev)
-               goto err;
-
-       list_move(&b->list, &b->c->btree_cache);
-       b->c->bucket_cache_used++;
-       return;
-err:
-       mca_data_free(b);
+       if (!bch_btree_keys_alloc(&b->keys,
+                                 max_t(unsigned,
+                                       ilog2(b->c->btree_pages),
+                                       btree_order(k)),
+                                 gfp)) {
+               b->c->bucket_cache_used++;
+               list_move(&b->list, &b->c->btree_cache);
+       } else {
+               list_move(&b->list, &b->c->btree_cache_freed);
+       }
 }
 
 static struct btree *mca_bucket_alloc(struct cache_set *c,
@@ -635,7 +600,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
        INIT_LIST_HEAD(&b->list);
        INIT_DELAYED_WORK(&b->work, btree_node_write_work);
        b->c = c;
-       closure_init_unlocked(&b->io);
+       sema_init(&b->io_mutex, 1);
 
        mca_data_alloc(b, k, gfp);
        return b;
@@ -651,24 +616,31 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
        if (!down_write_trylock(&b->lock))
                return -ENOMEM;
 
-       BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
+       BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
 
-       if (b->page_order < min_order ||
-           (!flush &&
-            (btree_node_dirty(b) ||
-             atomic_read(&b->io.cl.remaining) != -1))) {
-               rw_unlock(true, b);
-               return -ENOMEM;
+       if (b->keys.page_order < min_order)
+               goto out_unlock;
+
+       if (!flush) {
+               if (btree_node_dirty(b))
+                       goto out_unlock;
+
+               if (down_trylock(&b->io_mutex))
+                       goto out_unlock;
+               up(&b->io_mutex);
        }
 
        if (btree_node_dirty(b))
                bch_btree_node_write_sync(b);
 
        /* wait for any in flight btree write */
-       closure_wait_event(&b->io.wait, &cl,
-                          atomic_read(&b->io.cl.remaining) == -1);
+       down(&b->io_mutex);
+       up(&b->io_mutex);
 
        return 0;
+out_unlock:
+       rw_unlock(true, b);
+       return -ENOMEM;
 }
 
 static unsigned long bch_mca_scan(struct shrinker *shrink,
@@ -714,14 +686,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
                }
        }
 
-       /*
-        * Can happen right when we first start up, before we've read in any
-        * btree nodes
-        */
-       if (list_empty(&c->btree_cache))
-               goto out;
-
        for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
+               if (list_empty(&c->btree_cache))
+                       goto out;
+
                b = list_first_entry(&c->btree_cache, struct btree, list);
                list_rotate_left(&c->btree_cache);
 
@@ -767,6 +735,8 @@ void bch_btree_cache_free(struct cache_set *c)
 #ifdef CONFIG_BCACHE_DEBUG
        if (c->verify_data)
                list_move(&c->verify_data->list, &c->btree_cache);
+
+       free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
 #endif
 
        list_splice(&c->btree_cache_freeable,
@@ -807,10 +777,13 @@ int bch_btree_cache_alloc(struct cache_set *c)
 #ifdef CONFIG_BCACHE_DEBUG
        mutex_init(&c->verify_lock);
 
+       c->verify_ondisk = (void *)
+               __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
+
        c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
 
        if (c->verify_data &&
-           c->verify_data->sets[0].data)
+           c->verify_data->keys.set->data)
                list_del_init(&c->verify_data->list);
        else
                c->verify_data = NULL;
@@ -908,7 +881,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
        list_for_each_entry(b, &c->btree_cache_freed, list)
                if (!mca_reap(b, 0, false)) {
                        mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
-                       if (!b->sets[0].data)
+                       if (!b->keys.set[0].data)
                                goto err;
                        else
                                goto out;
@@ -919,10 +892,10 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
                goto err;
 
        BUG_ON(!down_write_trylock(&b->lock));
-       if (!b->sets->data)
+       if (!b->keys.set->data)
                goto err;
 out:
-       BUG_ON(!closure_is_unlocked(&b->io.cl));
+       BUG_ON(b->io_mutex.count != 1);
 
        bkey_copy(&b->key, k);
        list_move(&b->list, &c->btree_cache);
@@ -930,10 +903,17 @@ out:
        hlist_add_head_rcu(&b->hash, mca_hash(c, k));
 
        lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
-       b->level        = level;
        b->parent       = (void *) ~0UL;
+       b->flags        = 0;
+       b->written      = 0;
+       b->level        = level;
 
-       mca_reinit(b);
+       if (!b->level)
+               bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
+                                   &b->c->expensive_debug_checks);
+       else
+               bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
+                                   &b->c->expensive_debug_checks);
 
        return b;
 err:
@@ -994,13 +974,13 @@ retry:
 
        b->accessed = 1;
 
-       for (; i <= b->nsets && b->sets[i].size; i++) {
-               prefetch(b->sets[i].tree);
-               prefetch(b->sets[i].data);
+       for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
+               prefetch(b->keys.set[i].tree);
+               prefetch(b->keys.set[i].data);
        }
 
-       for (; i <= b->nsets; i++)
-               prefetch(b->sets[i].data);
+       for (; i <= b->keys.nsets; i++)
+               prefetch(b->keys.set[i].data);
 
        if (btree_node_io_error(b)) {
                rw_unlock(write, b);
@@ -1063,7 +1043,7 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
 
        mutex_lock(&c->bucket_lock);
 retry:
-       if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait))
+       if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
                goto err;
 
        bkey_put(c, &k.key);
@@ -1080,7 +1060,7 @@ retry:
        }
 
        b->accessed = 1;
-       bch_bset_init_next(b);
+       bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
 
        mutex_unlock(&c->bucket_lock);
 
@@ -1098,8 +1078,10 @@ err:
 static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
 {
        struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
-       if (!IS_ERR_OR_NULL(n))
-               bch_btree_sort_into(b, n);
+       if (!IS_ERR_OR_NULL(n)) {
+               bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
+               bkey_copy_key(&n->key, &b->key);
+       }
 
        return n;
 }
@@ -1120,6 +1102,28 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
        atomic_inc(&b->c->prio_blocked);
 }
 
+static int btree_check_reserve(struct btree *b, struct btree_op *op)
+{
+       struct cache_set *c = b->c;
+       struct cache *ca;
+       unsigned i, reserve = c->root->level * 2 + 1;
+       int ret = 0;
+
+       mutex_lock(&c->bucket_lock);
+
+       for_each_cache(ca, c, i)
+               if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
+                       if (op)
+                               prepare_to_wait(&c->bucket_wait, &op->wait,
+                                               TASK_UNINTERRUPTIBLE);
+                       ret = -EINTR;
+                       break;
+               }
+
+       mutex_unlock(&c->bucket_lock);
+       return ret;
+}
+
 /* Garbage collection */
 
 uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
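
btree_check_reserve() only arms the wait: with a non-NULL op it calls prepare_to_wait() on c->bucket_wait and returns -EINTR without sleeping. A hedged sketch of the caller-side idiom it is designed to pair with follows; the helper name and the retry loop are illustrative only and are not part of this patch.

/*
 * Illustrative only -- assumes the standard
 * prepare_to_wait()/schedule()/finish_wait() idiom; the real retry
 * path lives in callers outside these hunks.
 */
#include <linux/sched.h>
#include <linux/wait.h>

static int wait_for_btree_reserve(struct btree *b, struct btree_op *op)
{
	struct cache_set *c = b->c;
	int ret;

	while ((ret = btree_check_reserve(b, op)) == -EINTR) {
		/* btree_check_reserve() queued op->wait on c->bucket_wait */
		schedule();
		finish_wait(&c->bucket_wait, &op->wait);
	}

	return ret;
}
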
@@ -1183,11 +1187,11 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
 
        gc->nodes++;
 
-       for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
+       for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
                stale = max(stale, btree_mark_key(b, k));
                keys++;
 
-               if (bch_ptr_bad(b, k))
+               if (bch_ptr_bad(&b->keys, k))
                        continue;
 
                gc->key_bytes += bkey_u64s(k);
@@ -1197,9 +1201,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
                gc->data += KEY_SIZE(k);
        }
 
-       for (t = b->sets; t <= &b->sets[b->nsets]; t++)
+       for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
                btree_bug_on(t->size &&
-                            bset_written(b, t) &&
+                            bset_written(&b->keys, t) &&
                             bkey_cmp(&b->key, &t->end) < 0,
                             b, "found short btree key in gc");
 
@@ -1243,7 +1247,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
        blocks = btree_default_blocks(b->c) * 2 / 3;
 
        if (nodes < 2 ||
-           __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
+           __set_blocks(b->keys.set[0].data, keys,
+                        block_bytes(b->c)) > blocks * (nodes - 1))
                return 0;
 
        for (i = 0; i < nodes; i++) {
@@ -1253,18 +1258,19 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
        }
 
        for (i = nodes - 1; i > 0; --i) {
-               struct bset *n1 = new_nodes[i]->sets->data;
-               struct bset *n2 = new_nodes[i - 1]->sets->data;
+               struct bset *n1 = btree_bset_first(new_nodes[i]);
+               struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
                struct bkey *k, *last = NULL;
 
                keys = 0;
 
                if (i > 1) {
                        for (k = n2->start;
-                            k < end(n2);
+                            k < bset_bkey_last(n2);
                             k = bkey_next(k)) {
                                if (__set_blocks(n1, n1->keys + keys +
-                                                bkey_u64s(k), b->c) > blocks)
+                                                bkey_u64s(k),
+                                                block_bytes(b->c)) > blocks)
                                        break;
 
                                last = k;
@@ -1280,7 +1286,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
                         * though)
                         */
                        if (__set_blocks(n1, n1->keys + n2->keys,
-                                        b->c) > btree_blocks(new_nodes[i]))
+                                        block_bytes(b->c)) >
+                           btree_blocks(new_nodes[i]))
                                goto out_nocoalesce;
 
                        keys = n2->keys;
@@ -1288,27 +1295,28 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
                        last = &r->b->key;
                }
 
-               BUG_ON(__set_blocks(n1, n1->keys + keys,
-                                   b->c) > btree_blocks(new_nodes[i]));
+               BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
+                      btree_blocks(new_nodes[i]));
 
                if (last)
                        bkey_copy_key(&new_nodes[i]->key, last);
 
-               memcpy(end(n1),
+               memcpy(bset_bkey_last(n1),
                       n2->start,
-                      (void *) node(n2, keys) - (void *) n2->start);
+                      (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
 
                n1->keys += keys;
                r[i].keys = n1->keys;
 
                memmove(n2->start,
-                       node(n2, keys),
-                       (void *) end(n2) - (void *) node(n2, keys));
+                       bset_bkey_idx(n2, keys),
+                       (void *) bset_bkey_last(n2) -
+                       (void *) bset_bkey_idx(n2, keys));
 
                n2->keys -= keys;
 
-               if (bch_keylist_realloc(keylist,
-                                       KEY_PTRS(&new_nodes[i]->key), b->c))
+               if (__bch_keylist_realloc(keylist,
+                                         bkey_u64s(&new_nodes[i]->key)))
                        goto out_nocoalesce;
 
                bch_btree_node_write(new_nodes[i], &cl);
@@ -1316,7 +1324,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
        }
 
        for (i = 0; i < nodes; i++) {
-               if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c))
+               if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key)))
                        goto out_nocoalesce;
 
                make_btree_freeing_key(r[i].b, keylist->top);
@@ -1324,7 +1332,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
        }
 
        /* We emptied out this node */
-       BUG_ON(new_nodes[0]->sets->data->keys);
+       BUG_ON(btree_bset_first(new_nodes[0])->keys);
        btree_node_free(new_nodes[0]);
        rw_unlock(true, new_nodes[0]);
 
@@ -1370,7 +1378,7 @@ static unsigned btree_gc_count_keys(struct btree *b)
        struct btree_iter iter;
        unsigned ret = 0;
 
-       for_each_key_filter(b, k, &iter, bch_ptr_bad)
+       for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
                ret += bkey_u64s(k);
 
        return ret;
@@ -1390,13 +1398,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
        struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
 
        bch_keylist_init(&keys);
-       bch_btree_iter_init(b, &iter, &b->c->gc_done);
+       bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
 
        for (i = 0; i < GC_MERGE_NODES; i++)
                r[i].b = ERR_PTR(-EINTR);
 
        while (1) {
-               k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
+               k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
                if (k) {
                        r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
                        if (IS_ERR(r->b)) {
@@ -1416,7 +1424,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 
                if (!IS_ERR(last->b)) {
                        should_rewrite = btree_gc_mark_node(last->b, gc);
-                       if (should_rewrite) {
+                       if (should_rewrite &&
+                           !btree_check_reserve(b, NULL)) {
                                n = btree_node_alloc_replacement(last->b,
                                                                 false);
 
@@ -1705,7 +1714,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
        struct bucket *g;
        struct btree_iter iter;
 
-       for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
+       for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
                for (i = 0; i < KEY_PTRS(k); i++) {
                        if (!ptr_available(b->c, k, i))
                                continue;
@@ -1728,10 +1737,11 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
        }
 
        if (b->level) {
-               bch_btree_iter_init(b, &iter, NULL);
+               bch_btree_iter_init(&b->keys, &iter, NULL);
 
                do {
-                       k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
+                       k = bch_btree_iter_next_filter(&iter, &b->keys,
+                                                      bch_ptr_bad);
                        if (k)
                                btree_node_prefetch(b->c, k, b->level - 1);
 
@@ -1774,235 +1784,36 @@ err:
 
 /* Btree insertion */
 
-static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
-{
-       struct bset *i = b->sets[b->nsets].data;
-
-       memmove((uint64_t *) where + bkey_u64s(insert),
-               where,
-               (void *) end(i) - (void *) where);
-
-       i->keys += bkey_u64s(insert);
-       bkey_copy(where, insert);
-       bch_bset_fix_lookup_table(b, where);
-}
-
-static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
-                                   struct btree_iter *iter,
-                                   struct bkey *replace_key)
+static bool btree_insert_key(struct btree *b, struct bkey *k,
+                            struct bkey *replace_key)
 {
-       void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
-       {
-               if (KEY_DIRTY(k))
-                       bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
-                                                    offset, -sectors);
-       }
-
-       uint64_t old_offset;
-       unsigned old_size, sectors_found = 0;
-
-       while (1) {
-               struct bkey *k = bch_btree_iter_next(iter);
-               if (!k ||
-                   bkey_cmp(&START_KEY(k), insert) >= 0)
-                       break;
-
-               if (bkey_cmp(k, &START_KEY(insert)) <= 0)
-                       continue;
-
-               old_offset = KEY_START(k);
-               old_size = KEY_SIZE(k);
-
-               /*
-                * We might overlap with 0 size extents; we can't skip these
-                * because if they're in the set we're inserting to we have to
-                * adjust them so they don't overlap with the key we're
-                * inserting. But we don't want to check them for replace
-                * operations.
-                */
-
-               if (replace_key && KEY_SIZE(k)) {
-                       /*
-                        * k might have been split since we inserted/found the
-                        * key we're replacing
-                        */
-                       unsigned i;
-                       uint64_t offset = KEY_START(k) -
-                               KEY_START(replace_key);
-
-                       /* But it must be a subset of the replace key */
-                       if (KEY_START(k) < KEY_START(replace_key) ||
-                           KEY_OFFSET(k) > KEY_OFFSET(replace_key))
-                               goto check_failed;
-
-                       /* We didn't find a key that we were supposed to */
-                       if (KEY_START(k) > KEY_START(insert) + sectors_found)
-                               goto check_failed;
-
-                       if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
-                           KEY_DIRTY(k) != KEY_DIRTY(replace_key))
-                               goto check_failed;
-
-                       /* skip past gen */
-                       offset <<= 8;
-
-                       BUG_ON(!KEY_PTRS(replace_key));
+       unsigned status;
 
-                       for (i = 0; i < KEY_PTRS(replace_key); i++)
-                               if (k->ptr[i] != replace_key->ptr[i] + offset)
-                                       goto check_failed;
-
-                       sectors_found = KEY_OFFSET(k) - KEY_START(insert);
-               }
-
-               if (bkey_cmp(insert, k) < 0 &&
-                   bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
-                       /*
-                        * We overlapped in the middle of an existing key: that
-                        * means we have to split the old key. But we have to do
-                        * slightly different things depending on whether the
-                        * old key has been written out yet.
-                        */
-
-                       struct bkey *top;
-
-                       subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
-
-                       if (bkey_written(b, k)) {
-                               /*
-                                * We insert a new key to cover the top of the
-                                * old key, and the old key is modified in place
-                                * to represent the bottom split.
-                                *
-                                * It's completely arbitrary whether the new key
-                                * is the top or the bottom, but it has to match
-                                * up with what btree_sort_fixup() does - it
-                                * doesn't check for this kind of overlap, it
-                                * depends on us inserting a new key for the top
-                                * here.
-                                */
-                               top = bch_bset_search(b, &b->sets[b->nsets],
-                                                     insert);
-                               shift_keys(b, top, k);
-                       } else {
-                               BKEY_PADDED(key) temp;
-                               bkey_copy(&temp.key, k);
-                               shift_keys(b, k, &temp.key);
-                               top = bkey_next(k);
-                       }
-
-                       bch_cut_front(insert, top);
-                       bch_cut_back(&START_KEY(insert), k);
-                       bch_bset_fix_invalidated_key(b, k);
-                       return false;
-               }
-
-               if (bkey_cmp(insert, k) < 0) {
-                       bch_cut_front(insert, k);
-               } else {
-                       if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
-                               old_offset = KEY_START(insert);
-
-                       if (bkey_written(b, k) &&
-                           bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
-                               /*
-                                * Completely overwrote, so we don't have to
-                                * invalidate the binary search tree
-                                */
-                               bch_cut_front(k, k);
-                       } else {
-                               __bch_cut_back(&START_KEY(insert), k);
-                               bch_bset_fix_invalidated_key(b, k);
-                       }
-               }
-
-               subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
-       }
+       BUG_ON(bkey_cmp(k, &b->key) > 0);
 
-check_failed:
-       if (replace_key) {
-               if (!sectors_found) {
-                       return true;
-               } else if (sectors_found < KEY_SIZE(insert)) {
-                       SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
-                                      (KEY_SIZE(insert) - sectors_found));
-                       SET_KEY_SIZE(insert, sectors_found);
-               }
-       }
+       status = bch_btree_insert_key(&b->keys, k, replace_key);
+       if (status != BTREE_INSERT_STATUS_NO_INSERT) {
+               bch_check_keys(&b->keys, "%u for %s", status,
+                              replace_key ? "replace" : "insert");
 
-       return false;
+               trace_bcache_btree_insert_key(b, k, replace_key != NULL,
+                                             status);
+               return true;
+       } else
+               return false;
 }
 
-static bool btree_insert_key(struct btree *b, struct btree_op *op,
-                            struct bkey *k, struct bkey *replace_key)
+static size_t insert_u64s_remaining(struct btree *b)
 {
-       struct bset *i = b->sets[b->nsets].data;
-       struct bkey *m, *prev;
-       unsigned status = BTREE_INSERT_STATUS_INSERT;
-
-       BUG_ON(bkey_cmp(k, &b->key) > 0);
-       BUG_ON(b->level && !KEY_PTRS(k));
-       BUG_ON(!b->level && !KEY_OFFSET(k));
-
-       if (!b->level) {
-               struct btree_iter iter;
-
-               /*
-                * bset_search() returns the first key that is strictly greater
-                * than the search key - but for back merging, we want to find
-                * the previous key.
-                */
-               prev = NULL;
-               m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));
+       ssize_t ret = bch_btree_keys_u64s_remaining(&b->keys);
 
-               if (fix_overlapping_extents(b, k, &iter, replace_key)) {
-                       op->insert_collision = true;
-                       return false;
-               }
-
-               if (KEY_DIRTY(k))
-                       bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
-                                                    KEY_START(k), KEY_SIZE(k));
-
-               while (m != end(i) &&
-                      bkey_cmp(k, &START_KEY(m)) > 0)
-                       prev = m, m = bkey_next(m);
-
-               if (key_merging_disabled(b->c))
-                       goto insert;
-
-               /* prev is in the tree, if we merge we're done */
-               status = BTREE_INSERT_STATUS_BACK_MERGE;
-               if (prev &&
-                   bch_bkey_try_merge(b, prev, k))
-                       goto merged;
-
-               status = BTREE_INSERT_STATUS_OVERWROTE;
-               if (m != end(i) &&
-                   KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
-                       goto copy;
-
-               status = BTREE_INSERT_STATUS_FRONT_MERGE;
-               if (m != end(i) &&
-                   bch_bkey_try_merge(b, k, m))
-                       goto copy;
-       } else {
-               BUG_ON(replace_key);
-               m = bch_bset_search(b, &b->sets[b->nsets], k);
-       }
-
-insert:        shift_keys(b, m, k);
-copy:  bkey_copy(m, k);
-merged:
-       bch_check_keys(b, "%u for %s", status,
-                      replace_key ? "replace" : "insert");
-
-       if (b->level && !KEY_OFFSET(k))
-               btree_current_write(b)->prio_blocked++;
-
-       trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);
+       /*
+        * Might land in the middle of an existing extent and have to split it
+        */
+       if (b->keys.ops->is_extents)
+               ret -= KEY_MAX_U64S;
 
-       return true;
+       return max(ret, 0L);
 }
 
 static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
@@ -2010,21 +1821,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
                                  struct bkey *replace_key)
 {
        bool ret = false;
-       int oldsize = bch_count_data(b);
+       int oldsize = bch_count_data(&b->keys);
 
        while (!bch_keylist_empty(insert_keys)) {
-               struct bset *i = write_block(b);
                struct bkey *k = insert_keys->keys;
 
-               if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
-                   > btree_blocks(b))
+               if (bkey_u64s(k) > insert_u64s_remaining(b))
                        break;
 
                if (bkey_cmp(k, &b->key) <= 0) {
                        if (!b->level)
                                bkey_put(b->c, k);
 
-                       ret |= btree_insert_key(b, op, k, replace_key);
+                       ret |= btree_insert_key(b, k, replace_key);
                        bch_keylist_pop_front(insert_keys);
                } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
                        BKEY_PADDED(key) temp;
@@ -2033,16 +1842,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
                        bch_cut_back(&b->key, &temp.key);
                        bch_cut_front(&b->key, insert_keys->keys);
 
-                       ret |= btree_insert_key(b, op, &temp.key, replace_key);
+                       ret |= btree_insert_key(b, &temp.key, replace_key);
                        break;
                } else {
                        break;
                }
        }
 
+       if (!ret)
+               op->insert_collision = true;
+
        BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
 
-       BUG_ON(bch_count_data(b) < oldsize);
+       BUG_ON(bch_count_data(&b->keys) < oldsize);
        return ret;
 }
 
@@ -2059,16 +1871,21 @@ static int btree_split(struct btree *b, struct btree_op *op,
        closure_init_stack(&cl);
        bch_keylist_init(&parent_keys);
 
+       if (!b->level &&
+           btree_check_reserve(b, op))
+               return -EINTR;
+
        n1 = btree_node_alloc_replacement(b, true);
        if (IS_ERR(n1))
                goto err;
 
-       split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
+       split = set_blocks(btree_bset_first(n1),
+                          block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
 
        if (split) {
                unsigned keys = 0;
 
-               trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
+               trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
 
                n2 = bch_btree_node_alloc(b->c, b->level, true);
                if (IS_ERR(n2))
@@ -2087,18 +1904,20 @@ static int btree_split(struct btree *b, struct btree_op *op,
                 * search tree yet
                 */
 
-               while (keys < (n1->sets[0].data->keys * 3) / 5)
-                       keys += bkey_u64s(node(n1->sets[0].data, keys));
+               while (keys < (btree_bset_first(n1)->keys * 3) / 5)
+                       keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
+                                                       keys));
 
-               bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
-               keys += bkey_u64s(node(n1->sets[0].data, keys));
+               bkey_copy_key(&n1->key,
+                             bset_bkey_idx(btree_bset_first(n1), keys));
+               keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
 
-               n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
-               n1->sets[0].data->keys = keys;
+               btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
+               btree_bset_first(n1)->keys = keys;
 
-               memcpy(n2->sets[0].data->start,
-                      end(n1->sets[0].data),
-                      n2->sets[0].data->keys * sizeof(uint64_t));
+               memcpy(btree_bset_first(n2)->start,
+                      bset_bkey_last(btree_bset_first(n1)),
+                      btree_bset_first(n2)->keys * sizeof(uint64_t));
 
                bkey_copy_key(&n2->key, &b->key);
 
@@ -2106,7 +1925,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
                bch_btree_node_write(n2, &cl);
                rw_unlock(true, n2);
        } else {
-               trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
+               trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
 
                bch_btree_insert_keys(n1, op, insert_keys, replace_key);
        }
@@ -2149,18 +1968,21 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
        return 0;
 err_free2:
+       bkey_put(b->c, &n2->key);
        btree_node_free(n2);
        rw_unlock(true, n2);
 err_free1:
+       bkey_put(b->c, &n1->key);
        btree_node_free(n1);
        rw_unlock(true, n1);
 err:
+       WARN(1, "bcache: btree split failed");
+
        if (n3 == ERR_PTR(-EAGAIN) ||
            n2 == ERR_PTR(-EAGAIN) ||
            n1 == ERR_PTR(-EAGAIN))
                return -EAGAIN;
 
-       pr_warn("couldn't split");
        return -ENOMEM;
 }
 
@@ -2171,7 +1993,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 {
        BUG_ON(b->level && replace_key);
 
-       if (should_split(b)) {
+       if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
                if (current->bio_list) {
                        op->lock = b->c->root->level + 1;
                        return -EAGAIN;
@@ -2180,11 +2002,13 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
                        return -EINTR;
                } else {
                        /* Invalidated all iterators */
-                       return btree_split(b, op, insert_keys, replace_key) ?:
-                               -EINTR;
+                       int ret = btree_split(b, op, insert_keys, replace_key);
+
+                       return bch_keylist_empty(insert_keys) ?
+                               0 : ret ?: -EINTR;
                }
        } else {
-               BUG_ON(write_block(b) != b->sets[b->nsets].data);
+               BUG_ON(write_block(b) != btree_bset_last(b));
 
                if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
                        if (!b->level)
@@ -2323,9 +2147,9 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
                struct bkey *k;
                struct btree_iter iter;
 
-               bch_btree_iter_init(b, &iter, from);
+               bch_btree_iter_init(&b->keys, &iter, from);
 
-               while ((k = bch_btree_iter_next_filter(&iter, b,
+               while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
                                                       bch_ptr_bad))) {
                        ret = btree(map_nodes_recurse, k, b,
                                    op, from, fn, flags);
@@ -2356,9 +2180,9 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
        struct bkey *k;
        struct btree_iter iter;
 
-       bch_btree_iter_init(b, &iter, from);
+       bch_btree_iter_init(&b->keys, &iter, from);
 
-       while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) {
+       while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
                ret = !b->level
                        ? fn(op, b, k)
                        : btree(map_keys_recurse, k, b, op, from, fn, flags);
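
Taken together, the btree.c hunks above swap the old closure-as-lock for a plain semaphore that is held while a node write is in flight, plus a bare closure for the I/O itself; the semaphore is released from the closure's destructor. A condensed, hedged sketch of that lifecycle (struct and function names here are simplified placeholders, not the patch's own symbols):

/* Illustrative condensation of the pattern above; not part of the patch. */
#include <linux/kernel.h>
#include <linux/semaphore.h>
#include "closure.h"

struct node {
	struct closure		io;		/* tracks the in-flight write */
	struct semaphore	io_mutex;	/* held while a write is in flight */
};

static void node_init(struct node *n)
{
	sema_init(&n->io_mutex, 1);		/* as mca_bucket_alloc() now does */
}

static void node_write_unlock(struct closure *cl)
{
	struct node *n = container_of(cl, struct node, io);

	up(&n->io_mutex);			/* write no longer in flight */
}

static void node_write_done(struct closure *cl)
{
	/* ...free the bio, schedule any follow-up write... */
	closure_return_with_destructor(cl, node_write_unlock);
}

static void node_write(struct node *n, struct closure *parent)
{
	down(&n->io_mutex);			/* serialize against a prior write */
	closure_init(&n->io, parent);

	/* ...build and submit the bio with &n->io as bi_private... */

	continue_at_nobarrier(&n->io, node_write_done, NULL);
}

/* Waiting for an in-flight write, as mca_reap() now does: */
static void node_wait_write(struct node *n)
{
	down(&n->io_mutex);
	up(&n->io_mutex);
}
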
index 767e755708964ce82f36dc88e28281b5c1b90177..af065e97e55c4186782422db0bae20498fd3cdb8 100644 (file)
@@ -130,20 +130,12 @@ struct btree {
        unsigned long           flags;
        uint16_t                written;        /* would be nice to kill */
        uint8_t                 level;
-       uint8_t                 nsets;
-       uint8_t                 page_order;
-
-       /*
-        * Set of sorted keys - the real btree node - plus a binary search tree
-        *
-        * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
-        * to the memory we have allocated for this btree node. Additionally,
-        * set[0]->data points to the entire btree node as it exists on disk.
-        */
-       struct bset_tree        sets[MAX_BSETS];
+
+       struct btree_keys       keys;
 
        /* For outstanding btree writes, used as a lock - protects write_idx */
-       struct closure_with_waitlist    io;
+       struct closure          io;
+       struct semaphore        io_mutex;
 
        struct list_head        list;
        struct delayed_work     work;
@@ -179,24 +171,19 @@ static inline struct btree_write *btree_prev_write(struct btree *b)
        return b->writes + (btree_node_write_idx(b) ^ 1);
 }
 
-static inline unsigned bset_offset(struct btree *b, struct bset *i)
+static inline struct bset *btree_bset_first(struct btree *b)
 {
-       return (((size_t) i) - ((size_t) b->sets->data)) >> 9;
+       return b->keys.set->data;
 }
 
-static inline struct bset *write_block(struct btree *b)
+static inline struct bset *btree_bset_last(struct btree *b)
 {
-       return ((void *) b->sets[0].data) + b->written * block_bytes(b->c);
+       return bset_tree_last(&b->keys)->data;
 }
 
-static inline bool bset_written(struct btree *b, struct bset_tree *t)
+static inline unsigned bset_block_offset(struct btree *b, struct bset *i)
 {
-       return t->data < write_block(b);
-}
-
-static inline bool bkey_written(struct btree *b, struct bkey *k)
-{
-       return k < write_block(b)->start;
+       return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
 }
 
 static inline void set_gc_sectors(struct cache_set *c)
@@ -204,21 +191,6 @@ static inline void set_gc_sectors(struct cache_set *c)
        atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
 }
 
-static inline struct bkey *bch_btree_iter_init(struct btree *b,
-                                              struct btree_iter *iter,
-                                              struct bkey *search)
-{
-       return __bch_btree_iter_init(b, iter, search, b->sets);
-}
-
-static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
-{
-       if (b->level)
-               return bch_btree_ptr_invalid(b->c, k);
-       else
-               return bch_extent_ptr_invalid(b->c, k);
-}
-
 void bkey_put(struct cache_set *c, struct bkey *k);
 
 /* Looping macros */
@@ -229,17 +201,12 @@ void bkey_put(struct cache_set *c, struct bkey *k);
             iter++)                                                    \
                hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
 
-#define for_each_key_filter(b, k, iter, filter)                                \
-       for (bch_btree_iter_init((b), (iter), NULL);                    \
-            ((k) = bch_btree_iter_next_filter((iter), b, filter));)
-
-#define for_each_key(b, k, iter)                                       \
-       for (bch_btree_iter_init((b), (iter), NULL);                    \
-            ((k) = bch_btree_iter_next(iter));)
-
 /* Recursing down the btree */
 
 struct btree_op {
+       /* for waiting on btree reserve in btree_split() */
+       wait_queue_t            wait;
+
        /* Btree level at which we start taking write locks */
        short                   lock;
 
@@ -249,6 +216,7 @@ struct btree_op {
 static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
 {
        memset(op, 0, sizeof(struct btree_op));
+       init_wait(&op->wait);
        op->lock = write_lock_level;
 }
 
@@ -267,7 +235,7 @@ static inline void rw_unlock(bool w, struct btree *b)
        (w ? up_write : up_read)(&b->lock);
 }
 
-void bch_btree_node_read(struct btree *);
+void bch_btree_node_read_done(struct btree *);
 void bch_btree_node_write(struct btree *, struct closure *);
 
 void bch_btree_set_root(struct btree *);
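
With the btree-level iterator wrapper and the for_each_key* macros gone from this header, callers iterate against the embedded btree_keys directly, as the btree.c hunks above now do. A minimal sketch of that open-coded pattern; the counting function itself is only an example:

/* Illustrative only; mirrors the open-coded iteration used in btree.c above. */
#include "bset.h"
#include "btree.h"

static unsigned count_good_keys(struct btree *b)
{
	struct btree_iter iter;
	struct bkey *k;
	unsigned n = 0;

	bch_btree_iter_init(&b->keys, &iter, NULL);

	while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad)))
		n++;

	return n;
}
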
index dfff2410322e70263ac63e6dbfd9536bd4a5dc54..7a228de95fd7e94fa61acb0758a53d8670770383 100644 (file)
 
 #include "closure.h"
 
-#define CL_FIELD(type, field)                                  \
-       case TYPE_ ## type:                                     \
-       return &container_of(cl, struct type, cl)->field
-
-static struct closure_waitlist *closure_waitlist(struct closure *cl)
-{
-       switch (cl->type) {
-               CL_FIELD(closure_with_waitlist, wait);
-       default:
-               return NULL;
-       }
-}
-
 static inline void closure_put_after_sub(struct closure *cl, int flags)
 {
        int r = flags & CLOSURE_REMAINING_MASK;
@@ -42,17 +29,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
                        closure_queue(cl);
                } else {
                        struct closure *parent = cl->parent;
-                       struct closure_waitlist *wait = closure_waitlist(cl);
                        closure_fn *destructor = cl->fn;
 
                        closure_debug_destroy(cl);
 
-                       smp_mb();
-                       atomic_set(&cl->remaining, -1);
-
-                       if (wait)
-                               closure_wake_up(wait);
-
                        if (destructor)
                                destructor(cl);
 
@@ -69,19 +49,18 @@ void closure_sub(struct closure *cl, int v)
 }
 EXPORT_SYMBOL(closure_sub);
 
+/**
+ * closure_put - decrement a closure's refcount
+ */
 void closure_put(struct closure *cl)
 {
        closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
 }
 EXPORT_SYMBOL(closure_put);
 
-static void set_waiting(struct closure *cl, unsigned long f)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-       cl->waiting_on = f;
-#endif
-}
-
+/**
+ * closure_wake_up - wake up all closures on a wait list, without memory barrier
+ */
 void __closure_wake_up(struct closure_waitlist *wait_list)
 {
        struct llist_node *list;
@@ -106,27 +85,34 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
                cl = container_of(reverse, struct closure, list);
                reverse = llist_next(reverse);
 
-               set_waiting(cl, 0);
+               closure_set_waiting(cl, 0);
                closure_sub(cl, CLOSURE_WAITING + 1);
        }
 }
 EXPORT_SYMBOL(__closure_wake_up);
 
-bool closure_wait(struct closure_waitlist *list, struct closure *cl)
+/**
+ * closure_wait - add a closure to a waitlist
+ *
+ * @waitlist will own a ref on @cl, which will be released when
+ * closure_wake_up() is called on @waitlist.
+ *
+ */
+bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
 {
        if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
                return false;
 
-       set_waiting(cl, _RET_IP_);
+       closure_set_waiting(cl, _RET_IP_);
        atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
-       llist_add(&cl->list, &list->list);
+       llist_add(&cl->list, &waitlist->list);
 
        return true;
 }
 EXPORT_SYMBOL(closure_wait);
 
 /**
- * closure_sync() - sleep until a closure a closure has nothing left to wait on
+ * closure_sync - sleep until a closure has nothing left to wait on
  *
  * Sleeps until the refcount hits 1 - the thread that's running the closure owns
  * the last refcount.
@@ -148,46 +134,6 @@ void closure_sync(struct closure *cl)
 }
 EXPORT_SYMBOL(closure_sync);
 
-/**
- * closure_trylock() - try to acquire the closure, without waiting
- * @cl:                closure to lock
- *
- * Returns true if the closure was succesfully locked.
- */
-bool closure_trylock(struct closure *cl, struct closure *parent)
-{
-       if (atomic_cmpxchg(&cl->remaining, -1,
-                          CLOSURE_REMAINING_INITIALIZER) != -1)
-               return false;
-
-       smp_mb();
-
-       cl->parent = parent;
-       if (parent)
-               closure_get(parent);
-
-       closure_set_ret_ip(cl);
-       closure_debug_create(cl);
-       return true;
-}
-EXPORT_SYMBOL(closure_trylock);
-
-void __closure_lock(struct closure *cl, struct closure *parent,
-                   struct closure_waitlist *wait_list)
-{
-       struct closure wait;
-       closure_init_stack(&wait);
-
-       while (1) {
-               if (closure_trylock(cl, parent))
-                       return;
-
-               closure_wait_event(wait_list, &wait,
-                                  atomic_read(&cl->remaining) == -1);
-       }
-}
-EXPORT_SYMBOL(__closure_lock);
-
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 
 static LIST_HEAD(closure_list);
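
With the trylock/lock primitives removed, closure.c is left with the waitlist machinery alone. A hedged sketch of the basic synchronous use of what remains; the waitlist name and the two sides are illustrative only:

/* Illustrative only: park a stack closure on a waitlist, wake it elsewhere. */
#include "closure.h"

static struct closure_waitlist example_waitlist;

static void waiter_side(void)
{
	struct closure cl;

	closure_init_stack(&cl);

	/* closure_wait() takes a ref that closure_wake_up() will drop */
	if (closure_wait(&example_waitlist, &cl))
		closure_sync(&cl);	/* sleep until woken */
}

static void waker_side(void)
{
	closure_wake_up(&example_waitlist);
}
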
index 9762f1be3304f1349cc21f001cfa02d5401470e9..7ef7461912be252d00ce23b79504664e96ffd84f 100644 (file)
  * closure - _always_ use continue_at(). Doing so consistently will help
  * eliminate an entire class of particularly pernicious races.
  *
- * For a closure to wait on an arbitrary event, we need to introduce waitlists:
- *
- * struct closure_waitlist list;
- * closure_wait_event(list, cl, condition);
- * closure_wake_up(wait_list);
- *
- * These work analagously to wait_event() and wake_up() - except that instead of
- * operating on the current thread (for wait_event()) and lists of threads, they
- * operate on an explicit closure and lists of closures.
- *
- * Because it's a closure we can now wait either synchronously or
- * asynchronously. closure_wait_event() returns the current value of the
- * condition, and if it returned false continue_at() or closure_sync() can be
- * used to wait for it to become true.
- *
- * It's useful for waiting on things when you can't sleep in the context in
- * which you must check the condition (perhaps a spinlock held, or you might be
- * beneath generic_make_request() - in which case you can't sleep on IO).
- *
- * closure_wait_event() will wait either synchronously or asynchronously,
- * depending on whether the closure is in blocking mode or not. You can pick a
- * mode explicitly with closure_wait_event_sync() and
- * closure_wait_event_async(), which do just what you might expect.
- *
  * Lastly, you might have a wait list dedicated to a specific event, and have no
  * need for specifying the condition - you just want to wait until someone runs
  * closure_wake_up() on the appropriate wait list. In that case, just use
  * All this implies that a closure should typically be embedded in a particular
  * struct (which its refcount will normally control the lifetime of), and that
  * struct can very much be thought of as a stack frame.
- *
- * Locking:
- *
- * Closures are based on work items but they can be thought of as more like
- * threads - in that like threads and unlike work items they have a well
- * defined lifetime; they are created (with closure_init()) and eventually
- * complete after a continue_at(cl, NULL, NULL).
- *
- * Suppose you've got some larger structure with a closure embedded in it that's
- * used for periodically doing garbage collection. You only want one garbage
- * collection happening at a time, so the natural thing to do is protect it with
- * a lock. However, it's difficult to use a lock protecting a closure correctly
- * because the unlock should come after the last continue_to() (additionally, if
- * you're using the closure asynchronously a mutex won't work since a mutex has
- * to be unlocked by the same process that locked it).
- *
- * So to make it less error prone and more efficient, we also have the ability
- * to use closures as locks:
- *
- * closure_init_unlocked();
- * closure_trylock();
- *
- * That's all we need for trylock() - the last closure_put() implicitly unlocks
- * it for you.  But for closure_lock(), we also need a wait list:
- *
- * struct closure_with_waitlist frobnicator_cl;
- *
- * closure_init_unlocked(&frobnicator_cl);
- * closure_lock(&frobnicator_cl);
- *
- * A closure_with_waitlist embeds a closure and a wait list - much like struct
- * delayed_work embeds a work item and a timer_list. The important thing is, use
- * it exactly like you would a regular closure and closure_put() will magically
- * handle everything for you.
  */
 
 struct closure;
@@ -164,12 +106,6 @@ struct closure_waitlist {
        struct llist_head       list;
 };
 
-enum closure_type {
-       TYPE_closure                            = 0,
-       TYPE_closure_with_waitlist              = 1,
-       MAX_CLOSURE_TYPE                        = 1,
-};
-
 enum closure_state {
        /*
         * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
@@ -224,8 +160,6 @@ struct closure {
 
        atomic_t                remaining;
 
-       enum closure_type       type;
-
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 #define CLOSURE_MAGIC_DEAD     0xc054dead
 #define CLOSURE_MAGIC_ALIVE    0xc054a11e
@@ -237,34 +171,12 @@ struct closure {
 #endif
 };
 
-struct closure_with_waitlist {
-       struct closure          cl;
-       struct closure_waitlist wait;
-};
-
-extern unsigned invalid_closure_type(void);
-
-#define __CLOSURE_TYPE(cl, _t)                                         \
-         __builtin_types_compatible_p(typeof(cl), struct _t)           \
-               ? TYPE_ ## _t :                                         \
-
-#define __closure_type(cl)                                             \
-(                                                                      \
-       __CLOSURE_TYPE(cl, closure)                                     \
-       __CLOSURE_TYPE(cl, closure_with_waitlist)                       \
-       invalid_closure_type()                                          \
-)
-
 void closure_sub(struct closure *cl, int v);
 void closure_put(struct closure *cl);
 void __closure_wake_up(struct closure_waitlist *list);
 bool closure_wait(struct closure_waitlist *list, struct closure *cl);
 void closure_sync(struct closure *cl);
 
-bool closure_trylock(struct closure *cl, struct closure *parent);
-void __closure_lock(struct closure *cl, struct closure *parent,
-                   struct closure_waitlist *wait_list);
-
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 
 void closure_debug_init(void);
@@ -293,134 +205,97 @@ static inline void closure_set_ret_ip(struct closure *cl)
 #endif
 }
 
-static inline void closure_get(struct closure *cl)
+static inline void closure_set_waiting(struct closure *cl, unsigned long f)
 {
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-       BUG_ON((atomic_inc_return(&cl->remaining) &
-               CLOSURE_REMAINING_MASK) <= 1);
-#else
-       atomic_inc(&cl->remaining);
+       cl->waiting_on = f;
 #endif
 }
 
-static inline void closure_set_stopped(struct closure *cl)
+static inline void __closure_end_sleep(struct closure *cl)
 {
-       atomic_sub(CLOSURE_RUNNING, &cl->remaining);
+       __set_current_state(TASK_RUNNING);
+
+       if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
+               atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
 }
 
-static inline bool closure_is_unlocked(struct closure *cl)
+static inline void __closure_start_sleep(struct closure *cl)
 {
-       return atomic_read(&cl->remaining) == -1;
+       closure_set_ip(cl);
+       cl->task = current;
+       set_current_state(TASK_UNINTERRUPTIBLE);
+
+       if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
+               atomic_add(CLOSURE_SLEEPING, &cl->remaining);
 }
 
-static inline void do_closure_init(struct closure *cl, struct closure *parent,
-                                  bool running)
+static inline void closure_set_stopped(struct closure *cl)
 {
-       cl->parent = parent;
-       if (parent)
-               closure_get(parent);
-
-       if (running) {
-               closure_debug_create(cl);
-               atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
-       } else
-               atomic_set(&cl->remaining, -1);
+       atomic_sub(CLOSURE_RUNNING, &cl->remaining);
+}
 
+static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
+                                 struct workqueue_struct *wq)
+{
+       BUG_ON(object_is_on_stack(cl));
        closure_set_ip(cl);
+       cl->fn = fn;
+       cl->wq = wq;
+       /* between atomic_dec() in closure_put() */
+       smp_mb__before_atomic_dec();
 }
 
-/*
- * Hack to get at the embedded closure if there is one, by doing an unsafe cast:
- * the result of __closure_type() is thrown away, it's used merely for type
- * checking.
- */
-#define __to_internal_closure(cl)                              \
-({                                                             \
-       BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE);   \
-       (struct closure *) cl;                                  \
-})
-
-#define closure_init_type(cl, parent, running)                 \
-do {                                                           \
-       struct closure *_cl = __to_internal_closure(cl);        \
-       _cl->type = __closure_type(*(cl));                      \
-       do_closure_init(_cl, parent, running);                  \
-} while (0)
+static inline void closure_queue(struct closure *cl)
+{
+       struct workqueue_struct *wq = cl->wq;
+       if (wq) {
+               INIT_WORK(&cl->work, cl->work.func);
+               BUG_ON(!queue_work(wq, &cl->work));
+       } else
+               cl->fn(cl);
+}
 
 /**
- * __closure_init() - Initialize a closure, skipping the memset()
- *
- * May be used instead of closure_init() when memory has already been zeroed.
+ * closure_get - increment a closure's refcount
  */
-#define __closure_init(cl, parent)                             \
-       closure_init_type(cl, parent, true)
+static inline void closure_get(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+       BUG_ON((atomic_inc_return(&cl->remaining) &
+               CLOSURE_REMAINING_MASK) <= 1);
+#else
+       atomic_inc(&cl->remaining);
+#endif
+}
 
 /**
- * closure_init() - Initialize a closure, setting the refcount to 1
+ * closure_init - Initialize a closure, setting the refcount to 1
  * @cl:                closure to initialize
  * @parent:    parent of the new closure. cl will take a refcount on it for its
  *             lifetime; may be NULL.
  */
-#define closure_init(cl, parent)                               \
-do {                                                           \
-       memset((cl), 0, sizeof(*(cl)));                         \
-       __closure_init(cl, parent);                             \
-} while (0)
-
-static inline void closure_init_stack(struct closure *cl)
+static inline void closure_init(struct closure *cl, struct closure *parent)
 {
        memset(cl, 0, sizeof(struct closure));
-       atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
-}
-
-/**
- * closure_init_unlocked() - Initialize a closure but leave it unlocked.
- * @cl:                closure to initialize
- *
- * For when the closure will be used as a lock. The closure may not be used
- * until after a closure_lock() or closure_trylock().
- */
-#define closure_init_unlocked(cl)                              \
-do {                                                           \
-       memset((cl), 0, sizeof(*(cl)));                         \
-       closure_init_type(cl, NULL, false);                     \
-} while (0)
-
-/**
- * closure_lock() - lock and initialize a closure.
- * @cl:                the closure to lock
- * @parent:    the new parent for this closure
- *
- * The closure must be of one of the types that has a waitlist (otherwise we
- * wouldn't be able to sleep on contention).
- *
- * @parent has exactly the same meaning as in closure_init(); if non null, the
- * closure will take a reference on @parent which will be released when it is
- * unlocked.
- */
-#define closure_lock(cl, parent)                               \
-       __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
+       cl->parent = parent;
+       if (parent)
+               closure_get(parent);
 
-static inline void __closure_end_sleep(struct closure *cl)
-{
-       __set_current_state(TASK_RUNNING);
+       atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
 
-       if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
-               atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
+       closure_debug_create(cl);
+       closure_set_ip(cl);
 }
 
-static inline void __closure_start_sleep(struct closure *cl)
+static inline void closure_init_stack(struct closure *cl)
 {
-       closure_set_ip(cl);
-       cl->task = current;
-       set_current_state(TASK_UNINTERRUPTIBLE);
-
-       if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
-               atomic_add(CLOSURE_SLEEPING, &cl->remaining);
+       memset(cl, 0, sizeof(struct closure));
+       atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
 }
 
 /**
- * closure_wake_up() - wake up all closures on a wait list.
+ * closure_wake_up - wake up all closures on a wait list.
  */
 static inline void closure_wake_up(struct closure_waitlist *list)
 {
@@ -428,69 +303,19 @@ static inline void closure_wake_up(struct closure_waitlist *list)
        __closure_wake_up(list);
 }
 
-/*
- * Wait on an event, synchronously or asynchronously - analogous to wait_event()
- * but for closures.
- *
- * The loop is oddly structured so as to avoid a race; we must check the
- * condition again after we've added ourself to the waitlist. We know if we were
- * already on the waitlist because closure_wait() returns false; thus, we only
- * schedule or break if closure_wait() returns false. If it returns true, we
- * just loop again - rechecking the condition.
- *
- * The __closure_wake_up() is necessary because we may race with the event
- * becoming true; i.e. we see event false -> wait -> recheck condition, but the
- * thread that made the event true may have called closure_wake_up() before we
- * added ourself to the wait list.
- *
- * We have to call closure_sync() at the end instead of just
- * __closure_end_sleep() because a different thread might've called
- * closure_wake_up() before us and gotten preempted before they dropped the
- * refcount on our closure. If this was a stack allocated closure, that would be
- * bad.
+/**
+ * continue_at - jump to another function with barrier
+ *
+ * After @cl is no longer waiting on anything (i.e. all outstanding refs have
+ * been dropped with closure_put()), it will resume execution at @fn running out
+ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
+ *
+ * NOTE: This macro expands to a return in the calling function!
+ *
+ * This is because after calling continue_at() you no longer have a ref on @cl,
+ * and whatever @cl owns may be freed out from under you - a running closure fn
+ * has a ref on its own closure which continue_at() drops.
  */
-#define closure_wait_event(list, cl, condition)                                \
-({                                                                     \
-       typeof(condition) ret;                                          \
-                                                                       \
-       while (1) {                                                     \
-               ret = (condition);                                      \
-               if (ret) {                                              \
-                       __closure_wake_up(list);                        \
-                       closure_sync(cl);                               \
-                       break;                                          \
-               }                                                       \
-                                                                       \
-               __closure_start_sleep(cl);                              \
-                                                                       \
-               if (!closure_wait(list, cl))                            \
-                       schedule();                                     \
-       }                                                               \
-                                                                       \
-       ret;                                                            \
-})
-
-static inline void closure_queue(struct closure *cl)
-{
-       struct workqueue_struct *wq = cl->wq;
-       if (wq) {
-               INIT_WORK(&cl->work, cl->work.func);
-               BUG_ON(!queue_work(wq, &cl->work));
-       } else
-               cl->fn(cl);
-}
-
-static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
-                                 struct workqueue_struct *wq)
-{
-       BUG_ON(object_is_on_stack(cl));
-       closure_set_ip(cl);
-       cl->fn = fn;
-       cl->wq = wq;
-       /* between atomic_dec() in closure_put() */
-       smp_mb__before_atomic_dec();
-}
-
 #define continue_at(_cl, _fn, _wq)                                     \
 do {                                                                   \
        set_closure_fn(_cl, _fn, _wq);                                  \
@@ -498,8 +323,28 @@ do {                                                                       \
        return;                                                         \
 } while (0)
 
+/**
+ * closure_return - finish execution of a closure
+ *
+ * This is used to indicate that @cl is finished: when all outstanding refs on
+ * @cl have been dropped, @cl's ref on its parent closure (as passed to
+ * closure_init()) will be dropped, if one was specified - thus this can be
+ * thought of as returning to the parent closure.
+ */
 #define closure_return(_cl)    continue_at((_cl), NULL, NULL)
 
+/**
+ * continue_at_nobarrier - jump to another function without barrier
+ *
+ * Causes @fn to be executed out of @cl, in @wq context (or called directly if
+ * @wq is NULL).
+ *
+ * NOTE: like continue_at(), this macro expands to a return in the caller!
+ *
+ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
+ * thus it's not safe to touch anything protected by @cl after a
+ * continue_at_nobarrier().
+ */
 #define continue_at_nobarrier(_cl, _fn, _wq)                           \
 do {                                                                   \
        set_closure_fn(_cl, _fn, _wq);                                  \
@@ -507,6 +352,15 @@ do {                                                                       \
        return;                                                         \
 } while (0)
 
+/**
+ * closure_return_with_destructor - finish execution of a closure, with destructor
+ *
+ * Works like closure_return(), except @destructor will be called when all
+ * outstanding refs on @cl have been dropped; @destructor may be used to safely
+ * free the memory occupied by @cl, and it is called with the ref on the parent
+ * closure still held - so @destructor could safely return an item to a
+ * freelist protected by @cl's parent.
+ */
 #define closure_return_with_destructor(_cl, _destructor)               \
 do {                                                                   \
        set_closure_fn(_cl, _destructor, NULL);                         \
@@ -514,6 +368,13 @@ do {                                                                       \
        return;                                                         \
 } while (0)
 
+/**
+ * closure_call - execute @fn out of a new, uninitialized closure
+ *
+ * Typically used when running out of one closure, and we want to run @fn
+ * asynchronously out of a new closure - @parent will then wait for @cl to
+ * finish.
+ */
 static inline void closure_call(struct closure *cl, closure_fn fn,
                                struct workqueue_struct *wq,
                                struct closure *parent)
@@ -522,12 +383,4 @@ static inline void closure_call(struct closure *cl, closure_fn fn,
        continue_at_nobarrier(cl, fn, wq);
 }
 
-static inline void closure_trylock_call(struct closure *cl, closure_fn fn,
-                                       struct workqueue_struct *wq,
-                                       struct closure *parent)
-{
-       if (closure_trylock(cl, parent))
-               continue_at_nobarrier(cl, fn, wq);
-}
-
 #endif /* _LINUX_CLOSURE_H */
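A minimal usage sketch of the closure API documented above (editor's illustration,
not part of this commit; the example_* names are made up and system_wq is just a
convenient workqueue):

	static void example_write_done(struct closure *cl)
	{
		/* runs once every ref taken during example_write() is dropped */
		closure_return(cl);	/* drops the ref on the parent closure */
	}

	static void example_write(struct closure *cl)
	{
		/*
		 * submit I/O here, taking refs with closure_get(cl) that the
		 * completion handlers drop with closure_put(cl)
		 */
		continue_at(cl, example_write_done, system_wq);	/* expands to return */
	}

	/* caller, running out of some parent closure: */
	closure_call(cl, example_write, NULL, parent);	/* cl: new closure, parent: caller's */
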
index 03cb4d114e1624997d114a3ac4a131d80d3d90fb..8b1f1d5c18198f078dea917f40303c98fce1ee94 100644 (file)
@@ -8,6 +8,7 @@
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
+#include "extents.h"
 
 #include <linux/console.h>
 #include <linux/debugfs.h>
 
 static struct dentry *debug;
 
-const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
-{
-       unsigned i;
-
-       for (i = 0; i < KEY_PTRS(k); i++)
-               if (ptr_available(c, k, i)) {
-                       struct cache *ca = PTR_CACHE(c, k, i);
-                       size_t bucket = PTR_BUCKET_NR(c, k, i);
-                       size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
-
-                       if (KEY_SIZE(k) + r > c->sb.bucket_size)
-                               return "bad, length too big";
-                       if (bucket <  ca->sb.first_bucket)
-                               return "bad, short offset";
-                       if (bucket >= ca->sb.nbuckets)
-                               return "bad, offset past end of device";
-                       if (ptr_stale(c, k, i))
-                               return "stale";
-               }
-
-       if (!bkey_cmp(k, &ZERO_KEY))
-               return "bad, null key";
-       if (!KEY_PTRS(k))
-               return "bad, no pointers";
-       if (!KEY_SIZE(k))
-               return "zeroed key";
-       return "";
-}
-
-int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
-{
-       unsigned i = 0;
-       char *out = buf, *end = buf + size;
-
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
-       p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));
-
-       if (KEY_PTRS(k))
-               while (1) {
-                       p("%llu:%llu gen %llu",
-                         PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i));
-
-                       if (++i == KEY_PTRS(k))
-                               break;
-
-                       p(", ");
-               }
-
-       p("]");
-
-       if (KEY_DIRTY(k))
-               p(" dirty");
-       if (KEY_CSUM(k))
-               p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
-#undef p
-       return out - buf;
-}
-
 #ifdef CONFIG_BCACHE_DEBUG
 
-static void dump_bset(struct btree *b, struct bset *i)
-{
-       struct bkey *k, *next;
-       unsigned j;
-       char buf[80];
-
-       for (k = i->start; k < end(i); k = next) {
-               next = bkey_next(k);
-
-               bch_bkey_to_text(buf, sizeof(buf), k);
-               printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
-                      (uint64_t *) k - i->d, i->keys, buf);
-
-               for (j = 0; j < KEY_PTRS(k); j++) {
-                       size_t n = PTR_BUCKET_NR(b->c, k, j);
-                       printk(" bucket %zu", n);
-
-                       if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
-                               printk(" prio %i",
-                                      PTR_BUCKET(b->c, k, j)->prio);
-               }
+#define for_each_written_bset(b, start, i)                             \
+       for (i = (start);                                               \
+            (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\
+            i->seq == (start)->seq;                                    \
+            i = (void *) i + set_blocks(i, block_bytes(b->c)) *        \
+                block_bytes(b->c))
 
-               printk(" %s\n", bch_ptr_status(b->c, k));
-
-               if (next < end(i) &&
-                   bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0)
-                       printk(KERN_ERR "Key skipped backwards\n");
-       }
-}
-
-static void bch_dump_bucket(struct btree *b)
-{
-       unsigned i;
-
-       console_lock();
-       for (i = 0; i <= b->nsets; i++)
-               dump_bset(b, b->sets[i].data);
-       console_unlock();
-}
-
-void bch_btree_verify(struct btree *b, struct bset *new)
+void bch_btree_verify(struct btree *b)
 {
        struct btree *v = b->c->verify_data;
-       struct closure cl;
-       closure_init_stack(&cl);
+       struct bset *ondisk, *sorted, *inmemory;
+       struct bio *bio;
 
-       if (!b->c->verify)
+       if (!b->c->verify || !b->c->verify_ondisk)
                return;
 
-       closure_wait_event(&b->io.wait, &cl,
-                          atomic_read(&b->io.cl.remaining) == -1);
-
+       down(&b->io_mutex);
        mutex_lock(&b->c->verify_lock);
 
+       ondisk = b->c->verify_ondisk;
+       sorted = b->c->verify_data->keys.set->data;
+       inmemory = b->keys.set->data;
+
        bkey_copy(&v->key, &b->key);
        v->written = 0;
        v->level = b->level;
+       v->keys.ops = b->keys.ops;
+
+       bio = bch_bbio_alloc(b->c);
+       bio->bi_bdev            = PTR_CACHE(b->c, &b->key, 0)->bdev;
+       bio->bi_iter.bi_sector  = PTR_OFFSET(&b->key, 0);
+       bio->bi_iter.bi_size    = KEY_SIZE(&v->key) << 9;
+       bch_bio_map(bio, sorted);
 
-       bch_btree_node_read(v);
-       closure_wait_event(&v->io.wait, &cl,
-                          atomic_read(&b->io.cl.remaining) == -1);
+       submit_bio_wait(REQ_META|READ_SYNC, bio);
+       bch_bbio_free(bio, b->c);
 
-       if (new->keys != v->sets[0].data->keys ||
-           memcmp(new->start,
-                  v->sets[0].data->start,
-                  (void *) end(new) - (void *) new->start)) {
-               unsigned i, j;
+       memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9);
+
+       bch_btree_node_read_done(v);
+       sorted = v->keys.set->data;
+
+       if (inmemory->keys != sorted->keys ||
+           memcmp(inmemory->start,
+                  sorted->start,
+                  (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) {
+               struct bset *i;
+               unsigned j;
 
                console_lock();
 
-               printk(KERN_ERR "*** original memory node:\n");
-               for (i = 0; i <= b->nsets; i++)
-                       dump_bset(b, b->sets[i].data);
+               printk(KERN_ERR "*** in memory:\n");
+               bch_dump_bset(&b->keys, inmemory, 0);
 
-               printk(KERN_ERR "*** sorted memory node:\n");
-               dump_bset(b, new);
+               printk(KERN_ERR "*** read back in:\n");
+               bch_dump_bset(&v->keys, sorted, 0);
 
-               printk(KERN_ERR "*** on disk node:\n");
-               dump_bset(v, v->sets[0].data);
+               for_each_written_bset(b, ondisk, i) {
+                       unsigned block = ((void *) i - (void *) ondisk) /
+                               block_bytes(b->c);
+
+                       printk(KERN_ERR "*** on disk block %u:\n", block);
+                       bch_dump_bset(&b->keys, i, block);
+               }
 
-               for (j = 0; j < new->keys; j++)
-                       if (new->d[j] != v->sets[0].data->d[j])
+               printk(KERN_ERR "*** block %zu not written\n",
+                      ((void *) i - (void *) ondisk) / block_bytes(b->c));
+
+               for (j = 0; j < inmemory->keys; j++)
+                       if (inmemory->d[j] != sorted->d[j])
                                break;
 
+               printk(KERN_ERR "b->written %u\n", b->written);
+
                console_unlock();
                panic("verify failed at %u\n", j);
        }
 
        mutex_unlock(&b->c->verify_lock);
+       up(&b->io_mutex);
 }
 
 void bch_data_verify(struct cached_dev *dc, struct bio *bio)
@@ -207,74 +140,6 @@ out_put:
        bio_put(check);
 }
 
-int __bch_count_data(struct btree *b)
-{
-       unsigned ret = 0;
-       struct btree_iter iter;
-       struct bkey *k;
-
-       if (!b->level)
-               for_each_key(b, k, &iter)
-                       ret += KEY_SIZE(k);
-       return ret;
-}
-
-void __bch_check_keys(struct btree *b, const char *fmt, ...)
-{
-       va_list args;
-       struct bkey *k, *p = NULL;
-       struct btree_iter iter;
-       const char *err;
-
-       for_each_key(b, k, &iter) {
-               if (!b->level) {
-                       err = "Keys out of order";
-                       if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
-                               goto bug;
-
-                       if (bch_ptr_invalid(b, k))
-                               continue;
-
-                       err =  "Overlapping keys";
-                       if (p && bkey_cmp(p, &START_KEY(k)) > 0)
-                               goto bug;
-               } else {
-                       if (bch_ptr_bad(b, k))
-                               continue;
-
-                       err = "Duplicate keys";
-                       if (p && !bkey_cmp(p, k))
-                               goto bug;
-               }
-               p = k;
-       }
-
-       err = "Key larger than btree node key";
-       if (p && bkey_cmp(p, &b->key) > 0)
-               goto bug;
-
-       return;
-bug:
-       bch_dump_bucket(b);
-
-       va_start(args, fmt);
-       vprintk(fmt, args);
-       va_end(args);
-
-       panic("bcache error: %s:\n", err);
-}
-
-void bch_btree_iter_next_check(struct btree_iter *iter)
-{
-       struct bkey *k = iter->data->k, *next = bkey_next(k);
-
-       if (next < iter->data->end &&
-           bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) {
-               bch_dump_bucket(iter->b);
-               panic("Key skipped backwards\n");
-       }
-}
-
 #endif
 
 #ifdef CONFIG_DEBUG_FS
@@ -321,7 +186,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
                if (!w)
                        break;
 
-               bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key);
+               bch_extent_to_text(kbuf, sizeof(kbuf), &w->key);
                i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
                bch_keybuf_del(&i->keys, w);
        }
index 2ede60e3187475d114004111ef70c2ebaf4f38af..1f63c195d2476ece1b0c50b6acdddd1a1d92ba8b 100644 (file)
@@ -1,47 +1,30 @@
 #ifndef _BCACHE_DEBUG_H
 #define _BCACHE_DEBUG_H
 
-/* Btree/bkey debug printing */
-
-int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k);
+struct bio;
+struct cached_dev;
+struct cache_set;
 
 #ifdef CONFIG_BCACHE_DEBUG
 
-void bch_btree_verify(struct btree *, struct bset *);
+void bch_btree_verify(struct btree *);
 void bch_data_verify(struct cached_dev *, struct bio *);
-int __bch_count_data(struct btree *);
-void __bch_check_keys(struct btree *, const char *, ...);
-void bch_btree_iter_next_check(struct btree_iter *);
 
-#define EBUG_ON(cond)                  BUG_ON(cond)
 #define expensive_debug_checks(c)      ((c)->expensive_debug_checks)
 #define key_merging_disabled(c)                ((c)->key_merging_disabled)
 #define bypass_torture_test(d)         ((d)->bypass_torture_test)
 
 #else /* DEBUG */
 
-static inline void bch_btree_verify(struct btree *b, struct bset *i) {}
+static inline void bch_btree_verify(struct btree *b) {}
 static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
-static inline int __bch_count_data(struct btree *b) { return -1; }
-static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {}
-static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
 
-#define EBUG_ON(cond)                  do { if (cond); } while (0)
 #define expensive_debug_checks(c)      0
 #define key_merging_disabled(c)                0
 #define bypass_torture_test(d)         0
 
 #endif
 
-#define bch_count_data(b)                                              \
-       (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1)
-
-#define bch_check_keys(b, ...)                                         \
-do {                                                                   \
-       if (expensive_debug_checks((b)->c))                             \
-               __bch_check_keys(b, __VA_ARGS__);                       \
-} while (0)
-
 #ifdef CONFIG_DEBUG_FS
 void bch_debug_init_cache_set(struct cache_set *);
 #else
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
new file mode 100644 (file)
index 0000000..c3ead58
--- /dev/null
@@ -0,0 +1,616 @@
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * Uses a block device as cache for other block devices; optimized for SSDs.
+ * All allocation is done in buckets, which should match the erase block size
+ * of the device.
+ *
+ * Buckets containing cached data are kept on a heap sorted by priority;
+ * bucket priority is increased on cache hit, and periodically all the buckets
+ * on the heap have their priority scaled down. This currently is just used as
+ * an LRU but in the future should allow for more intelligent heuristics.
+ *
+ * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
+ * counter. Garbage collection is used to remove stale pointers.
+ *
+ * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
+ * as keys are inserted we only sort the pages that have not yet been written.
+ * When garbage collection is run, we resort the entire node.
+ *
+ * All configuration is done via sysfs; see Documentation/bcache.txt.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "extents.h"
+#include "writeback.h"
+
+static void sort_key_next(struct btree_iter *iter,
+                         struct btree_iter_set *i)
+{
+       i->k = bkey_next(i->k);
+
+       if (i->k == i->end)
+               *i = iter->data[--iter->used];
+}
+
+static bool bch_key_sort_cmp(struct btree_iter_set l,
+                            struct btree_iter_set r)
+{
+       int64_t c = bkey_cmp(l.k, r.k);
+
+       return c ? c > 0 : l.k < r.k;
+}
+
+static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
+{
+       unsigned i;
+
+       for (i = 0; i < KEY_PTRS(k); i++)
+               if (ptr_available(c, k, i)) {
+                       struct cache *ca = PTR_CACHE(c, k, i);
+                       size_t bucket = PTR_BUCKET_NR(c, k, i);
+                       size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
+
+                       if (KEY_SIZE(k) + r > c->sb.bucket_size ||
+                           bucket <  ca->sb.first_bucket ||
+                           bucket >= ca->sb.nbuckets)
+                               return true;
+               }
+
+       return false;
+}
+
+/* Common among btree and extent ptrs */
+
+static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
+{
+       unsigned i;
+
+       for (i = 0; i < KEY_PTRS(k); i++)
+               if (ptr_available(c, k, i)) {
+                       struct cache *ca = PTR_CACHE(c, k, i);
+                       size_t bucket = PTR_BUCKET_NR(c, k, i);
+                       size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
+
+                       if (KEY_SIZE(k) + r > c->sb.bucket_size)
+                               return "bad, length too big";
+                       if (bucket <  ca->sb.first_bucket)
+                               return "bad, short offset";
+                       if (bucket >= ca->sb.nbuckets)
+                               return "bad, offset past end of device";
+                       if (ptr_stale(c, k, i))
+                               return "stale";
+               }
+
+       if (!bkey_cmp(k, &ZERO_KEY))
+               return "bad, null key";
+       if (!KEY_PTRS(k))
+               return "bad, no pointers";
+       if (!KEY_SIZE(k))
+               return "zeroed key";
+       return "";
+}
+
+void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
+{
+       unsigned i = 0;
+       char *out = buf, *end = buf + size;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+       p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k));
+
+       for (i = 0; i < KEY_PTRS(k); i++) {
+               if (i)
+                       p(", ");
+
+               if (PTR_DEV(k, i) == PTR_CHECK_DEV)
+                       p("check dev");
+               else
+                       p("%llu:%llu gen %llu", PTR_DEV(k, i),
+                         PTR_OFFSET(k, i), PTR_GEN(k, i));
+       }
+
+       p("]");
+
+       if (KEY_DIRTY(k))
+               p(" dirty");
+       if (KEY_CSUM(k))
+               p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
+#undef p
+}
+
+static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
+{
+       struct btree *b = container_of(keys, struct btree, keys);
+       unsigned j;
+       char buf[80];
+
+       bch_extent_to_text(buf, sizeof(buf), k);
+       printk(" %s", buf);
+
+       for (j = 0; j < KEY_PTRS(k); j++) {
+               size_t n = PTR_BUCKET_NR(b->c, k, j);
+               printk(" bucket %zu", n);
+
+               if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
+                       printk(" prio %i",
+                              PTR_BUCKET(b->c, k, j)->prio);
+       }
+
+       printk(" %s\n", bch_ptr_status(b->c, k));
+}
+
+/* Btree ptrs */
+
+bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
+{
+       char buf[80];
+
+       if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
+               goto bad;
+
+       if (__ptr_invalid(c, k))
+               goto bad;
+
+       return false;
+bad:
+       bch_extent_to_text(buf, sizeof(buf), k);
+       cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
+       return true;
+}
+
+static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
+{
+       struct btree *b = container_of(bk, struct btree, keys);
+       return __bch_btree_ptr_invalid(b->c, k);
+}
+
+static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
+{
+       unsigned i;
+       char buf[80];
+       struct bucket *g;
+
+       if (mutex_trylock(&b->c->bucket_lock)) {
+               for (i = 0; i < KEY_PTRS(k); i++)
+                       if (ptr_available(b->c, k, i)) {
+                               g = PTR_BUCKET(b->c, k, i);
+
+                               if (KEY_DIRTY(k) ||
+                                   g->prio != BTREE_PRIO ||
+                                   (b->c->gc_mark_valid &&
+                                    GC_MARK(g) != GC_MARK_METADATA))
+                                       goto err;
+                       }
+
+               mutex_unlock(&b->c->bucket_lock);
+       }
+
+       return false;
+err:
+       mutex_unlock(&b->c->bucket_lock);
+       bch_extent_to_text(buf, sizeof(buf), k);
+       btree_bug(b,
+"inconsistent btree pointer %s: bucket %li pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
+                 buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
+                 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
+       return true;
+}
+
+static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
+{
+       struct btree *b = container_of(bk, struct btree, keys);
+       unsigned i;
+
+       if (!bkey_cmp(k, &ZERO_KEY) ||
+           !KEY_PTRS(k) ||
+           bch_ptr_invalid(bk, k))
+               return true;
+
+       for (i = 0; i < KEY_PTRS(k); i++)
+               if (!ptr_available(b->c, k, i) ||
+                   ptr_stale(b->c, k, i))
+                       return true;
+
+       if (expensive_debug_checks(b->c) &&
+           btree_ptr_bad_expensive(b, k))
+               return true;
+
+       return false;
+}
+
+static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
+                                      struct bkey *insert,
+                                      struct btree_iter *iter,
+                                      struct bkey *replace_key)
+{
+       struct btree *b = container_of(bk, struct btree, keys);
+
+       if (!KEY_OFFSET(insert))
+               btree_current_write(b)->prio_blocked++;
+
+       return false;
+}
+
+const struct btree_keys_ops bch_btree_keys_ops = {
+       .sort_cmp       = bch_key_sort_cmp,
+       .insert_fixup   = bch_btree_ptr_insert_fixup,
+       .key_invalid    = bch_btree_ptr_invalid,
+       .key_bad        = bch_btree_ptr_bad,
+       .key_to_text    = bch_extent_to_text,
+       .key_dump       = bch_bkey_dump,
+};
+
+/* Extents */
+
+/*
+ * Returns true if l > r - unless l == r, in which case returns true if l is
+ * older than r.
+ *
+ * Necessary for btree_sort_fixup() - if there are multiple keys that compare
+ * equal in different sets, we have to process them newest to oldest.
+ */
+static bool bch_extent_sort_cmp(struct btree_iter_set l,
+                               struct btree_iter_set r)
+{
+       int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
+
+       return c ? c > 0 : l.k < r.k;
+}
+
+static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
+                                         struct bkey *tmp)
+{
+       while (iter->used > 1) {
+               struct btree_iter_set *top = iter->data, *i = top + 1;
+
+               if (iter->used > 2 &&
+                   bch_extent_sort_cmp(i[0], i[1]))
+                       i++;
+
+               if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
+                       break;
+
+               if (!KEY_SIZE(i->k)) {
+                       sort_key_next(iter, i);
+                       heap_sift(iter, i - top, bch_extent_sort_cmp);
+                       continue;
+               }
+
+               if (top->k > i->k) {
+                       if (bkey_cmp(top->k, i->k) >= 0)
+                               sort_key_next(iter, i);
+                       else
+                               bch_cut_front(top->k, i->k);
+
+                       heap_sift(iter, i - top, bch_extent_sort_cmp);
+               } else {
+                       /* can't happen because of comparison func */
+                       BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
+
+                       if (bkey_cmp(i->k, top->k) < 0) {
+                               bkey_copy(tmp, top->k);
+
+                               bch_cut_back(&START_KEY(i->k), tmp);
+                               bch_cut_front(i->k, top->k);
+                               heap_sift(iter, 0, bch_extent_sort_cmp);
+
+                               return tmp;
+                       } else {
+                               bch_cut_back(&START_KEY(i->k), top->k);
+                       }
+               }
+       }
+
+       return NULL;
+}
+
+static bool bch_extent_insert_fixup(struct btree_keys *b,
+                                   struct bkey *insert,
+                                   struct btree_iter *iter,
+                                   struct bkey *replace_key)
+{
+       struct cache_set *c = container_of(b, struct btree, keys)->c;
+
+       void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
+       {
+               if (KEY_DIRTY(k))
+                       bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
+                                                    offset, -sectors);
+       }
+
+       uint64_t old_offset;
+       unsigned old_size, sectors_found = 0;
+
+       BUG_ON(!KEY_OFFSET(insert));
+       BUG_ON(!KEY_SIZE(insert));
+
+       while (1) {
+               struct bkey *k = bch_btree_iter_next(iter);
+               if (!k)
+                       break;
+
+               if (bkey_cmp(&START_KEY(k), insert) >= 0) {
+                       if (KEY_SIZE(k))
+                               break;
+                       else
+                               continue;
+               }
+
+               if (bkey_cmp(k, &START_KEY(insert)) <= 0)
+                       continue;
+
+               old_offset = KEY_START(k);
+               old_size = KEY_SIZE(k);
+
+               /*
+                * We might overlap with 0 size extents; we can't skip these
+                * because if they're in the set we're inserting to, we have to
+                * adjust them so they don't overlap with the key we're
+                * inserting. But we don't want to check them for replace
+                * operations.
+                */
+
+               if (replace_key && KEY_SIZE(k)) {
+                       /*
+                        * k might have been split since we inserted/found the
+                        * key we're replacing
+                        */
+                       unsigned i;
+                       uint64_t offset = KEY_START(k) -
+                               KEY_START(replace_key);
+
+                       /* But it must be a subset of the replace key */
+                       if (KEY_START(k) < KEY_START(replace_key) ||
+                           KEY_OFFSET(k) > KEY_OFFSET(replace_key))
+                               goto check_failed;
+
+                       /* We didn't find a key that we were supposed to */
+                       if (KEY_START(k) > KEY_START(insert) + sectors_found)
+                               goto check_failed;
+
+                       if (!bch_bkey_equal_header(k, replace_key))
+                               goto check_failed;
+
+                       /* skip past gen */
+                       offset <<= 8;
+
+                       BUG_ON(!KEY_PTRS(replace_key));
+
+                       for (i = 0; i < KEY_PTRS(replace_key); i++)
+                               if (k->ptr[i] != replace_key->ptr[i] + offset)
+                                       goto check_failed;
+
+                       sectors_found = KEY_OFFSET(k) - KEY_START(insert);
+               }
+
+               if (bkey_cmp(insert, k) < 0 &&
+                   bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
+                       /*
+                        * We overlapped in the middle of an existing key: that
+                        * means we have to split the old key. But we have to do
+                        * slightly different things depending on whether the
+                        * old key has been written out yet.
+                        */
+
+                       struct bkey *top;
+
+                       subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
+
+                       if (bkey_written(b, k)) {
+                               /*
+                                * We insert a new key to cover the top of the
+                                * old key, and the old key is modified in place
+                                * to represent the bottom split.
+                                *
+                                * It's completely arbitrary whether the new key
+                                * is the top or the bottom, but it has to match
+                                * up with what btree_sort_fixup() does - it
+                                * doesn't check for this kind of overlap, it
+                                * depends on us inserting a new key for the top
+                                * here.
+                                */
+                               top = bch_bset_search(b, bset_tree_last(b),
+                                                     insert);
+                               bch_bset_insert(b, top, k);
+                       } else {
+                               BKEY_PADDED(key) temp;
+                               bkey_copy(&temp.key, k);
+                               bch_bset_insert(b, k, &temp.key);
+                               top = bkey_next(k);
+                       }
+
+                       bch_cut_front(insert, top);
+                       bch_cut_back(&START_KEY(insert), k);
+                       bch_bset_fix_invalidated_key(b, k);
+                       goto out;
+               }
+
+               if (bkey_cmp(insert, k) < 0) {
+                       bch_cut_front(insert, k);
+               } else {
+                       if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
+                               old_offset = KEY_START(insert);
+
+                       if (bkey_written(b, k) &&
+                           bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
+                               /*
+                                * Completely overwrote, so we don't have to
+                                * invalidate the binary search tree
+                                */
+                               bch_cut_front(k, k);
+                       } else {
+                               __bch_cut_back(&START_KEY(insert), k);
+                               bch_bset_fix_invalidated_key(b, k);
+                       }
+               }
+
+               subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
+       }
+
+check_failed:
+       if (replace_key) {
+               if (!sectors_found) {
+                       return true;
+               } else if (sectors_found < KEY_SIZE(insert)) {
+                       SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
+                                      (KEY_SIZE(insert) - sectors_found));
+                       SET_KEY_SIZE(insert, sectors_found);
+               }
+       }
+out:
+       if (KEY_DIRTY(insert))
+               bcache_dev_sectors_dirty_add(c, KEY_INODE(insert),
+                                            KEY_START(insert),
+                                            KEY_SIZE(insert));
+
+       return false;
+}
+
+static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
+{
+       struct btree *b = container_of(bk, struct btree, keys);
+       char buf[80];
+
+       if (!KEY_SIZE(k))
+               return true;
+
+       if (KEY_SIZE(k) > KEY_OFFSET(k))
+               goto bad;
+
+       if (__ptr_invalid(b->c, k))
+               goto bad;
+
+       return false;
+bad:
+       bch_extent_to_text(buf, sizeof(buf), k);
+       cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
+       return true;
+}
+
+static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
+                                    unsigned ptr)
+{
+       struct bucket *g = PTR_BUCKET(b->c, k, ptr);
+       char buf[80];
+
+       if (mutex_trylock(&b->c->bucket_lock)) {
+               if (b->c->gc_mark_valid &&
+                   ((GC_MARK(g) != GC_MARK_DIRTY &&
+                     KEY_DIRTY(k)) ||
+                    GC_MARK(g) == GC_MARK_METADATA))
+                       goto err;
+
+               if (g->prio == BTREE_PRIO)
+                       goto err;
+
+               mutex_unlock(&b->c->bucket_lock);
+       }
+
+       return false;
+err:
+       mutex_unlock(&b->c->bucket_lock);
+       bch_extent_to_text(buf, sizeof(buf), k);
+       btree_bug(b,
+"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
+                 buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
+                 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
+       return true;
+}
+
+static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
+{
+       struct btree *b = container_of(bk, struct btree, keys);
+       struct bucket *g;
+       unsigned i, stale;
+
+       if (!KEY_PTRS(k) ||
+           bch_extent_invalid(bk, k))
+               return true;
+
+       for (i = 0; i < KEY_PTRS(k); i++)
+               if (!ptr_available(b->c, k, i))
+                       return true;
+
+       if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
+               return false;
+
+       for (i = 0; i < KEY_PTRS(k); i++) {
+               g = PTR_BUCKET(b->c, k, i);
+               stale = ptr_stale(b->c, k, i);
+
+               btree_bug_on(stale > 96, b,
+                            "key too stale: %i, need_gc %u",
+                            stale, b->c->need_gc);
+
+               btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
+                            b, "stale dirty pointer");
+
+               if (stale)
+                       return true;
+
+               if (expensive_debug_checks(b->c) &&
+                   bch_extent_bad_expensive(b, k, i))
+                       return true;
+       }
+
+       return false;
+}
+
+static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
+{
+       return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
+               ~((uint64_t)1 << 63);
+}
+
+static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r)
+{
+       struct btree *b = container_of(bk, struct btree, keys);
+       unsigned i;
+
+       if (key_merging_disabled(b->c))
+               return false;
+
+       for (i = 0; i < KEY_PTRS(l); i++)
+               if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
+                   PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
+                       return false;
+
+       /* Keys with no pointers aren't restricted to one bucket and could
+        * overflow KEY_SIZE
+        */
+       if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
+               SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
+               SET_KEY_SIZE(l, USHRT_MAX);
+
+               bch_cut_front(l, r);
+               return false;
+       }
+
+       if (KEY_CSUM(l)) {
+               if (KEY_CSUM(r))
+                       l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
+               else
+                       SET_KEY_CSUM(l, 0);
+       }
+
+       SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
+       SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
+
+       return true;
+}
+
+const struct btree_keys_ops bch_extent_keys_ops = {
+       .sort_cmp       = bch_extent_sort_cmp,
+       .sort_fixup     = bch_extent_sort_fixup,
+       .insert_fixup   = bch_extent_insert_fixup,
+       .key_invalid    = bch_extent_invalid,
+       .key_bad        = bch_extent_bad,
+       .key_merge      = bch_extent_merge,
+       .key_to_text    = bch_extent_to_text,
+       .key_dump       = bch_bkey_dump,
+       .is_extents     = true,
+};
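The two const ops tables above are how the rest of the btree code now reaches
extent- vs. btree-pointer-specific behaviour: each node's struct btree_keys
carries a pointer to one of them (leaf nodes presumably get bch_extent_keys_ops,
interior nodes bch_btree_keys_ops, as seen where debug.c copies b->keys.ops),
and generic code dispatches through it. Roughly (editor's sketch; the helper
name is hypothetical, the real accessors live in bset.h):

	static inline bool example_key_bad(struct btree_keys *bk,
					   const struct bkey *k)
	{
		return bk->ops->key_bad(bk, k);
	}
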
diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h
new file mode 100644 (file)
index 0000000..e4e2340
--- /dev/null
@@ -0,0 +1,13 @@
+#ifndef _BCACHE_EXTENTS_H
+#define _BCACHE_EXTENTS_H
+
+extern const struct btree_keys_ops bch_btree_keys_ops;
+extern const struct btree_keys_ops bch_extent_keys_ops;
+
+struct bkey;
+struct cache_set;
+
+void bch_extent_to_text(char *, size_t, const struct bkey *);
+bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
+
+#endif /* _BCACHE_EXTENTS_H */
index 7eafdf09a0ae11cb1c4108926e4d1c51ac90c9f4..18039affc306b539e187b9b57ba3f1c3c3b95e3d 100644 (file)
@@ -44,11 +44,11 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list,
 
        closure_init_stack(&cl);
 
-       pr_debug("reading %llu", (uint64_t) bucket);
+       pr_debug("reading %u", bucket_index);
 
        while (offset < ca->sb.bucket_size) {
 reread:                left = ca->sb.bucket_size - offset;
-               len = min_t(unsigned, left, PAGE_SECTORS * 8);
+               len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS);
 
                bio_reset(bio);
                bio->bi_iter.bi_sector  = bucket + offset;
@@ -74,19 +74,28 @@ reread:             left = ca->sb.bucket_size - offset;
                        struct list_head *where;
                        size_t blocks, bytes = set_bytes(j);
 
-                       if (j->magic != jset_magic(&ca->sb))
+                       if (j->magic != jset_magic(&ca->sb)) {
+                               pr_debug("%u: bad magic", bucket_index);
                                return ret;
+                       }
 
-                       if (bytes > left << 9)
+                       if (bytes > left << 9 ||
+                           bytes > PAGE_SIZE << JSET_BITS) {
+                               pr_info("%u: too big, %zu bytes, offset %u",
+                                       bucket_index, bytes, offset);
                                return ret;
+                       }
 
                        if (bytes > len << 9)
                                goto reread;
 
-                       if (j->csum != csum_set(j))
+                       if (j->csum != csum_set(j)) {
+                               pr_info("%u: bad csum, %zu bytes, offset %u",
+                                       bucket_index, bytes, offset);
                                return ret;
+                       }
 
-                       blocks = set_blocks(j, ca->set);
+                       blocks = set_blocks(j, block_bytes(ca->set));
 
                        while (!list_empty(list)) {
                                i = list_first_entry(list,
@@ -275,7 +284,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
                }
 
                for (k = i->j.start;
-                    k < end(&i->j);
+                    k < bset_bkey_last(&i->j);
                     k = bkey_next(k)) {
                        unsigned j;
 
@@ -313,7 +322,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
                                 n, i->j.seq - 1, start, end);
 
                for (k = i->j.start;
-                    k < end(&i->j);
+                    k < bset_bkey_last(&i->j);
                     k = bkey_next(k)) {
                        trace_bcache_journal_replay_key(k);
 
@@ -555,6 +564,14 @@ static void journal_write_done(struct closure *cl)
        continue_at_nobarrier(cl, journal_write, system_wq);
 }
 
+static void journal_write_unlock(struct closure *cl)
+{
+       struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+
+       c->journal.io_in_flight = 0;
+       spin_unlock(&c->journal.lock);
+}
+
 static void journal_write_unlocked(struct closure *cl)
        __releases(c->journal.lock)
 {
@@ -562,22 +579,15 @@ static void journal_write_unlocked(struct closure *cl)
        struct cache *ca;
        struct journal_write *w = c->journal.cur;
        struct bkey *k = &c->journal.key;
-       unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;
+       unsigned i, sectors = set_blocks(w->data, block_bytes(c)) *
+               c->sb.block_size;
 
        struct bio *bio;
        struct bio_list list;
        bio_list_init(&list);
 
        if (!w->need_write) {
-               /*
-                * XXX: have to unlock closure before we unlock journal lock,
-                * else we race with bch_journal(). But this way we race
-                * against cache set unregister. Doh.
-                */
-               set_closure_fn(cl, NULL, NULL);
-               closure_sub(cl, CLOSURE_RUNNING + 1);
-               spin_unlock(&c->journal.lock);
-               return;
+               closure_return_with_destructor(cl, journal_write_unlock);
        } else if (journal_full(&c->journal)) {
                journal_reclaim(c);
                spin_unlock(&c->journal.lock);
@@ -586,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl)
                continue_at(cl, journal_write, system_wq);
        }
 
-       c->journal.blocks_free -= set_blocks(w->data, c);
+       c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
 
        w->data->btree_level = c->root->level;
 
@@ -653,10 +663,12 @@ static void journal_try_write(struct cache_set *c)
 
        w->need_write = true;
 
-       if (closure_trylock(cl, &c->cl))
-               journal_write_unlocked(cl);
-       else
+       if (!c->journal.io_in_flight) {
+               c->journal.io_in_flight = 1;
+               closure_call(cl, journal_write_unlocked, NULL, &c->cl);
+       } else {
                spin_unlock(&c->journal.lock);
+       }
 }
 
 static struct journal_write *journal_wait_for_write(struct cache_set *c,
@@ -664,6 +676,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
 {
        size_t sectors;
        struct closure cl;
+       bool wait = false;
 
        closure_init_stack(&cl);
 
@@ -673,16 +686,19 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
                struct journal_write *w = c->journal.cur;
 
                sectors = __set_blocks(w->data, w->data->keys + nkeys,
-                                      c) * c->sb.block_size;
+                                      block_bytes(c)) * c->sb.block_size;
 
                if (sectors <= min_t(size_t,
                                     c->journal.blocks_free * c->sb.block_size,
                                     PAGE_SECTORS << JSET_BITS))
                        return w;
 
-               /* XXX: tracepoint */
+               if (wait)
+                       closure_wait(&c->journal.wait, &cl);
+
                if (!journal_full(&c->journal)) {
-                       trace_bcache_journal_entry_full(c);
+                       if (wait)
+                               trace_bcache_journal_entry_full(c);
 
                        /*
                         * XXX: If we were inserting so many keys that they
@@ -692,12 +708,11 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
                         */
                        BUG_ON(!w->data->keys);
 
-                       closure_wait(&w->wait, &cl);
                        journal_try_write(c); /* unlocks */
                } else {
-                       trace_bcache_journal_full(c);
+                       if (wait)
+                               trace_bcache_journal_full(c);
 
-                       closure_wait(&c->journal.wait, &cl);
                        journal_reclaim(c);
                        spin_unlock(&c->journal.lock);
 
@@ -706,6 +721,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
 
                closure_sync(&cl);
                spin_lock(&c->journal.lock);
+               wait = true;
        }
 }
 
@@ -736,7 +752,7 @@ atomic_t *bch_journal(struct cache_set *c,
 
        w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
 
-       memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys));
+       memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys));
        w->data->keys += bch_keylist_nkeys(keys);
 
        ret = &fifo_back(&c->journal.pin);
@@ -780,7 +796,6 @@ int bch_journal_alloc(struct cache_set *c)
 {
        struct journal *j = &c->journal;
 
-       closure_init_unlocked(&j->io);
        spin_lock_init(&j->lock);
        INIT_DELAYED_WORK(&j->work, journal_write_work);
 
index a6472fda94b25c00a340bcfefd48b3ac0c762c6e..9180c44650759b61b843ea8fe763a38b62277cd4 100644 (file)
@@ -104,6 +104,7 @@ struct journal {
        /* used when waiting because the journal was full */
        struct closure_waitlist wait;
        struct closure          io;
+       int                     io_in_flight;
        struct delayed_work     work;
 
        /* Number of blocks free in the bucket(s) we're currently writing to */
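With closure_trylock()/closure_init_unlocked() removed from closure.h earlier in
this series, the journal now tracks "one write in flight" explicitly via this
flag. A rough paraphrase of the new journal_try_write() path shown above
(editor's condensation, not new code):

	/* called with c->journal.lock held; cl is the journal's embedded closure */
	w->need_write = true;

	if (!c->journal.io_in_flight) {
		c->journal.io_in_flight = 1;
		/* takes the running ref that closure_trylock() used to provide;
		 * journal_write_unlocked() releases c->journal.lock itself */
		closure_call(cl, journal_write_unlocked, NULL, &c->cl);
	} else {
		spin_unlock(&c->journal.lock);
	}
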
index 052bd24d24b42b42c3d434564a361dc3b693a2e5..9eb60d102de84532e3a662390b1ba2934b744673 100644 (file)
@@ -211,7 +211,7 @@ void bch_moving_gc(struct cache_set *c)
        for_each_cache(ca, c, i) {
                unsigned sectors_to_move = 0;
                unsigned reserve_sectors = ca->sb.bucket_size *
-                       min(fifo_used(&ca->free), ca->free.size / 2);
+                       fifo_used(&ca->free[RESERVE_MOVINGGC]);
 
                ca->heap.used = 0;
 
index c906571997d7a4ab256188f05f4a8c11ea5928f8..72cd213f213f9e806dc9a0360000ffffe6466896 100644 (file)
@@ -254,6 +254,24 @@ static void bch_data_insert_keys(struct closure *cl)
        closure_return(cl);
 }
 
+static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
+                              struct cache_set *c)
+{
+       size_t oldsize = bch_keylist_nkeys(l);
+       size_t newsize = oldsize + u64s;
+
+       /*
+        * The journalling code doesn't handle the case where the keys to
+        * insert are bigger than an empty write: if we just return -ENOMEM
+        * here, bch_data_insert_start() and bch_data_invalidate() will
+        * insert the keys created so far
+        * and finish the rest when the keylist is empty.
+        */
+       if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
+               return -ENOMEM;
+
+       return __bch_keylist_realloc(l, u64s);
+}
+
 static void bch_data_invalidate(struct closure *cl)
 {
        struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
@@ -266,7 +284,7 @@ static void bch_data_invalidate(struct closure *cl)
                unsigned sectors = min(bio_sectors(bio),
                                       1U << (KEY_SIZE_BITS - 1));
 
-               if (bch_keylist_realloc(&op->insert_keys, 0, op->c))
+               if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
                        goto out;
 
                bio->bi_iter.bi_sector  += sectors;
@@ -356,7 +374,7 @@ static void bch_data_insert_start(struct closure *cl)
 
                /* 1 for the device pointer and 1 for the chksum */
                if (bch_keylist_realloc(&op->insert_keys,
-                                       1 + (op->csum ? 1 : 0),
+                                       3 + (op->csum ? 1 : 0),
                                        op->c))
                        continue_at(cl, bch_data_insert_keys, bcache_wq);
 
@@ -596,14 +614,12 @@ struct search {
        /* Stack frame for bio_complete */
        struct closure          cl;
 
-       struct bcache_device    *d;
-
        struct bbio             bio;
        struct bio              *orig_bio;
        struct bio              *cache_miss;
+       struct bcache_device    *d;
 
        unsigned                insert_bio_sectors;
-
        unsigned                recoverable:1;
        unsigned                write:1;
        unsigned                read_dirty_data:1;
@@ -629,7 +645,8 @@ static void bch_cache_read_endio(struct bio *bio, int error)
 
        if (error)
                s->iop.error = error;
-       else if (ptr_stale(s->iop.c, &b->key, 0)) {
+       else if (!KEY_DIRTY(&b->key) &&
+                ptr_stale(s->iop.c, &b->key, 0)) {
                atomic_long_inc(&s->iop.c->cache_read_races);
                s->iop.error = -EINTR;
        }
@@ -710,10 +727,13 @@ static void cache_lookup(struct closure *cl)
 {
        struct search *s = container_of(cl, struct search, iop.cl);
        struct bio *bio = &s->bio.bio;
+       int ret;
+
+       bch_btree_op_init(&s->op, -1);
 
-       int ret = bch_btree_map_keys(&s->op, s->iop.c,
-                                    &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
-                                    cache_lookup_fn, MAP_END_KEY);
+       ret = bch_btree_map_keys(&s->op, s->iop.c,
+                                &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
+                                cache_lookup_fn, MAP_END_KEY);
        if (ret == -EAGAIN)
                continue_at(cl, cache_lookup, bcache_wq);
 
@@ -754,12 +774,12 @@ static void bio_complete(struct search *s)
        }
 }
 
-static void do_bio_hook(struct search *s)
+static void do_bio_hook(struct search *s, struct bio *orig_bio)
 {
        struct bio *bio = &s->bio.bio;
 
        bio_init(bio);
-       __bio_clone_fast(bio, s->orig_bio);
+       __bio_clone_fast(bio, orig_bio);
        bio->bi_end_io          = request_endio;
        bio->bi_private         = &s->cl;
 
@@ -778,26 +798,32 @@ static void search_free(struct closure *cl)
        mempool_free(s, s->d->c->search);
 }
 
-static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
+static inline struct search *search_alloc(struct bio *bio,
+                                         struct bcache_device *d)
 {
        struct search *s;
 
        s = mempool_alloc(d->c->search, GFP_NOIO);
-       memset(s, 0, offsetof(struct search, iop.insert_keys));
 
-       __closure_init(&s->cl, NULL);
+       closure_init(&s->cl, NULL);
+       do_bio_hook(s, bio);
 
-       s->iop.inode            = d->id;
-       s->iop.c                = d->c;
-       s->d                    = d;
-       s->op.lock              = -1;
-       s->iop.write_point      = hash_long((unsigned long) current, 16);
        s->orig_bio             = bio;
-       s->write                = (bio->bi_rw & REQ_WRITE) != 0;
-       s->iop.flush_journal    = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
+       s->cache_miss           = NULL;
+       s->d                    = d;
        s->recoverable          = 1;
+       s->write                = (bio->bi_rw & REQ_WRITE) != 0;
+       s->read_dirty_data      = 0;
        s->start_time           = jiffies;
-       do_bio_hook(s);
+
+       s->iop.c                = d->c;
+       s->iop.bio              = NULL;
+       s->iop.inode            = d->id;
+       s->iop.write_point      = hash_long((unsigned long) current, 16);
+       s->iop.write_prio       = 0;
+       s->iop.error            = 0;
+       s->iop.flags            = 0;
+       s->iop.flush_journal    = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
 
        return s;
 }
@@ -843,7 +869,7 @@ static void cached_dev_read_error(struct closure *cl)
                trace_bcache_read_retry(s->orig_bio);
 
                s->iop.error = 0;
-               do_bio_hook(s);
+               do_bio_hook(s, s->orig_bio);
 
                /* XXX: invalidate cache */
 
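
The resized reallocation requests above (0 becomes 2, 1 + csum becomes 3 + csum) follow from bch_keylist_realloc() now taking a count of 64-bit words instead of a pointer count; the extra two words line up with the two-word bkey header that precedes the pointers. A minimal stand-alone sketch of that arithmetic, assuming plain C11 userspace; key_u64s() and BKEY_HEADER_U64S are illustrative names, not from the patch:

#include <stdio.h>

#define BKEY_HEADER_U64S 2	/* a bkey is two 64-bit words before ptr[] */

/* Sizing a key request the way the hunks above now call bch_keylist_realloc(). */
static unsigned key_u64s(unsigned nptrs, unsigned csum)
{
	return BKEY_HEADER_U64S + nptrs + (csum ? 1 : 0);
}

int main(void)
{
	printf("invalidate key: %u u64s\n", key_u64s(0, 0));	/* the new "2" */
	printf("data key:       %u u64s\n", key_u64s(1, 1));	/* the new "3 + csum" */
	return 0;
}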
index 2cd65bf073c24689542d78f0ef316f81ed21ba1c..39f21dbedc38b43ec7bf4818cfca319d27d27433 100644 (file)
@@ -13,17 +13,22 @@ struct data_insert_op {
        uint16_t                write_prio;
        short                   error;
 
-       unsigned                bypass:1;
-       unsigned                writeback:1;
-       unsigned                flush_journal:1;
-       unsigned                csum:1;
+       union {
+               uint16_t        flags;
 
-       unsigned                replace:1;
-       unsigned                replace_collision:1;
+       struct {
+               unsigned        bypass:1;
+               unsigned        writeback:1;
+               unsigned        flush_journal:1;
+               unsigned        csum:1;
 
-       unsigned                insert_data_done:1;
+               unsigned        replace:1;
+               unsigned        replace_collision:1;
+
+               unsigned        insert_data_done:1;
+       };
+       };
 
-       /* Anything past this point won't get zeroed in search_alloc() */
        struct keylist          insert_keys;
        BKEY_PADDED(replace_key);
 };
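
The anonymous union above overlays the existing bitfields with a single 16-bit flags word, which is what lets search_alloc() in request.c drop the memset()/offsetof() trick and reset every flag with the single s->iop.flags = 0 store. A reduced stand-alone sketch of the same layout, assuming C11; the struct and field names here are illustrative, not the bcache ones:

#include <assert.h>
#include <stdint.h>

struct insert_flags {
	union {
		uint16_t flags;			/* whole-word view */
		struct {
			unsigned bypass:1;	/* per-flag views of the same storage */
			unsigned writeback:1;
			unsigned csum:1;
		};
	};
};

int main(void)
{
	struct insert_flags f;

	f.flags = 0;		/* clears all of the flags in one store */
	f.writeback = 1;
	assert(f.flags != 0);	/* the bitfields alias the flags word */
	return 0;
}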
index 93d593f957f662829c30feba2ea2804e88fa8aa8..24a3a1546caa7a4c8e9c0aa70c9461574f87a591 100644 (file)
@@ -9,6 +9,7 @@
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
+#include "extents.h"
 #include "request.h"
 #include "writeback.h"
 
@@ -225,7 +226,7 @@ static void write_bdev_super_endio(struct bio *bio, int error)
        struct cached_dev *dc = bio->bi_private;
        /* XXX: error checking */
 
-       closure_put(&dc->sb_write.cl);
+       closure_put(&dc->sb_write);
 }
 
 static void __write_super(struct cache_sb *sb, struct bio *bio)
@@ -263,12 +264,20 @@ static void __write_super(struct cache_sb *sb, struct bio *bio)
        submit_bio(REQ_WRITE, bio);
 }
 
+static void bch_write_bdev_super_unlock(struct closure *cl)
+{
+       struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
+
+       up(&dc->sb_write_mutex);
+}
+
 void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
 {
-       struct closure *cl = &dc->sb_write.cl;
+       struct closure *cl = &dc->sb_write;
        struct bio *bio = &dc->sb_bio;
 
-       closure_lock(&dc->sb_write, parent);
+       down(&dc->sb_write_mutex);
+       closure_init(cl, parent);
 
        bio_reset(bio);
        bio->bi_bdev    = dc->bdev;
@@ -278,7 +287,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
        closure_get(cl);
        __write_super(&dc->sb, bio);
 
-       closure_return(cl);
+       closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
 }
 
 static void write_super_endio(struct bio *bio, int error)
@@ -286,16 +295,24 @@ static void write_super_endio(struct bio *bio, int error)
        struct cache *ca = bio->bi_private;
 
        bch_count_io_errors(ca, error, "writing superblock");
-       closure_put(&ca->set->sb_write.cl);
+       closure_put(&ca->set->sb_write);
+}
+
+static void bcache_write_super_unlock(struct closure *cl)
+{
+       struct cache_set *c = container_of(cl, struct cache_set, sb_write);
+
+       up(&c->sb_write_mutex);
 }
 
 void bcache_write_super(struct cache_set *c)
 {
-       struct closure *cl = &c->sb_write.cl;
+       struct closure *cl = &c->sb_write;
        struct cache *ca;
        unsigned i;
 
-       closure_lock(&c->sb_write, &c->cl);
+       down(&c->sb_write_mutex);
+       closure_init(cl, &c->cl);
 
        c->sb.seq++;
 
@@ -317,7 +334,7 @@ void bcache_write_super(struct cache_set *c)
                __write_super(&ca->sb, bio);
        }
 
-       closure_return(cl);
+       closure_return_with_destructor(cl, bcache_write_super_unlock);
 }
 
 /* UUID io */
@@ -325,23 +342,31 @@ void bcache_write_super(struct cache_set *c)
 static void uuid_endio(struct bio *bio, int error)
 {
        struct closure *cl = bio->bi_private;
-       struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl);
+       struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
 
        cache_set_err_on(error, c, "accessing uuids");
        bch_bbio_free(bio, c);
        closure_put(cl);
 }
 
+static void uuid_io_unlock(struct closure *cl)
+{
+       struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
+
+       up(&c->uuid_write_mutex);
+}
+
 static void uuid_io(struct cache_set *c, unsigned long rw,
                    struct bkey *k, struct closure *parent)
 {
-       struct closure *cl = &c->uuid_write.cl;
+       struct closure *cl = &c->uuid_write;
        struct uuid_entry *u;
        unsigned i;
        char buf[80];
 
        BUG_ON(!parent);
-       closure_lock(&c->uuid_write, parent);
+       down(&c->uuid_write_mutex);
+       closure_init(cl, parent);
 
        for (i = 0; i < KEY_PTRS(k); i++) {
                struct bio *bio = bch_bbio_alloc(c);
@@ -359,7 +384,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
                        break;
        }
 
-       bch_bkey_to_text(buf, sizeof(buf), k);
+       bch_extent_to_text(buf, sizeof(buf), k);
        pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);
 
        for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
@@ -368,14 +393,14 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
                                 u - c->uuids, u->uuid, u->label,
                                 u->first_reg, u->last_reg, u->invalidated);
 
-       closure_return(cl);
+       closure_return_with_destructor(cl, uuid_io_unlock);
 }
 
 static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
 {
        struct bkey *k = &j->uuid_bucket;
 
-       if (bch_btree_ptr_invalid(c, k))
+       if (__bch_btree_ptr_invalid(c, k))
                return "bad uuid pointer";
 
        bkey_copy(&c->uuid_bucket, k);
@@ -420,7 +445,7 @@ static int __uuid_write(struct cache_set *c)
 
        lockdep_assert_held(&bch_register_lock);
 
-       if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true))
+       if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
                return 1;
 
        SET_KEY_SIZE(&k.key, c->sb.bucket_size);
@@ -538,8 +563,8 @@ void bch_prio_write(struct cache *ca)
        atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
                        &ca->meta_sectors_written);
 
-       pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
-                fifo_used(&ca->free_inc), fifo_used(&ca->unused));
+       //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
+       //       fifo_used(&ca->free_inc), fifo_used(&ca->unused));
 
        for (i = prio_buckets(ca) - 1; i >= 0; --i) {
                long bucket;
@@ -558,7 +583,7 @@ void bch_prio_write(struct cache *ca)
                p->magic        = pset_magic(&ca->sb);
                p->csum         = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
 
-               bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true);
+               bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
                BUG_ON(bucket == -1);
 
                mutex_unlock(&ca->set->bucket_lock);
@@ -1098,7 +1123,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
        set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
        kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
        INIT_WORK(&dc->detach, cached_dev_detach_finish);
-       closure_init_unlocked(&dc->sb_write);
+       sema_init(&dc->sb_write_mutex, 1);
        INIT_LIST_HEAD(&dc->io_lru);
        spin_lock_init(&dc->io_lock);
        bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
@@ -1110,6 +1135,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
                hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
        }
 
+       dc->disk.stripe_size = q->limits.io_opt >> 9;
+
+       if (dc->disk.stripe_size)
+               dc->partial_stripes_expensive =
+                       q->limits.raid_partial_stripes_expensive;
+
        ret = bcache_device_init(&dc->disk, block_size,
                         dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
        if (ret)
@@ -1321,8 +1352,8 @@ static void cache_set_free(struct closure *cl)
                if (ca)
                        kobject_put(&ca->kobj);
 
+       bch_bset_sort_state_free(&c->sort);
        free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
-       free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
 
        if (c->bio_split)
                bioset_free(c->bio_split);
@@ -1447,21 +1478,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
        c->block_bits           = ilog2(sb->block_size);
        c->nr_uuids             = bucket_bytes(c) / sizeof(struct uuid_entry);
 
-       c->btree_pages          = c->sb.bucket_size / PAGE_SECTORS;
+       c->btree_pages          = bucket_pages(c);
        if (c->btree_pages > BTREE_MAX_PAGES)
                c->btree_pages = max_t(int, c->btree_pages / 4,
                                       BTREE_MAX_PAGES);
 
-       c->sort_crit_factor = int_sqrt(c->btree_pages);
-
-       closure_init_unlocked(&c->sb_write);
+       sema_init(&c->sb_write_mutex, 1);
        mutex_init(&c->bucket_lock);
        init_waitqueue_head(&c->try_wait);
        init_waitqueue_head(&c->bucket_wait);
-       closure_init_unlocked(&c->uuid_write);
-       mutex_init(&c->sort_lock);
+       sema_init(&c->uuid_write_mutex, 1);
 
-       spin_lock_init(&c->sort_time.lock);
        spin_lock_init(&c->btree_gc_time.lock);
        spin_lock_init(&c->btree_split_time.lock);
        spin_lock_init(&c->btree_read_time.lock);
@@ -1489,11 +1516,11 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
                                bucket_pages(c))) ||
            !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
            !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
-           !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
            !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
            bch_journal_alloc(c) ||
            bch_btree_cache_alloc(c) ||
-           bch_open_buckets_alloc(c))
+           bch_open_buckets_alloc(c) ||
+           bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
                goto err;
 
        c->congested_read_threshold_us  = 2000;
@@ -1549,7 +1576,7 @@ static void run_cache_set(struct cache_set *c)
                k = &j->btree_root;
 
                err = "bad btree root";
-               if (bch_btree_ptr_invalid(c, k))
+               if (__bch_btree_ptr_invalid(c, k))
                        goto err;
 
                err = "error reading btree root";
@@ -1743,6 +1770,7 @@ err:
 void bch_cache_release(struct kobject *kobj)
 {
        struct cache *ca = container_of(kobj, struct cache, kobj);
+       unsigned i;
 
        if (ca->set)
                ca->set->cache[ca->sb.nr_this_dev] = NULL;
@@ -1756,7 +1784,9 @@ void bch_cache_release(struct kobject *kobj)
        free_heap(&ca->heap);
        free_fifo(&ca->unused);
        free_fifo(&ca->free_inc);
-       free_fifo(&ca->free);
+
+       for (i = 0; i < RESERVE_NR; i++)
+               free_fifo(&ca->free[i]);
 
        if (ca->sb_bio.bi_inline_vecs[0].bv_page)
                put_page(ca->sb_bio.bi_io_vec[0].bv_page);
@@ -1782,10 +1812,12 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
        ca->journal.bio.bi_max_vecs = 8;
        ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
 
-       free = roundup_pow_of_two(ca->sb.nbuckets) >> 9;
-       free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2);
+       free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
 
-       if (!init_fifo(&ca->free,       free, GFP_KERNEL) ||
+       if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
+           !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
+           !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
+           !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
            !init_fifo(&ca->free_inc,   free << 2, GFP_KERNEL) ||
            !init_fifo(&ca->unused,     free << 2, GFP_KERNEL) ||
            !init_heap(&ca->heap,       free << 3, GFP_KERNEL) ||
@@ -2030,7 +2062,8 @@ static void bcache_exit(void)
                kobject_put(bcache_kobj);
        if (bcache_wq)
                destroy_workqueue(bcache_wq);
-       unregister_blkdev(bcache_major, "bcache");
+       if (bcache_major)
+               unregister_blkdev(bcache_major, "bcache");
        unregister_reboot_notifier(&reboot);
 }
 
index a1f85612f0b3dfc5c90b768aaef48118034c02c7..c6ab69333a6dfde7761e9e2b05c16574f5c52480 100644 (file)
@@ -102,7 +102,6 @@ rw_attribute(bypass_torture_test);
 rw_attribute(key_merging_disabled);
 rw_attribute(gc_always_rewrite);
 rw_attribute(expensive_debug_checks);
-rw_attribute(freelist_percent);
 rw_attribute(cache_replacement_policy);
 rw_attribute(btree_shrinker_disabled);
 rw_attribute(copy_gc_enabled);
@@ -401,6 +400,48 @@ static struct attribute *bch_flash_dev_files[] = {
 };
 KTYPE(bch_flash_dev);
 
+struct bset_stats_op {
+       struct btree_op op;
+       size_t nodes;
+       struct bset_stats stats;
+};
+
+static int btree_bset_stats(struct btree_op *b_op, struct btree *b)
+{
+       struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op);
+
+       op->nodes++;
+       bch_btree_keys_stats(&b->keys, &op->stats);
+
+       return MAP_CONTINUE;
+}
+
+int bch_bset_print_stats(struct cache_set *c, char *buf)
+{
+       struct bset_stats_op op;
+       int ret;
+
+       memset(&op, 0, sizeof(op));
+       bch_btree_op_init(&op.op, -1);
+
+       ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats);
+       if (ret < 0)
+               return ret;
+
+       return snprintf(buf, PAGE_SIZE,
+                       "btree nodes:           %zu\n"
+                       "written sets:          %zu\n"
+                       "unwritten sets:                %zu\n"
+                       "written key bytes:     %zu\n"
+                       "unwritten key bytes:   %zu\n"
+                       "floats:                        %zu\n"
+                       "failed:                        %zu\n",
+                       op.nodes,
+                       op.stats.sets_written, op.stats.sets_unwritten,
+                       op.stats.bytes_written, op.stats.bytes_unwritten,
+                       op.stats.floats, op.stats.failed);
+}
+
 SHOW(__bch_cache_set)
 {
        unsigned root_usage(struct cache_set *c)
@@ -419,7 +460,7 @@ lock_root:
                        rw_lock(false, b, b->level);
                } while (b != c->root);
 
-               for_each_key_filter(b, k, &iter, bch_ptr_bad)
+               for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
                        bytes += bkey_bytes(k);
 
                rw_unlock(false, b);
@@ -434,7 +475,7 @@ lock_root:
 
                mutex_lock(&c->bucket_lock);
                list_for_each_entry(b, &c->btree_cache, list)
-                       ret += 1 << (b->page_order + PAGE_SHIFT);
+                       ret += 1 << (b->keys.page_order + PAGE_SHIFT);
 
                mutex_unlock(&c->bucket_lock);
                return ret;
@@ -491,7 +532,7 @@ lock_root:
 
        sysfs_print_time_stats(&c->btree_gc_time,       btree_gc, sec, ms);
        sysfs_print_time_stats(&c->btree_split_time,    btree_split, sec, us);
-       sysfs_print_time_stats(&c->sort_time,           btree_sort, ms, us);
+       sysfs_print_time_stats(&c->sort.time,           btree_sort, ms, us);
        sysfs_print_time_stats(&c->btree_read_time,     btree_read, ms, us);
        sysfs_print_time_stats(&c->try_harder_time,     try_harder, ms, us);
 
@@ -711,9 +752,6 @@ SHOW(__bch_cache)
        sysfs_print(io_errors,
                    atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
 
-       sysfs_print(freelist_percent, ca->free.size * 100 /
-                   ((size_t) ca->sb.nbuckets));
-
        if (attr == &sysfs_cache_replacement_policy)
                return bch_snprint_string_list(buf, PAGE_SIZE,
                                               cache_replacement_policies,
@@ -820,32 +858,6 @@ STORE(__bch_cache)
                }
        }
 
-       if (attr == &sysfs_freelist_percent) {
-               DECLARE_FIFO(long, free);
-               long i;
-               size_t p = strtoul_or_return(buf);
-
-               p = clamp_t(size_t,
-                           ((size_t) ca->sb.nbuckets * p) / 100,
-                           roundup_pow_of_two(ca->sb.nbuckets) >> 9,
-                           ca->sb.nbuckets / 2);
-
-               if (!init_fifo_exact(&free, p, GFP_KERNEL))
-                       return -ENOMEM;
-
-               mutex_lock(&ca->set->bucket_lock);
-
-               fifo_move(&free, &ca->free);
-               fifo_swap(&free, &ca->free);
-
-               mutex_unlock(&ca->set->bucket_lock);
-
-               while (fifo_pop(&free, i))
-                       atomic_dec(&ca->buckets[i].pin);
-
-               free_fifo(&free);
-       }
-
        if (attr == &sysfs_clear_stats) {
                atomic_long_set(&ca->sectors_written, 0);
                atomic_long_set(&ca->btree_sectors_written, 0);
@@ -869,7 +881,6 @@ static struct attribute *bch_cache_files[] = {
        &sysfs_metadata_written,
        &sysfs_io_errors,
        &sysfs_clear_stats,
-       &sysfs_freelist_percent,
        &sysfs_cache_replacement_policy,
        NULL
 };
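
bch_bset_print_stats() above also shows the map-callback idiom used throughout this series: the generic btree_op sits at the top of a private struct, and the per-node callback recovers the outer struct with container_of() to accumulate its results. A reduced stand-alone sketch, assuming userspace C11; the types and the single-node "walk" are placeholders, not the bcache ones:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct btree_op { int dummy; };		/* stand-in for the generic op */

struct bset_stats_op {
	struct btree_op op;		/* embedded so callbacks can find us */
	size_t nodes;
};

static int btree_bset_stats(struct btree_op *b_op)
{
	struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op);

	op->nodes++;			/* accumulate per-node statistics */
	return 0;
}

int main(void)
{
	struct bset_stats_op op = { .nodes = 0 };

	btree_bset_stats(&op.op);	/* what the map-nodes walk does per node */
	printf("nodes: %zu\n", op.nodes);
	return 0;
}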
index 1030c6020e986934e21c94628794e5e342271b49..ac7d0d1f70d7be9ae818a51c6d77c461d770eb4b 100644 (file)
@@ -2,6 +2,7 @@
 #ifndef _BCACHE_UTIL_H
 #define _BCACHE_UTIL_H
 
+#include <linux/blkdev.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/llist.h>
@@ -17,11 +18,13 @@ struct closure;
 
 #ifdef CONFIG_BCACHE_DEBUG
 
+#define EBUG_ON(cond)                  BUG_ON(cond)
 #define atomic_dec_bug(v)      BUG_ON(atomic_dec_return(v) < 0)
 #define atomic_inc_bug(v, i)   BUG_ON(atomic_inc_return(v) <= i)
 
 #else /* DEBUG */
 
+#define EBUG_ON(cond)                  do { if (cond); } while (0)
 #define atomic_dec_bug(v)      atomic_dec(v)
 #define atomic_inc_bug(v, i)   atomic_inc(v)
 
@@ -391,6 +394,11 @@ struct time_stats {
 
 void bch_time_stats_update(struct time_stats *stats, uint64_t time);
 
+static inline unsigned local_clock_us(void)
+{
+       return local_clock() >> 10;
+}
+
 #define NSEC_PER_ns                    1L
 #define NSEC_PER_us                    NSEC_PER_USEC
 #define NSEC_PER_ms                    NSEC_PER_MSEC
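
The non-debug EBUG_ON() fallback above keeps the condition compiled and evaluated, so it stays type-checked and its operands are not flagged as unused, while emitting no check. A small stand-alone illustration, assuming userspace C; the macro body is copied from the hunk above and the rest is illustrative:

#include <stdio.h>

#define EBUG_ON(cond)	do { if (cond); } while (0)

int main(void)
{
	int nr_keys = 3;

	EBUG_ON(nr_keys < 0);		/* evaluated, but never triggers anything */
	printf("nr_keys = %d\n", nr_keys);
	return 0;
}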
index 67ca9c3d2939c5e4468d51f0ea0454dfdceac731..f1feadeb7bb2d1b68a6592d946516e4036c7e939 100644 (file)
@@ -6103,6 +6103,7 @@ static int run(struct mddev *mddev)
                blk_queue_io_min(mddev->queue, chunk_size);
                blk_queue_io_opt(mddev->queue, chunk_size *
                                 (conf->raid_disks - conf->max_degraded));
+               mddev->queue->limits.raid_partial_stripes_expensive = 1;
                /*
                 * We can only discard a whole stripe. It doesn't make sense to
                 * discard data disk but write parity disk
index 1c04ec66974e0329e57d09f85f597632b3b9a26d..651dba10b9c2b5468af528ad3114166b90d8c067 100644 (file)
@@ -1312,7 +1312,7 @@ static void bh_lru_install(struct buffer_head *bh)
                }
                while (out < BH_LRU_SIZE)
                        bhs[out++] = NULL;
-               memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
+               memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
        }
        bh_lru_unlock();
 
index 02cb6f0ea71d52a09c3b2147f2bbc91997904650..0375654adb28cb423911aa7afccce6eb1f015e14 100644 (file)
@@ -291,6 +291,7 @@ struct queue_limits {
        unsigned char           discard_misaligned;
        unsigned char           cluster;
        unsigned char           discard_zeroes_data;
+       unsigned char           raid_partial_stripes_expensive;
 };
 
 struct request_queue {
index 095c6e4fe1e87ea02e3c6d9eeecdb66ddc1a1367..7110897c3dfa595e385b8a946f7547ca05583fec 100644 (file)
@@ -247,7 +247,7 @@ TRACE_EVENT(bcache_btree_write,
        TP_fast_assign(
                __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0);
                __entry->block  = b->written;
-               __entry->keys   = b->sets[b->nsets].data->keys;
+               __entry->keys   = b->keys.set[b->keys.nsets].data->keys;
        ),
 
        TP_printk("bucket %zu", __entry->bucket)
@@ -411,7 +411,7 @@ TRACE_EVENT(bcache_alloc_invalidate,
        ),
 
        TP_fast_assign(
-               __entry->free           = fifo_used(&ca->free);
+               __entry->free           = fifo_used(&ca->free[RESERVE_NONE]);
                __entry->free_inc       = fifo_used(&ca->free_inc);
                __entry->free_inc_size  = ca->free_inc.size;
                __entry->unused         = fifo_used(&ca->unused);
@@ -422,8 +422,8 @@ TRACE_EVENT(bcache_alloc_invalidate,
 );
 
 TRACE_EVENT(bcache_alloc_fail,
-       TP_PROTO(struct cache *ca),
-       TP_ARGS(ca),
+       TP_PROTO(struct cache *ca, unsigned reserve),
+       TP_ARGS(ca, reserve),
 
        TP_STRUCT__entry(
                __field(unsigned,       free                    )
@@ -433,7 +433,7 @@ TRACE_EVENT(bcache_alloc_fail,
        ),
 
        TP_fast_assign(
-               __entry->free           = fifo_used(&ca->free);
+               __entry->free           = fifo_used(&ca->free[reserve]);
                __entry->free_inc       = fifo_used(&ca->free_inc);
                __entry->unused         = fifo_used(&ca->unused);
                __entry->blocked        = atomic_read(&ca->set->prio_blocked);
index 164a7e2639886333bded7e9e0c5948c4c0c06da7..22b6ad31c706dae59544286faea5808d7b303342 100644 (file)
@@ -39,6 +39,7 @@ static inline void SET_##name(struct bkey *k, unsigned i, __u64 v)    \
 }
 
 #define KEY_SIZE_BITS          16
+#define KEY_MAX_U64S           8
 
 KEY_FIELD(KEY_PTRS,    high, 60, 3)
 KEY_FIELD(HEADER_SIZE, high, 58, 2)
@@ -118,7 +119,7 @@ static inline struct bkey *bkey_next(const struct bkey *k)
        return (struct bkey *) (d + bkey_u64s(k));
 }
 
-static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys)
+static inline struct bkey *bkey_idx(const struct bkey *k, unsigned nr_keys)
 {
        __u64 *d = (void *) k;
        return (struct bkey *) (d + nr_keys);
index f1f3dd5981b28ff73d5e52c2fb1606d174c3c5dd..84c517cbce902df6ba75be909c016b2da2fca3d5 100644 (file)
@@ -185,7 +185,8 @@ enum {
                                 * to clear media change status */
        FD_UNUSED_BIT,
        FD_DISK_CHANGED_BIT,    /* disk has been changed since last i/o */
-       FD_DISK_WRITABLE_BIT    /* disk is writable */
+       FD_DISK_WRITABLE_BIT,   /* disk is writable */
+       FD_OPEN_SHOULD_FAIL_BIT
 };
 
 #define FDSETDRVPRM _IOW(2, 0x90, struct floppy_drive_params)