Pileus Git - ~andy/linux/blob - fs/xfs/linux-2.6/xfs_aops.c

   1 /*
   2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include "xfs.h"
  19 #include "xfs_bit.h"
  20 #include "xfs_log.h"
  21 #include "xfs_inum.h"
  22 #include "xfs_sb.h"
  23 #include "xfs_ag.h"
  24 #include "xfs_dir2.h"
  25 #include "xfs_trans.h"
  26 #include "xfs_dmapi.h"
  27 #include "xfs_mount.h"
  28 #include "xfs_bmap_btree.h"
  29 #include "xfs_alloc_btree.h"
  30 #include "xfs_ialloc_btree.h"
  31 #include "xfs_dir2_sf.h"
  32 #include "xfs_attr_sf.h"
  33 #include "xfs_dinode.h"
  34 #include "xfs_inode.h"
  35 #include "xfs_alloc.h"
  36 #include "xfs_btree.h"
  37 #include "xfs_error.h"
  38 #include "xfs_rw.h"
  39 #include "xfs_iomap.h"
  40 #include "xfs_vnodeops.h"
  41 #include <linux/mpage.h>
  42 #include <linux/pagevec.h>
  43 #include <linux/writeback.h>
  44
  45 STATIC void
  46 xfs_count_page_state(
  47         struct page             *page,
  48         int                     *delalloc,
  49         int                     *unmapped,
  50         int                     *unwritten)
  51 {
  52         struct buffer_head      *bh, *head;
  53
  54         *delalloc = *unmapped = *unwritten = 0;
  55
  56         bh = head = page_buffers(page);
  57         do {
  58                 if (buffer_uptodate(bh) && !buffer_mapped(bh))
  59                         (*unmapped) = 1;
  60                 else if (buffer_unwritten(bh))
  61                         (*unwritten) = 1;
  62                 else if (buffer_delay(bh))
  63                         (*delalloc) = 1;
  64         } while ((bh = bh->b_this_page) != head);
  65 }
  66
  67 #if defined(XFS_RW_TRACE)
  68 void
  69 xfs_page_trace(
  70         int             tag,
  71         struct inode    *inode,
  72         struct page     *page,
  73         unsigned long   pgoff)
  74 {
  75         xfs_inode_t     *ip;
  76         loff_t          isize = i_size_read(inode);
  77         loff_t          offset = page_offset(page);
  78         int             delalloc = -1, unmapped = -1, unwritten = -1;
  79
  80         if (page_has_buffers(page))
  81                 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
  82
  83         ip = XFS_I(inode);
  84         if (!ip->i_rwtrace)
  85                 return;
  86
  87         ktrace_enter(ip->i_rwtrace,
  88                 (void *)((unsigned long)tag),
  89                 (void *)ip,
  90                 (void *)inode,
  91                 (void *)page,
  92                 (void *)pgoff,
  93                 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
  94                 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
  95                 (void *)((unsigned long)((isize >> 32) & 0xffffffff)),
  96                 (void *)((unsigned long)(isize & 0xffffffff)),
  97                 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
  98                 (void *)((unsigned long)(offset & 0xffffffff)),
  99                 (void *)((unsigned long)delalloc),
 100                 (void *)((unsigned long)unmapped),
 101                 (void *)((unsigned long)unwritten),
 102                 (void *)((unsigned long)current_pid()),
 103                 (void *)NULL);
 104 }
 105 #else
 106 #define xfs_page_trace(tag, inode, page, pgoff)
 107 #endif
 108
 109 STATIC struct block_device *
 110 xfs_find_bdev_for_inode(
 111         struct xfs_inode        *ip)
 112 {
 113         struct xfs_mount        *mp = ip->i_mount;
 114
 115         if (XFS_IS_REALTIME_INODE(ip))
 116                 return mp->m_rtdev_targp->bt_bdev;
 117         else
 118                 return mp->m_ddev_targp->bt_bdev;
 119 }
 120
 121 /*
 122  * Schedule IO completion handling on a xfsdatad if this was
 123  * the final hold on this ioend. If we are asked to wait,
 124  * flush the workqueue.
 125  */
 126 STATIC void
 127 xfs_finish_ioend(
 128         xfs_ioend_t     *ioend,
 129         int             wait)
 130 {
 131         if (atomic_dec_and_test(&ioend->io_remaining)) {
 132                 queue_work(xfsdatad_workqueue, &ioend->io_work);
 133                 if (wait)
 134                         flush_workqueue(xfsdatad_workqueue);
 135         }
 136 }
 137
 138 /*
 139  * We're now finished for good with this ioend structure.
 140  * Update the page state via the associated buffer_heads,
 141  * release holds on the inode and bio, and finally free
 142  * up memory.  Do not use the ioend after this.
 143  */
 144 STATIC void
 145 xfs_destroy_ioend(
 146         xfs_ioend_t             *ioend)
 147 {
 148         struct buffer_head      *bh, *next;
 149
 150         for (bh = ioend->io_buffer_head; bh; bh = next) {
 151                 next = bh->b_private;
 152                 bh->b_end_io(bh, !ioend->io_error);
 153         }
 154         if (unlikely(ioend->io_error)) {
 155                 vn_ioerror(XFS_I(ioend->io_inode), ioend->io_error,
 156                                 __FILE__,__LINE__);
 157         }
 158         vn_iowake(XFS_I(ioend->io_inode));
 159         mempool_free(ioend, xfs_ioend_pool);
 160 }
 161
 162 /*
 163  * Update on-disk file size now that data has been written to disk.
 164  * The current in-memory file size is i_size.  If a write is beyond
 165  * eof i_new_size will be the intended file size until i_size is
 166  * updated.  If this write does not extend all the way to the valid
 167  * file size then restrict this update to the end of the write.
 168  */
 169 STATIC void
 170 xfs_setfilesize(
 171         xfs_ioend_t             *ioend)
 172 {
 173         xfs_inode_t             *ip = XFS_I(ioend->io_inode);
 174         xfs_fsize_t             isize;
 175         xfs_fsize_t             bsize;
 176
 177         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
 178         ASSERT(ioend->io_type != IOMAP_READ);
 179
 180         if (unlikely(ioend->io_error))
 181                 return;
 182
 183         bsize = ioend->io_offset + ioend->io_size;
 184
 185         xfs_ilock(ip, XFS_ILOCK_EXCL);
 186
 187         isize = MAX(ip->i_size, ip->i_new_size);
 188         isize = MIN(isize, bsize);
 189
 190         if (ip->i_d.di_size < isize) {
 191                 ip->i_d.di_size = isize;
 192                 ip->i_update_core = 1;
 193                 ip->i_update_size = 1;
 194                 xfs_mark_inode_dirty_sync(ip);
 195         }
 196
 197         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 198 }
 199
 200 /*
 201  * Buffered IO write completion for delayed allocate extents.
 202  */
 203 STATIC void
 204 xfs_end_bio_delalloc(
 205         struct work_struct      *work)
 206 {
 207         xfs_ioend_t             *ioend =
 208                 container_of(work, xfs_ioend_t, io_work);
 209
 210         xfs_setfilesize(ioend);
 211         xfs_destroy_ioend(ioend);
 212 }
 213
 214 /*
 215  * Buffered IO write completion for regular, written extents.
 216  */
 217 STATIC void
 218 xfs_end_bio_written(
 219         struct work_struct      *work)
 220 {
 221         xfs_ioend_t             *ioend =
 222                 container_of(work, xfs_ioend_t, io_work);
 223
 224         xfs_setfilesize(ioend);
 225         xfs_destroy_ioend(ioend);
 226 }
 227
 228 /*
 229  * IO write completion for unwritten extents.
 230  *
 231  * Issue transactions to convert a buffer range from unwritten
 232  * to written extents.
 233  */
 234 STATIC void
 235 xfs_end_bio_unwritten(
 236         struct work_struct      *work)
 237 {
 238         xfs_ioend_t             *ioend =
 239                 container_of(work, xfs_ioend_t, io_work);
 240         struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 241         xfs_off_t               offset = ioend->io_offset;
 242         size_t                  size = ioend->io_size;
 243
 244         if (likely(!ioend->io_error)) {
 245                 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 246                         int error;
 247                         error = xfs_iomap_write_unwritten(ip, offset, size);
 248                         if (error)
 249                                 ioend->io_error = error;
 250                 }
 251                 xfs_setfilesize(ioend);
 252         }
 253         xfs_destroy_ioend(ioend);
 254 }
 255
 256 /*
 257  * IO read completion for regular, written extents.
 258  */
 259 STATIC void
 260 xfs_end_bio_read(
 261         struct work_struct      *work)
 262 {
 263         xfs_ioend_t             *ioend =
 264                 container_of(work, xfs_ioend_t, io_work);
 265
 266         xfs_destroy_ioend(ioend);
 267 }
 268
 269 /*
 270  * Allocate and initialise an IO completion structure.
 271  * We need to track unwritten extent write completion here initially.
 272  * We'll need to extend this for updating the ondisk inode size later
 273  * (vs. incore size).
 274  */
 275 STATIC xfs_ioend_t *
 276 xfs_alloc_ioend(
 277         struct inode            *inode,
 278         unsigned int            type)
 279 {
 280         xfs_ioend_t             *ioend;
 281
 282         ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
 283
 284         /*
 285          * Set the count to 1 initially, which will prevent an I/O
 286          * completion callback from happening before we have started
 287          * all the I/O from calling the completion routine too early.
 288          */
 289         atomic_set(&ioend->io_remaining, 1);
 290         ioend->io_error = 0;
 291         ioend->io_list = NULL;
 292         ioend->io_type = type;
 293         ioend->io_inode = inode;
 294         ioend->io_buffer_head = NULL;
 295         ioend->io_buffer_tail = NULL;
 296         atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
 297         ioend->io_offset = 0;
 298         ioend->io_size = 0;
 299
 300         if (type == IOMAP_UNWRITTEN)
 301                 INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten);
 302         else if (type == IOMAP_DELAY)
 303                 INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc);
 304         else if (type == IOMAP_READ)
 305                 INIT_WORK(&ioend->io_work, xfs_end_bio_read);
 306         else
 307                 INIT_WORK(&ioend->io_work, xfs_end_bio_written);
 308
 309         return ioend;
 310 }
 311
 312 STATIC int
 313 xfs_map_blocks(
 314         struct inode            *inode,
 315         loff_t                  offset,
 316         ssize_t                 count,
 317         xfs_iomap_t             *mapp,
 318         int                     flags)
 319 {
 320         int                     nmaps = 1;
 321
 322         return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps);
 323 }
 324
 325 STATIC_INLINE int
 326 xfs_iomap_valid(
 327         xfs_iomap_t             *iomapp,
 328         loff_t                  offset)
 329 {
 330         return offset >= iomapp->iomap_offset &&
 331                 offset < iomapp->iomap_offset + iomapp->iomap_bsize;
 332 }
 333
 334 /*
 335  * BIO completion handler for buffered IO.
 336  */
 337 STATIC void
 338 xfs_end_bio(
 339         struct bio              *bio,
 340         int                     error)
 341 {
 342         xfs_ioend_t             *ioend = bio->bi_private;
 343
 344         ASSERT(atomic_read(&bio->bi_cnt) >= 1);
 345         ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
 346
 347         /* Toss bio and pass work off to an xfsdatad thread */
 348         bio->bi_private = NULL;
 349         bio->bi_end_io = NULL;
 350         bio_put(bio);
 351
 352         xfs_finish_ioend(ioend, 0);
 353 }
 354
 355 STATIC void
 356 xfs_submit_ioend_bio(
 357         xfs_ioend_t     *ioend,
 358         struct bio      *bio)
 359 {
 360         atomic_inc(&ioend->io_remaining);
 361
 362         bio->bi_private = ioend;
 363         bio->bi_end_io = xfs_end_bio;
 364
 365         submit_bio(WRITE, bio);
 366         ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
 367         bio_put(bio);
 368 }
 369
 370 STATIC struct bio *
 371 xfs_alloc_ioend_bio(
 372         struct buffer_head      *bh)
 373 {
 374         struct bio              *bio;
 375         int                     nvecs = bio_get_nr_vecs(bh->b_bdev);
 376
 377         do {
 378                 bio = bio_alloc(GFP_NOIO, nvecs);
 379                 nvecs >>= 1;
 380         } while (!bio);
 381
 382         ASSERT(bio->bi_private == NULL);
 383         bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 384         bio->bi_bdev = bh->b_bdev;
 385         bio_get(bio);
 386         return bio;
 387 }
 388
 389 STATIC void
 390 xfs_start_buffer_writeback(
 391         struct buffer_head      *bh)
 392 {
 393         ASSERT(buffer_mapped(bh));
 394         ASSERT(buffer_locked(bh));
 395         ASSERT(!buffer_delay(bh));
 396         ASSERT(!buffer_unwritten(bh));
 397
 398         mark_buffer_async_write(bh);
 399         set_buffer_uptodate(bh);
 400         clear_buffer_dirty(bh);
 401 }
 402
 403 STATIC void
 404 xfs_start_page_writeback(
 405         struct page             *page,
 406         int                     clear_dirty,
 407         int                     buffers)
 408 {
 409         ASSERT(PageLocked(page));
 410         ASSERT(!PageWriteback(page));
 411         if (clear_dirty)
 412                 clear_page_dirty_for_io(page);
 413         set_page_writeback(page);
 414         unlock_page(page);
 415         /* If no buffers on the page are to be written, finish it here */
 416         if (!buffers)
 417                 end_page_writeback(page);
 418 }
 419
 420 static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 421 {
 422         return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
 423 }
 424
 425 /*
 426  * Submit all of the bios for all of the ioends we have saved up, covering the
 427  * initial writepage page and also any probed pages.
 428  *
 429  * Because we may have multiple ioends spanning a page, we need to start
 430  * writeback on all the buffers before we submit them for I/O. If we mark the
 431  * buffers as we got, then we can end up with a page that only has buffers
 432  * marked async write and I/O complete on can occur before we mark the other
 433  * buffers async write.
 434  *
 435  * The end result of this is that we trip a bug in end_page_writeback() because
 436  * we call it twice for the one page as the code in end_buffer_async_write()
 437  * assumes that all buffers on the page are started at the same time.
 438  *
 439  * The fix is two passes across the ioend list - one to start writeback on the
 440  * buffer_heads, and then submit them for I/O on the second pass.
 441  */
 442 STATIC void
 443 xfs_submit_ioend(
 444         xfs_ioend_t             *ioend)
 445 {
 446         xfs_ioend_t             *head = ioend;
 447         xfs_ioend_t             *next;
 448         struct buffer_head      *bh;
 449         struct bio              *bio;
 450         sector_t                lastblock = 0;
 451
 452         /* Pass 1 - start writeback */
 453         do {
 454                 next = ioend->io_list;
 455                 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
 456                         xfs_start_buffer_writeback(bh);
 457                 }
 458         } while ((ioend = next) != NULL);
 459
 460         /* Pass 2 - submit I/O */
 461         ioend = head;
 462         do {
 463                 next = ioend->io_list;
 464                 bio = NULL;
 465
 466                 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
 467
 468                         if (!bio) {
 469  retry:
 470                                 bio = xfs_alloc_ioend_bio(bh);
 471                         } else if (bh->b_blocknr != lastblock + 1) {
 472                                 xfs_submit_ioend_bio(ioend, bio);
 473                                 goto retry;
 474                         }
 475
 476                         if (bio_add_buffer(bio, bh) != bh->b_size) {
 477                                 xfs_submit_ioend_bio(ioend, bio);
 478                                 goto retry;
 479                         }
 480
 481                         lastblock = bh->b_blocknr;
 482                 }
 483                 if (bio)
 484                         xfs_submit_ioend_bio(ioend, bio);
 485                 xfs_finish_ioend(ioend, 0);
 486         } while ((ioend = next) != NULL);
 487 }
 488
 489 /*
 490  * Cancel submission of all buffer_heads so far in this endio.
 491  * Toss the endio too.  Only ever called for the initial page
 492  * in a writepage request, so only ever one page.
 493  */
 494 STATIC void
 495 xfs_cancel_ioend(
 496         xfs_ioend_t             *ioend)
 497 {
 498         xfs_ioend_t             *next;
 499         struct buffer_head      *bh, *next_bh;
 500
 501         do {
 502                 next = ioend->io_list;
 503                 bh = ioend->io_buffer_head;
 504                 do {
 505                         next_bh = bh->b_private;
 506                         clear_buffer_async_write(bh);
 507                         unlock_buffer(bh);
 508                 } while ((bh = next_bh) != NULL);
 509
 510                 vn_iowake(XFS_I(ioend->io_inode));
 511                 mempool_free(ioend, xfs_ioend_pool);
 512         } while ((ioend = next) != NULL);
 513 }
 514
 515 /*
 516  * Test to see if we've been building up a completion structure for
 517  * earlier buffers -- if so, we try to append to this ioend if we
 518  * can, otherwise we finish off any current ioend and start another.
 519  * Return true if we've finished the given ioend.
 520  */
 521 STATIC void
 522 xfs_add_to_ioend(
 523         struct inode            *inode,
 524         struct buffer_head      *bh,
 525         xfs_off_t               offset,
 526         unsigned int            type,
 527         xfs_ioend_t             **result,
 528         int                     need_ioend)
 529 {
 530         xfs_ioend_t             *ioend = *result;
 531
 532         if (!ioend || need_ioend || type != ioend->io_type) {
 533                 xfs_ioend_t     *previous = *result;
 534
 535                 ioend = xfs_alloc_ioend(inode, type);
 536                 ioend->io_offset = offset;
 537                 ioend->io_buffer_head = bh;
 538                 ioend->io_buffer_tail = bh;
 539                 if (previous)
 540                         previous->io_list = ioend;
 541                 *result = ioend;
 542         } else {
 543                 ioend->io_buffer_tail->b_private = bh;
 544                 ioend->io_buffer_tail = bh;
 545         }
 546
 547         bh->b_private = NULL;
 548         ioend->io_size += bh->b_size;
 549 }
 550
 551 STATIC void
 552 xfs_map_buffer(
 553         struct buffer_head      *bh,
 554         xfs_iomap_t             *mp,
 555         xfs_off_t               offset,
 556         uint                    block_bits)
 557 {
 558         sector_t                bn;
 559
 560         ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL);
 561
 562         bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) +
 563               ((offset - mp->iomap_offset) >> block_bits);
 564
 565         ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME));
 566
 567         bh->b_blocknr = bn;
 568         set_buffer_mapped(bh);
 569 }
 570
 571 STATIC void
 572 xfs_map_at_offset(
 573         struct buffer_head      *bh,
 574         loff_t                  offset,
 575         int                     block_bits,
 576         xfs_iomap_t             *iomapp)
 577 {
 578         ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE));
 579         ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY));
 580
 581         lock_buffer(bh);
 582         xfs_map_buffer(bh, iomapp, offset, block_bits);
 583         bh->b_bdev = iomapp->iomap_target->bt_bdev;
 584         set_buffer_mapped(bh);
 585         clear_buffer_delay(bh);
 586         clear_buffer_unwritten(bh);
 587 }
 588
 589 /*
 590  * Look for a page at index that is suitable for clustering.
 591  */
 592 STATIC unsigned int
 593 xfs_probe_page(
 594         struct page             *page,
 595         unsigned int            pg_offset,
 596         int                     mapped)
 597 {
 598         int                     ret = 0;
 599
 600         if (PageWriteback(page))
 601                 return 0;
 602
 603         if (page->mapping && PageDirty(page)) {
 604                 if (page_has_buffers(page)) {
 605                         struct buffer_head      *bh, *head;
 606
 607                         bh = head = page_buffers(page);
 608                         do {
 609                                 if (!buffer_uptodate(bh))
 610                                         break;
 611                                 if (mapped != buffer_mapped(bh))
 612                                         break;
 613                                 ret += bh->b_size;
 614                                 if (ret >= pg_offset)
 615                                         break;
 616                         } while ((bh = bh->b_this_page) != head);
 617                 } else
 618                         ret = mapped ? 0 : PAGE_CACHE_SIZE;
 619         }
 620
 621         return ret;
 622 }
 623
 624 STATIC size_t
 625 xfs_probe_cluster(
 626         struct inode            *inode,
 627         struct page             *startpage,
 628         struct buffer_head      *bh,
 629         struct buffer_head      *head,
 630         int                     mapped)
 631 {
 632         struct pagevec          pvec;
 633         pgoff_t                 tindex, tlast, tloff;
 634         size_t                  total = 0;
 635         int                     done = 0, i;
 636
 637         /* First sum forwards in this page */
 638         do {
 639                 if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh)))
 640                         return total;
 641                 total += bh->b_size;
 642         } while ((bh = bh->b_this_page) != head);
 643
 644         /* if we reached the end of the page, sum forwards in following pages */
 645         tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
 646         tindex = startpage->index + 1;
 647
 648         /* Prune this back to avoid pathological behavior */
 649         tloff = min(tlast, startpage->index + 64);
 650
 651         pagevec_init(&pvec, 0);
 652         while (!done && tindex <= tloff) {
 653                 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
 654
 655                 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
 656                         break;
 657
 658                 for (i = 0; i < pagevec_count(&pvec); i++) {
 659                         struct page *page = pvec.pages[i];
 660                         size_t pg_offset, pg_len = 0;
 661
 662                         if (tindex == tlast) {
 663                                 pg_offset =
 664                                     i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
 665                                 if (!pg_offset) {
 666                                         done = 1;
 667                                         break;
 668                                 }
 669                         } else
 670                                 pg_offset = PAGE_CACHE_SIZE;
 671
 672                         if (page->index == tindex && trylock_page(page)) {
 673                                 pg_len = xfs_probe_page(page, pg_offset, mapped);
 674                                 unlock_page(page);
 675                         }
 676
 677                         if (!pg_len) {
 678                                 done = 1;
 679                                 break;
 680                         }
 681
 682                         total += pg_len;
 683                         tindex++;
 684                 }
 685
 686                 pagevec_release(&pvec);
 687                 cond_resched();
 688         }
 689
 690         return total;
 691 }
 692
 693 /*
 694  * Test if a given page is suitable for writing as part of an unwritten
 695  * or delayed allocate extent.
 696  */
 697 STATIC int
 698 xfs_is_delayed_page(
 699         struct page             *page,
 700         unsigned int            type)
 701 {
 702         if (PageWriteback(page))
 703                 return 0;
 704
 705         if (page->mapping && page_has_buffers(page)) {
 706                 struct buffer_head      *bh, *head;
 707                 int                     acceptable = 0;
 708
 709                 bh = head = page_buffers(page);
 710                 do {
 711                         if (buffer_unwritten(bh))
 712                                 acceptable = (type == IOMAP_UNWRITTEN);
 713                         else if (buffer_delay(bh))
 714                                 acceptable = (type == IOMAP_DELAY);
 715                         else if (buffer_dirty(bh) && buffer_mapped(bh))
 716                                 acceptable = (type == IOMAP_NEW);
 717                         else
 718                                 break;
 719                 } while ((bh = bh->b_this_page) != head);
 720
 721                 if (acceptable)
 722                         return 1;
 723         }
 724
 725         return 0;
 726 }
 727
 728 /*
 729  * Allocate & map buffers for page given the extent map. Write it out.
 730  * except for the original page of a writepage, this is called on
 731  * delalloc/unwritten pages only, for the original page it is possible
 732  * that the page has no mapping at all.
 733  */
 734 STATIC int
 735 xfs_convert_page(
 736         struct inode            *inode,
 737         struct page             *page,
 738         loff_t                  tindex,
 739         xfs_iomap_t             *mp,
 740         xfs_ioend_t             **ioendp,
 741         struct writeback_control *wbc,
 742         int                     startio,
 743         int                     all_bh)
 744 {
 745         struct buffer_head      *bh, *head;
 746         xfs_off_t               end_offset;
 747         unsigned long           p_offset;
 748         unsigned int            type;
 749         int                     bbits = inode->i_blkbits;
 750         int                     len, page_dirty;
 751         int                     count = 0, done = 0, uptodate = 1;
 752         xfs_off_t               offset = page_offset(page);
 753
 754         if (page->index != tindex)
 755                 goto fail;
 756         if (!trylock_page(page))
 757                 goto fail;
 758         if (PageWriteback(page))
 759                 goto fail_unlock_page;
 760         if (page->mapping != inode->i_mapping)
 761                 goto fail_unlock_page;
 762         if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
 763                 goto fail_unlock_page;
 764
 765         /*
 766          * page_dirty is initially a count of buffers on the page before
 767          * EOF and is decremented as we move each into a cleanable state.
 768          *
 769          * Derivation:
 770          *
 771          * End offset is the highest offset that this page should represent.
 772          * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
 773          * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
 774          * hence give us the correct page_dirty count. On any other page,
 775          * it will be zero and in that case we need page_dirty to be the
 776          * count of buffers on the page.
 777          */
 778         end_offset = min_t(unsigned long long,
 779                         (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
 780                         i_size_read(inode));
 781
 782         len = 1 << inode->i_blkbits;
 783         p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
 784                                         PAGE_CACHE_SIZE);
 785         p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
 786         page_dirty = p_offset / len;
 787
 788         bh = head = page_buffers(page);
 789         do {
 790                 if (offset >= end_offset)
 791                         break;
 792                 if (!buffer_uptodate(bh))
 793                         uptodate = 0;
 794                 if (!(PageUptodate(page) || buffer_uptodate(bh))) {
 795                         done = 1;
 796                         continue;
 797                 }
 798
 799                 if (buffer_unwritten(bh) || buffer_delay(bh)) {
 800                         if (buffer_unwritten(bh))
 801                                 type = IOMAP_UNWRITTEN;
 802                         else
 803                                 type = IOMAP_DELAY;
 804
 805                         if (!xfs_iomap_valid(mp, offset)) {
 806                                 done = 1;
 807                                 continue;
 808                         }
 809
 810                         ASSERT(!(mp->iomap_flags & IOMAP_HOLE));
 811                         ASSERT(!(mp->iomap_flags & IOMAP_DELAY));
 812
 813                         xfs_map_at_offset(bh, offset, bbits, mp);
 814                         if (startio) {
 815                                 xfs_add_to_ioend(inode, bh, offset,
 816                                                 type, ioendp, done);
 817                         } else {
 818                                 set_buffer_dirty(bh);
 819                                 unlock_buffer(bh);
 820                                 mark_buffer_dirty(bh);
 821                         }
 822                         page_dirty--;
 823                         count++;
 824                 } else {
 825                         type = IOMAP_NEW;
 826                         if (buffer_mapped(bh) && all_bh && startio) {
 827                                 lock_buffer(bh);
 828                                 xfs_add_to_ioend(inode, bh, offset,
 829                                                 type, ioendp, done);
 830                                 count++;
 831                                 page_dirty--;
 832                         } else {
 833                                 done = 1;
 834                         }
 835                 }
 836         } while (offset += len, (bh = bh->b_this_page) != head);
 837
 838         if (uptodate && bh == head)
 839                 SetPageUptodate(page);
 840
 841         if (startio) {
 842                 if (count) {
 843                         struct backing_dev_info *bdi;
 844
 845                         bdi = inode->i_mapping->backing_dev_info;
 846                         wbc->nr_to_write--;
 847                         if (bdi_write_congested(bdi)) {
 848                                 wbc->encountered_congestion = 1;
 849                                 done = 1;
 850                         } else if (wbc->nr_to_write <= 0) {
 851                                 done = 1;
 852                         }
 853                 }
 854                 xfs_start_page_writeback(page, !page_dirty, count);
 855         }
 856
 857         return done;
 858  fail_unlock_page:
 859         unlock_page(page);
 860  fail:
 861         return 1;
 862 }
 863
 864 /*
 865  * Convert & write out a cluster of pages in the same extent as defined
 866  * by mp and following the start page.
 867  */
 868 STATIC void
 869 xfs_cluster_write(
 870         struct inode            *inode,
 871         pgoff_t                 tindex,
 872         xfs_iomap_t             *iomapp,
 873         xfs_ioend_t             **ioendp,
 874         struct writeback_control *wbc,
 875         int                     startio,
 876         int                     all_bh,
 877         pgoff_t                 tlast)
 878 {
 879         struct pagevec          pvec;
 880         int                     done = 0, i;
 881
 882         pagevec_init(&pvec, 0);
 883         while (!done && tindex <= tlast) {
 884                 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
 885
 886                 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
 887                         break;
 888
 889                 for (i = 0; i < pagevec_count(&pvec); i++) {
 890                         done = xfs_convert_page(inode, pvec.pages[i], tindex++,
 891                                         iomapp, ioendp, wbc, startio, all_bh);
 892                         if (done)
 893                                 break;
 894                 }
 895
 896                 pagevec_release(&pvec);
 897                 cond_resched();
 898         }
 899 }
 900
 901 /*
 902  * Calling this without startio set means we are being asked to make a dirty
 903  * page ready for freeing it's buffers.  When called with startio set then
 904  * we are coming from writepage.
 905  *
 906  * When called with startio set it is important that we write the WHOLE
 907  * page if possible.
 908  * The bh->b_state's cannot know if any of the blocks or which block for
 909  * that matter are dirty due to mmap writes, and therefore bh uptodate is
 910  * only valid if the page itself isn't completely uptodate.  Some layers
 911  * may clear the page dirty flag prior to calling write page, under the
 912  * assumption the entire page will be written out; by not writing out the
 913  * whole page the page can be reused before all valid dirty data is
 914  * written out.  Note: in the case of a page that has been dirty'd by
 915  * mapwrite and but partially setup by block_prepare_write the
 916  * bh->b_states's will not agree and only ones setup by BPW/BCW will have
 917  * valid state, thus the whole page must be written out thing.
 918  */
 919
 920 STATIC int
 921 xfs_page_state_convert(
 922         struct inode    *inode,
 923         struct page     *page,
 924         struct writeback_control *wbc,
 925         int             startio,
 926         int             unmapped) /* also implies page uptodate */
 927 {
 928         struct buffer_head      *bh, *head;
 929         xfs_iomap_t             iomap;
 930         xfs_ioend_t             *ioend = NULL, *iohead = NULL;
 931         loff_t                  offset;
 932         unsigned long           p_offset = 0;
 933         unsigned int            type;
 934         __uint64_t              end_offset;
 935         pgoff_t                 end_index, last_index, tlast;
 936         ssize_t                 size, len;
 937         int                     flags, err, iomap_valid = 0, uptodate = 1;
 938         int                     page_dirty, count = 0;
 939         int                     trylock = 0;
 940         int                     all_bh = unmapped;
 941
 942         if (startio) {
 943                 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
 944                         trylock |= BMAPI_TRYLOCK;
 945         }
 946
 947         /* Is this page beyond the end of the file? */
 948         offset = i_size_read(inode);
 949         end_index = offset >> PAGE_CACHE_SHIFT;
 950         last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
 951         if (page->index >= end_index) {
 952                 if ((page->index >= end_index + 1) ||
 953                     !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
 954                         if (startio)
 955                                 unlock_page(page);
 956                         return 0;
 957                 }
 958         }
 959
 960         /*
 961          * page_dirty is initially a count of buffers on the page before
 962          * EOF and is decremented as we move each into a cleanable state.
 963          *
 964          * Derivation:
 965          *
 966          * End offset is the highest offset that this page should represent.
 967          * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
 968          * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
 969          * hence give us the correct page_dirty count. On any other page,
 970          * it will be zero and in that case we need page_dirty to be the
 971          * count of buffers on the page.
 972          */
 973         end_offset = min_t(unsigned long long,
 974                         (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset);
 975         len = 1 << inode->i_blkbits;
 976         p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
 977                                         PAGE_CACHE_SIZE);
 978         p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
 979         page_dirty = p_offset / len;
 980
 981         bh = head = page_buffers(page);
 982         offset = page_offset(page);
 983         flags = BMAPI_READ;
 984         type = IOMAP_NEW;
 985
 986         /* TODO: cleanup count and page_dirty */
 987
 988         do {
 989                 if (offset >= end_offset)
 990                         break;
 991                 if (!buffer_uptodate(bh))
 992                         uptodate = 0;
 993                 if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) {
 994                         /*
 995                          * the iomap is actually still valid, but the ioend
 996                          * isn't.  shouldn't happen too often.
 997                          */
 998                         iomap_valid = 0;
 999                         continue;
1000                 }
1001
1002                 if (iomap_valid)
1003                         iomap_valid = xfs_iomap_valid(&iomap, offset);
1004
1005                 /*
1006                  * First case, map an unwritten extent and prepare for
1007                  * extent state conversion transaction on completion.
1008                  *
1009                  * Second case, allocate space for a delalloc buffer.
1010                  * We can return EAGAIN here in the release page case.
1011                  *
1012                  * Third case, an unmapped buffer was found, and we are
1013                  * in a path where we need to write the whole page out.
1014                  */
1015                 if (buffer_unwritten(bh) || buffer_delay(bh) ||
1016                     ((buffer_uptodate(bh) || PageUptodate(page)) &&
1017                      !buffer_mapped(bh) && (unmapped || startio))) {
1018                         int new_ioend = 0;
1019
1020                         /*
1021                          * Make sure we don't use a read-only iomap
1022                          */
1023                         if (flags == BMAPI_READ)
1024                                 iomap_valid = 0;
1025
1026                         if (buffer_unwritten(bh)) {
1027                                 type = IOMAP_UNWRITTEN;
1028                                 flags = BMAPI_WRITE | BMAPI_IGNSTATE;
1029                         } else if (buffer_delay(bh)) {
1030                                 type = IOMAP_DELAY;
1031                                 flags = BMAPI_ALLOCATE | trylock;
1032                         } else {
1033                                 type = IOMAP_NEW;
1034                                 flags = BMAPI_WRITE | BMAPI_MMAP;
1035                         }
1036
1037                         if (!iomap_valid) {
1038                                 /*
1039                                  * if we didn't have a valid mapping then we
1040                                  * need to ensure that we put the new mapping
1041                                  * in a new ioend structure. This needs to be
1042                                  * done to ensure that the ioends correctly
1043                                  * reflect the block mappings at io completion
1044                                  * for unwritten extent conversion.
1045                                  */
1046                                 new_ioend = 1;
1047                                 if (type == IOMAP_NEW) {
1048                                         size = xfs_probe_cluster(inode,
1049                                                         page, bh, head, 0);
1050                                 } else {
1051                                         size = len;
1052                                 }
1053
1054                                 err = xfs_map_blocks(inode, offset, size,
1055                                                 &iomap, flags);
1056                                 if (err)
1057                                         goto error;
1058                                 iomap_valid = xfs_iomap_valid(&iomap, offset);
1059                         }
1060                         if (iomap_valid) {
1061                                 xfs_map_at_offset(bh, offset,
1062                                                 inode->i_blkbits, &iomap);
1063                                 if (startio) {
1064                                         xfs_add_to_ioend(inode, bh, offset,
1065                                                         type, &ioend,
1066                                                         new_ioend);
1067                                 } else {
1068                                         set_buffer_dirty(bh);
1069                                         unlock_buffer(bh);
1070                                         mark_buffer_dirty(bh);
1071                                 }
1072                                 page_dirty--;
1073                                 count++;
1074                         }
1075                 } else if (buffer_uptodate(bh) && startio) {
1076                         /*
1077                          * we got here because the buffer is already mapped.
1078                          * That means it must already have extents allocated
1079                          * underneath it. Map the extent by reading it.
1080                          */
1081                         if (!iomap_valid || flags != BMAPI_READ) {
1082                                 flags = BMAPI_READ;
1083                                 size = xfs_probe_cluster(inode, page, bh,
1084                                                                 head, 1);
1085                                 err = xfs_map_blocks(inode, offset, size,
1086                                                 &iomap, flags);
1087                                 if (err)
1088                                         goto error;
1089                                 iomap_valid = xfs_iomap_valid(&iomap, offset);
1090                         }
1091
1092                         /*
1093                          * We set the type to IOMAP_NEW in case we are doing a
1094                          * small write at EOF that is extending the file but
1095                          * without needing an allocation. We need to update the
1096                          * file size on I/O completion in this case so it is
1097                          * the same case as having just allocated a new extent
1098                          * that we are writing into for the first time.
1099                          */
1100                         type = IOMAP_NEW;
1101                         if (trylock_buffer(bh)) {
1102                                 ASSERT(buffer_mapped(bh));
1103                                 if (iomap_valid)
1104                                         all_bh = 1;
1105                                 xfs_add_to_ioend(inode, bh, offset, type,
1106                                                 &ioend, !iomap_valid);
1107                                 page_dirty--;
1108                                 count++;
1109                         } else {
1110                                 iomap_valid = 0;
1111                         }
1112                 } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
1113                            (unmapped || startio)) {
1114                         iomap_valid = 0;
1115                 }
1116
1117                 if (!iohead)
1118                         iohead = ioend;
1119
1120         } while (offset += len, ((bh = bh->b_this_page) != head));
1121
1122         if (uptodate && bh == head)
1123                 SetPageUptodate(page);
1124
1125         if (startio)
1126                 xfs_start_page_writeback(page, 1, count);
1127
1128         if (ioend && iomap_valid) {
1129                 offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >>
1130                                         PAGE_CACHE_SHIFT;
1131                 tlast = min_t(pgoff_t, offset, last_index);
1132                 xfs_cluster_write(inode, page->index + 1, &iomap, &ioend,
1133                                         wbc, startio, all_bh, tlast);
1134         }
1135
1136         if (iohead)
1137                 xfs_submit_ioend(iohead);
1138
1139         return page_dirty;
1140
1141 error:
1142         if (iohead)
1143                 xfs_cancel_ioend(iohead);
1144
1145         /*
1146          * If it's delalloc and we have nowhere to put it,
1147          * throw it away, unless the lower layers told
1148          * us to try again.
1149          */
1150         if (err != -EAGAIN) {
1151                 if (!unmapped)
1152                         block_invalidatepage(page, 0);
1153                 ClearPageUptodate(page);
1154         }
1155         return err;
1156 }
1157
1158 /*
1159  * writepage: Called from one of two places:
1160  *
1161  * 1. we are flushing a delalloc buffer head.
1162  *
1163  * 2. we are writing out a dirty page. Typically the page dirty
1164  *    state is cleared before we get here. In this case is it
1165  *    conceivable we have no buffer heads.
1166  *
1167  * For delalloc space on the page we need to allocate space and
1168  * flush it. For unmapped buffer heads on the page we should
1169  * allocate space if the page is uptodate. For any other dirty
1170  * buffer heads on the page we should flush them.
1171  *
1172  * If we detect that a transaction would be required to flush
1173  * the page, we have to check the process flags first, if we
1174  * are already in a transaction or disk I/O during allocations
1175  * is off, we need to fail the writepage and redirty the page.
1176  */
1177
1178 STATIC int
1179 xfs_vm_writepage(
1180         struct page             *page,
1181         struct writeback_control *wbc)
1182 {
1183         int                     error;
1184         int                     need_trans;
1185         int                     delalloc, unmapped, unwritten;
1186         struct inode            *inode = page->mapping->host;
1187
1188         xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0);
1189
1190         /*
1191          * We need a transaction if:
1192          *  1. There are delalloc buffers on the page
1193          *  2. The page is uptodate and we have unmapped buffers
1194          *  3. The page is uptodate and we have no buffers
1195          *  4. There are unwritten buffers on the page
1196          */
1197
1198         if (!page_has_buffers(page)) {
1199                 unmapped = 1;
1200                 need_trans = 1;
1201         } else {
1202                 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1203                 if (!PageUptodate(page))
1204                         unmapped = 0;
1205                 need_trans = delalloc + unmapped + unwritten;
1206         }
1207
1208         /*
1209          * If we need a transaction and the process flags say
1210          * we are already in a transaction, or no IO is allowed
1211          * then mark the page dirty again and leave the page
1212          * as is.
1213          */
1214         if (current_test_flags(PF_FSTRANS) && need_trans)
1215                 goto out_fail;
1216
1217         /*
1218          * Delay hooking up buffer heads until we have
1219          * made our go/no-go decision.
1220          */
1221         if (!page_has_buffers(page))
1222                 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
1223
1224         /*
1225          * Convert delayed allocate, unwritten or unmapped space
1226          * to real space and flush out to disk.
1227          */
1228         error = xfs_page_state_convert(inode, page, wbc, 1, unmapped);
1229         if (error == -EAGAIN)
1230                 goto out_fail;
1231         if (unlikely(error < 0))
1232                 goto out_unlock;
1233
1234         return 0;
1235
1236 out_fail:
1237         redirty_page_for_writepage(wbc, page);
1238         unlock_page(page);
1239         return 0;
1240 out_unlock:
1241         unlock_page(page);
1242         return error;
1243 }
1244
1245 STATIC int
1246 xfs_vm_writepages(
1247         struct address_space    *mapping,
1248         struct writeback_control *wbc)
1249 {
1250         xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1251         return generic_writepages(mapping, wbc);
1252 }
1253
1254 /*
1255  * Called to move a page into cleanable state - and from there
1256  * to be released. Possibly the page is already clean. We always
1257  * have buffer heads in this call.
1258  *
1259  * Returns 0 if the page is ok to release, 1 otherwise.
1260  *
1261  * Possible scenarios are:
1262  *
1263  * 1. We are being called to release a page which has been written
1264  *    to via regular I/O. buffer heads will be dirty and possibly
1265  *    delalloc. If no delalloc buffer heads in this case then we
1266  *    can just return zero.
1267  *
1268  * 2. We are called to release a page which has been written via
1269  *    mmap, all we need to do is ensure there is no delalloc
1270  *    state in the buffer heads, if not we can let the caller
1271  *    free them and we should come back later via writepage.
1272  */
1273 STATIC int
1274 xfs_vm_releasepage(
1275         struct page             *page,
1276         gfp_t                   gfp_mask)
1277 {
1278         struct inode            *inode = page->mapping->host;
1279         int                     dirty, delalloc, unmapped, unwritten;
1280         struct writeback_control wbc = {
1281                 .sync_mode = WB_SYNC_ALL,
1282                 .nr_to_write = 1,
1283         };
1284
1285         xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, 0);
1286
1287         if (!page_has_buffers(page))
1288                 return 0;
1289
1290         xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1291         if (!delalloc && !unwritten)
1292                 goto free_buffers;
1293
1294         if (!(gfp_mask & __GFP_FS))
1295                 return 0;
1296
1297         /* If we are already inside a transaction or the thread cannot
1298          * do I/O, we cannot release this page.
1299          */
1300         if (current_test_flags(PF_FSTRANS))
1301                 return 0;
1302
1303         /*
1304          * Convert delalloc space to real space, do not flush the
1305          * data out to disk, that will be done by the caller.
1306          * Never need to allocate space here - we will always
1307          * come back to writepage in that case.
1308          */
1309         dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0);
1310         if (dirty == 0 && !unwritten)
1311                 goto free_buffers;
1312         return 0;
1313
1314 free_buffers:
1315         return try_to_free_buffers(page);
1316 }
1317
1318 STATIC int
1319 __xfs_get_blocks(
1320         struct inode            *inode,
1321         sector_t                iblock,
1322         struct buffer_head      *bh_result,
1323         int                     create,
1324         int                     direct,
1325         bmapi_flags_t           flags)
1326 {
1327         xfs_iomap_t             iomap;
1328         xfs_off_t               offset;
1329         ssize_t                 size;
1330         int                     niomap = 1;
1331         int                     error;
1332
1333         offset = (xfs_off_t)iblock << inode->i_blkbits;
1334         ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
1335         size = bh_result->b_size;
1336
1337         if (!create && direct && offset >= i_size_read(inode))
1338                 return 0;
1339
1340         error = xfs_iomap(XFS_I(inode), offset, size,
1341                              create ? flags : BMAPI_READ, &iomap, &niomap);
1342         if (error)
1343                 return -error;
1344         if (niomap == 0)
1345                 return 0;
1346
1347         if (iomap.iomap_bn != IOMAP_DADDR_NULL) {
1348                 /*
1349                  * For unwritten extents do not report a disk address on
1350                  * the read case (treat as if we're reading into a hole).
1351                  */
1352                 if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) {
1353                         xfs_map_buffer(bh_result, &iomap, offset,
1354                                        inode->i_blkbits);
1355                 }
1356                 if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
1357                         if (direct)
1358                                 bh_result->b_private = inode;
1359                         set_buffer_unwritten(bh_result);
1360                 }
1361         }
1362
1363         /*
1364          * If this is a realtime file, data may be on a different device.
1365          * to that pointed to from the buffer_head b_bdev currently.
1366          */
1367         bh_result->b_bdev = iomap.iomap_target->bt_bdev;
1368
1369         /*
1370          * If we previously allocated a block out beyond eof and we are now
1371          * coming back to use it then we will need to flag it as new even if it
1372          * has a disk address.
1373          *
1374          * With sub-block writes into unwritten extents we also need to mark
1375          * the buffer as new so that the unwritten parts of the buffer gets
1376          * correctly zeroed.
1377          */
1378         if (create &&
1379             ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1380              (offset >= i_size_read(inode)) ||
1381              (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN))))
1382                 set_buffer_new(bh_result);
1383
1384         if (iomap.iomap_flags & IOMAP_DELAY) {
1385                 BUG_ON(direct);
1386                 if (create) {
1387                         set_buffer_uptodate(bh_result);
1388                         set_buffer_mapped(bh_result);
1389                         set_buffer_delay(bh_result);
1390                 }
1391         }
1392
1393         if (direct || size > (1 << inode->i_blkbits)) {
1394                 ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0);
1395                 offset = min_t(xfs_off_t,
1396                                 iomap.iomap_bsize - iomap.iomap_delta, size);
1397                 bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset);
1398         }
1399
1400         return 0;
1401 }
1402
1403 int
1404 xfs_get_blocks(
1405         struct inode            *inode,
1406         sector_t                iblock,
1407         struct buffer_head      *bh_result,
1408         int                     create)
1409 {
1410         return __xfs_get_blocks(inode, iblock,
1411                                 bh_result, create, 0, BMAPI_WRITE);
1412 }
1413
1414 STATIC int
1415 xfs_get_blocks_direct(
1416         struct inode            *inode,
1417         sector_t                iblock,
1418         struct buffer_head      *bh_result,
1419         int                     create)
1420 {
1421         return __xfs_get_blocks(inode, iblock,
1422                                 bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT);
1423 }
1424
1425 STATIC void
1426 xfs_end_io_direct(
1427         struct kiocb    *iocb,
1428         loff_t          offset,
1429         ssize_t         size,
1430         void            *private)
1431 {
1432         xfs_ioend_t     *ioend = iocb->private;
1433
1434         /*
1435          * Non-NULL private data means we need to issue a transaction to
1436          * convert a range from unwritten to written extents.  This needs
1437          * to happen from process context but aio+dio I/O completion
1438          * happens from irq context so we need to defer it to a workqueue.
1439          * This is not necessary for synchronous direct I/O, but we do
1440          * it anyway to keep the code uniform and simpler.
1441          *
1442          * Well, if only it were that simple. Because synchronous direct I/O
1443          * requires extent conversion to occur *before* we return to userspace,
1444          * we have to wait for extent conversion to complete. Look at the
1445          * iocb that has been passed to us to determine if this is AIO or
1446          * not. If it is synchronous, tell xfs_finish_ioend() to kick the
1447          * workqueue and wait for it to complete.
1448          *
1449          * The core direct I/O code might be changed to always call the
1450          * completion handler in the future, in which case all this can
1451          * go away.
1452          */
1453         ioend->io_offset = offset;
1454         ioend->io_size = size;
1455         if (ioend->io_type == IOMAP_READ) {
1456                 xfs_finish_ioend(ioend, 0);
1457         } else if (private && size > 0) {
1458                 xfs_finish_ioend(ioend, is_sync_kiocb(iocb));
1459         } else {
1460                 /*
1461                  * A direct I/O write ioend starts it's life in unwritten
1462                  * state in case they map an unwritten extent.  This write
1463                  * didn't map an unwritten extent so switch it's completion
1464                  * handler.
1465                  */
1466                 INIT_WORK(&ioend->io_work, xfs_end_bio_written);
1467                 xfs_finish_ioend(ioend, 0);
1468         }
1469
1470         /*
1471          * blockdev_direct_IO can return an error even after the I/O
1472          * completion handler was called.  Thus we need to protect
1473          * against double-freeing.
1474          */
1475         iocb->private = NULL;
1476 }
1477
1478 STATIC ssize_t
1479 xfs_vm_direct_IO(
1480         int                     rw,
1481         struct kiocb            *iocb,
1482         const struct iovec      *iov,
1483         loff_t                  offset,
1484         unsigned long           nr_segs)
1485 {
1486         struct file     *file = iocb->ki_filp;
1487         struct inode    *inode = file->f_mapping->host;
1488         struct block_device *bdev;
1489         ssize_t         ret;
1490
1491         bdev = xfs_find_bdev_for_inode(XFS_I(inode));
1492
1493         if (rw == WRITE) {
1494                 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
1495                 ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
1496                         bdev, iov, offset, nr_segs,
1497                         xfs_get_blocks_direct,
1498                         xfs_end_io_direct);
1499         } else {
1500                 iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
1501                 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
1502                         bdev, iov, offset, nr_segs,
1503                         xfs_get_blocks_direct,
1504                         xfs_end_io_direct);
1505         }
1506
1507         if (unlikely(ret != -EIOCBQUEUED && iocb->private))
1508                 xfs_destroy_ioend(iocb->private);
1509         return ret;
1510 }
1511
1512 STATIC int
1513 xfs_vm_write_begin(
1514         struct file             *file,
1515         struct address_space    *mapping,
1516         loff_t                  pos,
1517         unsigned                len,
1518         unsigned                flags,
1519         struct page             **pagep,
1520         void                    **fsdata)
1521 {
1522         *pagep = NULL;
1523         return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
1524                                                                 xfs_get_blocks);
1525 }
1526
1527 STATIC sector_t
1528 xfs_vm_bmap(
1529         struct address_space    *mapping,
1530         sector_t                block)
1531 {
1532         struct inode            *inode = (struct inode *)mapping->host;
1533         struct xfs_inode        *ip = XFS_I(inode);
1534
1535         xfs_itrace_entry(XFS_I(inode));
1536         xfs_ilock(ip, XFS_IOLOCK_SHARED);
1537         xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
1538         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1539         return generic_block_bmap(mapping, block, xfs_get_blocks);
1540 }
1541
1542 STATIC int
1543 xfs_vm_readpage(
1544         struct file             *unused,
1545         struct page             *page)
1546 {
1547         return mpage_readpage(page, xfs_get_blocks);
1548 }
1549
1550 STATIC int
1551 xfs_vm_readpages(
1552         struct file             *unused,
1553         struct address_space    *mapping,
1554         struct list_head        *pages,
1555         unsigned                nr_pages)
1556 {
1557         return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1558 }
1559
1560 STATIC void
1561 xfs_vm_invalidatepage(
1562         struct page             *page,
1563         unsigned long           offset)
1564 {
1565         xfs_page_trace(XFS_INVALIDPAGE_ENTER,
1566                         page->mapping->host, page, offset);
1567         block_invalidatepage(page, offset);
1568 }
1569
1570 const struct address_space_operations xfs_address_space_operations = {
1571         .readpage               = xfs_vm_readpage,
1572         .readpages              = xfs_vm_readpages,
1573         .writepage              = xfs_vm_writepage,
1574         .writepages             = xfs_vm_writepages,
1575         .sync_page              = block_sync_page,
1576         .releasepage            = xfs_vm_releasepage,
1577         .invalidatepage         = xfs_vm_invalidatepage,
1578         .write_begin            = xfs_vm_write_begin,
1579         .write_end              = generic_write_end,
1580         .bmap                   = xfs_vm_bmap,
1581         .direct_IO              = xfs_vm_direct_IO,
1582         .migratepage            = buffer_migrate_page,
1583 };