4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate a per-open-file ll_file_data from the ll_file_data_slab cache
 * (allocation allows I/O via __GFP_IO) and reset its write-failure flag.
 * NOTE(review): listing is truncated -- the allocation-failure check and
 * the return statement are not visible here; presumably returns fd or
 * NULL on allocation failure. Confirm against the full source.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
60 fd->fd_write_failed = false;
/* Return an ll_file_data to the ll_file_data_slab cache. */
64 static void ll_file_data_put(struct ll_file_data *fd)
67 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, a/m/ctime, size, blocks,
 * flags, ioepoch) plus the open handle @fh and an MDS capability into
 * @op_data, for transmission to the MDT.  If the inode is marked
 * LLIF_DATA_MODIFIED, also set MDS_DATA_MODIFIED in op_bias so the
 * server learns data was changed under this epoch.
 */
70 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
71 struct lustre_handle *fh)
73 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
74 op_data->op_attr.ia_mode = inode->i_mode;
75 op_data->op_attr.ia_atime = inode->i_atime;
76 op_data->op_attr.ia_mtime = inode->i_mtime;
77 op_data->op_attr.ia_ctime = inode->i_ctime;
78 op_data->op_attr.ia_size = i_size_read(inode);
79 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr, hence the cast. */
80 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
81 ll_inode_to_ext_flags(inode->i_flags);
82 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
84 op_data->op_handle = *fh;
/* NOTE(review): ll_mdscapa_get() presumably takes a capability ref
 * that md_op_data ownership releases later -- confirm in full source. */
85 op_data->op_capa1 = ll_mdscapa_get(inode);
87 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
88 op_data->op_bias |= MDS_DATA_MODIFIED;
92 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Prepare @op_data for an MDS close RPC: mark the mode/time attributes
 * valid, and -- for write opens on non-SOM setups or non-regular files --
 * also size/blocks.  Then close the I/O epoch and pack the inode state
 * plus the open handle into @op_data.
 * NOTE(review): listing truncated -- control flow between the FMODE_WRITE
 * test and the SOM branch is not fully visible; verify against full file.
 */
95 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
96 struct obd_client_handle *och)
98 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
99 ATTR_MTIME | ATTR_MTIME_SET |
100 ATTR_CTIME | ATTR_CTIME_SET;
102 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support the client sends size/blocks itself. */
105 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
106 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
108 ll_ioepoch_close(inode, op_data, &och, 0);
111 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
112 ll_prep_md_op_data(op_data, inode, NULL, NULL,
113 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS_CLOSE RPC for the open handle @och on @inode.
 *
 * If @data_version is non-NULL this close is an HSM release: the data
 * version and lease handle are packed and the MDT is asked to release
 * the file data (MDS_HSM_RELEASE).
 *
 * On a successful close this also: updates Size-on-MDS if the MDT asked
 * for it, clears LLIF_DATA_MODIFIED if that bias was acknowledged,
 * destroys orphan OST objects recorded in the close reply, and clears
 * the open-replay data before poisoning the handle cookie.
 * NOTE(review): listing truncated -- several error branches and the
 * epoch_close/rc declarations are not visible here.
 */
116 static int ll_close_inode_openhandle(struct obd_export *md_exp,
118 struct obd_client_handle *och,
119 const __u64 *data_version)
121 struct obd_export *exp = ll_i2mdexp(inode);
122 struct md_op_data *op_data;
123 struct ptlrpc_request *req = NULL;
124 struct obd_device *obd = class_exp2obd(exp);
130 * XXX: in case of LMV, is this correct to access
133 CERROR("Invalid MDC connection handle "LPX64"\n",
134 ll_i2mdexp(inode)->exp_handle.h_cookie);
138 OBD_ALLOC_PTR(op_data);
140 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
142 ll_prepare_close(inode, op_data, och);
143 if (data_version != NULL) {
144 /* Pass in data_version implies release. */
145 op_data->op_bias |= MDS_HSM_RELEASE;
146 op_data->op_data_version = *data_version;
147 op_data->op_lease_handle = och->och_lease_handle;
148 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
150 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
151 rc = md_close(md_exp, op_data, och->och_mod, &req);
153 /* This close must have the epoch closed. */
154 LASSERT(epoch_close);
155 /* MDS has instructed us to obtain Size-on-MDS attribute from
156 * OSTs and send setattr to back to MDS. */
157 rc = ll_som_update(inode, op_data);
159 CERROR("inode %lu mdc Size-on-MDS update failed: "
160 "rc = %d\n", inode->i_ino, rc);
164 CERROR("inode %lu mdc close failed: rc = %d\n",
168 /* DATA_MODIFIED flag was successfully sent on close, cancel data
169 * modification flag. */
170 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
171 struct ll_inode_info *lli = ll_i2info(inode);
173 spin_lock(&lli->lli_lock);
174 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
175 spin_unlock(&lli->lli_lock);
/* Destroy OST objects the MDT listed in the close reply (unlinked file). */
179 rc = ll_objects_destroy(req, inode);
181 CERROR("inode %lu ll_objects destroy: rc = %d\n",
184 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
185 struct mdt_body *body;
186 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 if (!(body->valid & OBD_MD_FLRELEASED))
191 ll_finish_md_op_data(op_data);
/* SOM epoch still open on a written regular file: defer DONE_WRITING. */
194 if (exp_connect_som(exp) && !epoch_close &&
195 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
196 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
198 md_clear_open_replay_data(md_exp, och);
199 /* Free @och if it is not waiting for DONE_WRITING. */
200 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
203 if (req) /* This is close request */
204 ptlrpc_req_finished(req);
/*
 * Really close the MDS open handle of the given mode (@flags is one of
 * FMODE_WRITE / FMODE_EXEC / FMODE_READ).  Under lli_och_mutex: if other
 * users of the handle remain it is kept; otherwise the cached handle is
 * taken and closed via ll_close_inode_openhandle().
 * NOTE(review): listing truncated -- the och/usecount swap under the
 * mutex and the return path are not fully visible.
 */
208 int ll_md_real_close(struct inode *inode, int flags)
210 struct ll_inode_info *lli = ll_i2info(inode);
211 struct obd_client_handle **och_p;
212 struct obd_client_handle *och;
/* Select the cached open handle slot matching the open mode. */
216 if (flags & FMODE_WRITE) {
217 och_p = &lli->lli_mds_write_och;
218 och_usecount = &lli->lli_open_fd_write_count;
219 } else if (flags & FMODE_EXEC) {
220 och_p = &lli->lli_mds_exec_och;
221 och_usecount = &lli->lli_open_fd_exec_count;
223 LASSERT(flags & FMODE_READ);
224 och_p = &lli->lli_mds_read_och;
225 och_usecount = &lli->lli_open_fd_read_count;
228 mutex_lock(&lli->lli_och_mutex);
229 if (*och_usecount) { /* There are still users of this handle, so
231 mutex_unlock(&lli->lli_och_mutex);
236 mutex_unlock(&lli->lli_och_mutex);
238 if (och) { /* There might be a race and somebody have freed this och
240 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close path: drop any group lock, clean up a
 * leaked lease or private open handle, decrement the per-mode open
 * counter, and -- unless a matching cached OPEN DLM lock lets us skip
 * the RPC -- call ll_md_real_close().  Finally detach and free the
 * ll_file_data and close the MDS capability.
 * NOTE(review): listing truncated -- lockmode selection and several
 * closing braces/returns are not visible here.
 */
247 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
250 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
251 struct ll_inode_info *lli = ll_i2info(inode);
254 /* clear group lock, if present */
255 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
256 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
258 if (fd->fd_lease_och != NULL) {
261 /* Usually the lease is not released when the
262 * application crashed, we need to release here. */
263 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
264 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
265 PFID(&lli->lli_fid), rc, lease_broken);
267 fd->fd_lease_och = NULL;
270 if (fd->fd_och != NULL) {
271 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
276 /* Let's see if we have good enough OPEN lock on the file and if
277 we can skip talking to MDS */
278 if (file->f_dentry->d_inode) { /* Can this ever be false? */
280 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
281 struct lustre_handle lockh;
282 struct inode *inode = file->f_dentry->d_inode;
283 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
285 mutex_lock(&lli->lli_och_mutex);
286 if (fd->fd_omode & FMODE_WRITE) {
288 LASSERT(lli->lli_open_fd_write_count);
289 lli->lli_open_fd_write_count--;
290 } else if (fd->fd_omode & FMODE_EXEC) {
292 LASSERT(lli->lli_open_fd_exec_count);
293 lli->lli_open_fd_exec_count--;
296 LASSERT(lli->lli_open_fd_read_count);
297 lli->lli_open_fd_read_count--;
299 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN ibits lock: must do the real MDS close. */
301 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
302 LDLM_IBITS, &policy, lockmode,
304 rc = ll_md_real_close(file->f_dentry->d_inode,
308 CERROR("Releasing a file %p with negative dentry %p. Name %s",
309 file, file->f_dentry, file->f_dentry->d_name.name);
313 LUSTRE_FPRIVATE(file) = NULL;
314 ll_file_data_put(fd);
315 ll_capa_close(inode);
320 /* While this returns an error code, fput() the caller does not, so we need
321 * to make every effort to clean up all of our state here. Also, applications
322 * rarely check close errors and even if an error is returned they will not
323 * re-try the close call.
/*
 * VFS ->release() handler: tear down remote-client ACL state for the
 * root inode, stop statahead if this fd started it, short-circuit the
 * root dentry (no MDS close needed), propagate async OST write errors,
 * then do the real per-fd close via ll_md_close().
 * NOTE(review): listing truncated -- early returns and brace closures
 * between the branches are not visible here.
 */
325 int ll_file_release(struct inode *inode, struct file *file)
327 struct ll_file_data *fd;
328 struct ll_sb_info *sbi = ll_i2sbi(inode);
329 struct ll_inode_info *lli = ll_i2info(inode);
332 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
333 inode->i_generation, inode);
335 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping only applies to the filesystem root. */
336 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
337 inode == inode->i_sb->s_root->d_inode) {
338 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
341 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
342 fd->fd_flags &= ~LL_FILE_RMTACL;
343 rct_del(&sbi->ll_rct, current_pid());
344 et_search_free(&sbi->ll_et, current_pid());
349 if (inode->i_sb->s_root != file->f_dentry)
350 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
351 fd = LUSTRE_FPRIVATE(file);
354 /* The last ref on @file, maybe not the the owner pid of statahead.
355 * Different processes can open the same dir, "ll_opendir_key" means:
356 * it is me that should stop the statahead thread. */
357 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
358 lli->lli_opendir_pid != 0)
359 ll_stop_statahead(inode, lli->lli_opendir_key);
/* Root was never opened on the MDS; just free the fd and return. */
361 if (inode->i_sb->s_root == file->f_dentry) {
362 LUSTRE_FPRIVATE(file) = NULL;
363 ll_file_data_put(fd);
/* Surface deferred async write errors from the OSTs on close. */
367 if (!S_ISDIR(inode->i_mode)) {
368 lov_read_and_clear_async_rc(lli->lli_clob);
369 lli->lli_async_rc = 0;
372 rc = ll_md_close(sbi->ll_md_exp, inode, file);
374 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
375 libcfs_debug_dumplog();
/*
 * Enqueue an open intent on the MDS for @file.  Requests an OPEN DLM
 * lock (unless only setting stripe info via @lmm/@lmmsize), opens by
 * FID, then fills the inode from the reply and attaches the lock data.
 * Returns 0 on success or a negative errno.
 * NOTE(review): listing truncated -- the out/error label structure and
 * the intermediate rc checks are not fully visible.
 */
380 static int ll_intent_file_open(struct file *file, void *lmm,
381 int lmmsize, struct lookup_intent *itp)
383 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
384 struct dentry *parent = file->f_dentry->d_parent;
385 const char *name = file->f_dentry->d_name.name;
386 const int len = file->f_dentry->d_name.len;
387 struct md_op_data *op_data;
388 struct ptlrpc_request *req;
389 __u32 opc = LUSTRE_OPC_ANY;
395 /* Usually we come here only for NFSD, and we want open lock.
396 But we can also get here with pre 2.6.15 patchless kernels, and in
397 that case that lock is also ok */
398 /* We can also get here if there was cached open handle in revalidate_it
399 * but it disappeared while we were getting from there to ll_file_open.
400 * But this means this file was closed and immediately opened which
401 * makes a good candidate for using OPEN lock */
402 /* If lmmsize & lmm are not 0, we are just setting stripe info
403 * parameters. No need for the open lock */
404 if (lmm == NULL && lmmsize == 0) {
405 itp->it_flags |= MDS_OPEN_LOCK;
406 if (itp->it_flags & FMODE_WRITE)
407 opc = LUSTRE_OPC_CREATE;
410 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
411 file->f_dentry->d_inode, name, len,
414 return PTR_ERR(op_data);
416 itp->it_flags |= MDS_OPEN_BY_FID;
417 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
418 0 /*unused */, &req, ll_md_blocking_ast, 0);
419 ll_finish_md_op_data(op_data);
421 /* reason for keep own exit path - don`t flood log
422 * with messages with -ESTALE errors.
/* -ESTALE with a granted open: quietly drop the server openhandle. */
424 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
425 it_open_error(DISP_OPEN_OPEN, itp))
427 ll_release_openhandle(file->f_dentry, itp);
431 if (it_disposition(itp, DISP_LOOKUP_NEG))
432 GOTO(out, rc = -ENOENT);
434 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
435 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
436 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
440 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
441 if (!rc && itp->d.lustre.it_lock_mode)
442 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
446 ptlrpc_req_finished(itp->d.lustre.it_data);
447 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
448 ll_intent_drop_lock(itp);
454 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
455 * not believe attributes if a few ioepoch holders exist. Attributes for
456 * previous ioepoch if new one is opened are also skipped by MDS.
/* Record the I/O epoch from an MDS open reply, skipping zero/unchanged. */
458 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
460 if (ioepoch && lli->lli_ioepoch != ioepoch) {
461 lli->lli_ioepoch = ioepoch;
462 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
463 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the MDS open reply carried in @it:
 * server file handle, FID, lease lock cookie, magic and open flags.
 * Registers the open with the replay machinery and returns its result.
 */
467 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
468 struct obd_client_handle *och)
470 struct ptlrpc_request *req = it->d.lustre.it_data;
471 struct mdt_body *body;
473 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
474 och->och_fh = body->handle;
475 och->och_fid = body->fid1;
476 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
477 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
478 och->och_flags = it->it_flags;
480 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish the client-local part of an open: if a new open handle @och was
 * granted, fill it from the intent reply and record the I/O epoch; then
 * attach @fd as the file's private data, initialize readahead state and
 * remember the effective open mode.
 * NOTE(review): listing truncated -- the och!=NULL guard and the rc
 * error path are not visible here.
 */
483 int ll_local_open(struct file *file, struct lookup_intent *it,
484 struct ll_file_data *fd, struct obd_client_handle *och)
486 struct inode *inode = file->f_dentry->d_inode;
487 struct ll_inode_info *lli = ll_i2info(inode);
489 LASSERT(!LUSTRE_FPRIVATE(file));
494 struct ptlrpc_request *req = it->d.lustre.it_data;
495 struct mdt_body *body;
498 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
502 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
503 ll_ioepoch_open(lli, body->ioepoch);
506 LUSTRE_FPRIVATE(file) = fd;
507 ll_readahead_init(inode, &fd->fd_ras);
508 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
512 /* Open a file, and (for the very first open) create objects on the OSTs at
513 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
514 * creation or open until ll_lov_setstripe() ioctl is called.
516 * If we already have the stripe MD locally then we don't request it in
517 * md_open(), by passing a lmm_size = 0.
519 * It is up to the application to ensure no other processes open this file
520 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
521 * used. We might be able to avoid races of that sort by getting lli_open_sem
522 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
523 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() handler.  Either reuses an open intent prepared by lookup
 * (file->private_data) or synthesizes one from f_flags, reuses a cached
 * per-mode MDS open handle when possible, and otherwise performs the
 * open RPC via ll_intent_file_open().  Also arms statahead for
 * directories and handles delayed OST object creation.
 * NOTE(review): listing truncated -- several error checks, labels
 * (out_och_free/out_openerr) and brace closures are not visible here.
 */
525 int ll_file_open(struct inode *inode, struct file *file)
527 struct ll_inode_info *lli = ll_i2info(inode);
528 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
529 .it_flags = file->f_flags };
530 struct obd_client_handle **och_p = NULL;
531 __u64 *och_usecount = NULL;
532 struct ll_file_data *fd;
533 int rc = 0, opendir_set = 0;
535 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
536 inode->i_generation, inode, file->f_flags);
538 it = file->private_data; /* XXX: compat macro */
539 file->private_data = NULL; /* prevent ll_local_open assertion */
541 fd = ll_file_data_get();
543 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory becomes the statahead owner. */
546 if (S_ISDIR(inode->i_mode)) {
547 spin_lock(&lli->lli_sa_lock);
548 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
549 lli->lli_opendir_pid == 0) {
550 lli->lli_opendir_key = fd;
551 lli->lli_opendir_pid = current_pid();
554 spin_unlock(&lli->lli_sa_lock);
/* The root dentry needs no MDS open; just attach the fd. */
557 if (inode->i_sb->s_root == file->f_dentry) {
558 LUSTRE_FPRIVATE(file) = fd;
562 if (!it || !it->d.lustre.it_disposition) {
563 /* Convert f_flags into access mode. We cannot use file->f_mode,
564 * because everything but O_ACCMODE mask was stripped from
566 if ((oit.it_flags + 1) & O_ACCMODE)
568 if (file->f_flags & O_TRUNC)
569 oit.it_flags |= FMODE_WRITE;
571 /* kernel only call f_op->open in dentry_open. filp_open calls
572 * dentry_open after call to open_namei that checks permissions.
573 * Only nfsd_open call dentry_open directly without checking
574 * permissions and because of that this code below is safe. */
575 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
576 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
578 /* We do not want O_EXCL here, presumably we opened the file
579 * already? XXX - NFS implications? */
580 oit.it_flags &= ~O_EXCL;
582 /* bug20584, if "it_flags" contains O_CREAT, the file will be
583 * created if necessary, then "IT_CREAT" should be set to keep
584 * consistent with it */
585 if (oit.it_flags & O_CREAT)
586 oit.it_op |= IT_CREAT;
592 /* Let's see if we have file open on MDS already. */
593 if (it->it_flags & FMODE_WRITE) {
594 och_p = &lli->lli_mds_write_och;
595 och_usecount = &lli->lli_open_fd_write_count;
596 } else if (it->it_flags & FMODE_EXEC) {
597 och_p = &lli->lli_mds_exec_och;
598 och_usecount = &lli->lli_open_fd_exec_count;
600 och_p = &lli->lli_mds_read_och;
601 och_usecount = &lli->lli_open_fd_read_count;
604 mutex_lock(&lli->lli_och_mutex);
605 if (*och_p) { /* Open handle is present */
606 if (it_disposition(it, DISP_OPEN_OPEN)) {
607 /* Well, there's extra open request that we do not need,
608 let's close it somehow. This will decref request. */
609 rc = it_open_error(DISP_OPEN_OPEN, it);
611 mutex_unlock(&lli->lli_och_mutex);
612 GOTO(out_openerr, rc);
615 ll_release_openhandle(file->f_dentry, it);
/* Reuse the cached handle: NULL och means "no new server handle". */
619 rc = ll_local_open(file, it, fd, NULL);
622 mutex_unlock(&lli->lli_och_mutex);
623 GOTO(out_openerr, rc);
626 LASSERT(*och_usecount == 0);
627 if (!it->d.lustre.it_disposition) {
628 /* We cannot just request lock handle now, new ELC code
629 means that one of other OPEN locks for this file
630 could be cancelled, and since blocking ast handler
631 would attempt to grab och_mutex as well, that would
632 result in a deadlock */
633 mutex_unlock(&lli->lli_och_mutex);
634 it->it_create_mode |= M_CHECK_STALE;
635 rc = ll_intent_file_open(file, NULL, 0, it);
636 it->it_create_mode &= ~M_CHECK_STALE;
638 GOTO(out_openerr, rc);
642 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
644 GOTO(out_och_free, rc = -ENOMEM);
648 /* md_intent_lock() didn't get a request ref if there was an
649 * open error, so don't do cleanup on the request here
651 /* XXX (green): Should not we bail out on any error here, not
652 * just open error? */
653 rc = it_open_error(DISP_OPEN_OPEN, it);
655 GOTO(out_och_free, rc);
657 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
659 rc = ll_local_open(file, it, fd, *och_p);
661 GOTO(out_och_free, rc);
663 mutex_unlock(&lli->lli_och_mutex);
666 /* Must do this outside lli_och_mutex lock to prevent deadlock where
667 different kind of OPEN lock for this same inode gets cancelled
668 by ldlm_cancel_lru */
669 if (!S_ISREG(inode->i_mode))
670 GOTO(out_och_free, rc);
/* No stripe metadata yet: defer OST object creation for read-only or
 * O_LOV_DELAY_CREATE opens; otherwise clear the delay flag. */
674 if (!lli->lli_has_smd) {
675 if (file->f_flags & O_LOV_DELAY_CREATE ||
676 !(file->f_mode & FMODE_WRITE)) {
677 CDEBUG(D_INODE, "object creation was delayed\n");
678 GOTO(out_och_free, rc);
681 file->f_flags &= ~O_LOV_DELAY_CREATE;
682 GOTO(out_och_free, rc);
/* Error cleanup: release the half-initialized cached handle slot. */
686 if (och_p && *och_p) {
687 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
688 *och_p = NULL; /* OBD_FREE writes some magic there */
691 mutex_unlock(&lli->lli_och_mutex);
694 if (opendir_set != 0)
695 ll_stop_statahead(inode, lli->lli_opendir_key);
697 ll_file_data_put(fd);
699 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
702 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
703 ptlrpc_req_finished(it->d.lustre.it_data);
704 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING, cancel the lease
 * lock asynchronously (the lease is thereby "broken"); the CANCELING
 * case body is not visible in this truncated listing.
 */
710 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
711 struct ldlm_lock_desc *desc, void *data, int flag)
714 struct lustre_handle lockh;
717 case LDLM_CB_BLOCKING:
718 ldlm_lock2handle(lock, &lockh);
719 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
721 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
725 case LDLM_CB_CANCELING:
733 * Acquire a lease and open the file.
/*
 * Open @inode with an MDS lease of mode @fmode (FMODE_READ or
 * FMODE_WRITE only).  If @file is given, reuse its existing open handle
 * (only legal when this fd is the sole opener) so the MDT can associate
 * the lease with the same owner.  The lease lock is enqueued with
 * LDLM_FL_NO_LRU | LDLM_FL_EXCL so it is neither LRU-cancelled nor
 * matched by regular opens.  Returns the new obd_client_handle or an
 * ERR_PTR; error paths close the openhandle and drop the open lock.
 * NOTE(review): listing truncated -- och allocation, rc declarations
 * and some brace closures are not visible here.
 */
735 struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
736 fmode_t fmode, __u64 open_flags)
738 struct lookup_intent it = { .it_op = IT_OPEN };
739 struct ll_sb_info *sbi = ll_i2sbi(inode);
740 struct md_op_data *op_data;
741 struct ptlrpc_request *req;
742 struct lustre_handle old_handle = { 0 };
743 struct obd_client_handle *och = NULL;
747 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
748 return ERR_PTR(-EINVAL);
751 struct ll_inode_info *lli = ll_i2info(inode);
752 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
753 struct obd_client_handle **och_p;
/* Lease mode must be compatible with how the fd was opened. */
756 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
757 return ERR_PTR(-EPERM);
759 /* Get the openhandle of the file */
761 mutex_lock(&lli->lli_och_mutex);
762 if (fd->fd_lease_och != NULL) {
763 mutex_unlock(&lli->lli_och_mutex);
767 if (fd->fd_och == NULL) {
768 if (file->f_mode & FMODE_WRITE) {
769 LASSERT(lli->lli_mds_write_och != NULL);
770 och_p = &lli->lli_mds_write_och;
771 och_usecount = &lli->lli_open_fd_write_count;
773 LASSERT(lli->lli_mds_read_och != NULL);
774 och_p = &lli->lli_mds_read_och;
775 och_usecount = &lli->lli_open_fd_read_count;
777 if (*och_usecount == 1) {
784 mutex_unlock(&lli->lli_och_mutex);
785 if (rc < 0) /* more than 1 opener */
788 LASSERT(fd->fd_och != NULL);
789 old_handle = fd->fd_och->och_fh;
794 return ERR_PTR(-ENOMEM);
796 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
797 LUSTRE_OPC_ANY, NULL);
799 GOTO(out, rc = PTR_ERR(op_data));
801 /* To tell the MDT this openhandle is from the same owner */
802 op_data->op_handle = old_handle;
804 it.it_flags = fmode | open_flags;
805 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
806 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
807 ll_md_blocking_lease_ast,
808 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
809 * it can be cancelled which may mislead applications that the lease is
811 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
812 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
813 * doesn't deal with openhandle, so normal openhandle will be leaked. */
814 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
815 ll_finish_md_op_data(op_data);
817 ptlrpc_req_finished(req);
818 it_clear_disposition(&it, DISP_ENQ_COMPLETE);
821 GOTO(out_release_it, rc);
823 if (it_disposition(&it, DISP_LOOKUP_NEG))
824 GOTO(out_release_it, rc = -ENOENT);
826 rc = it_open_error(DISP_OPEN_OPEN, &it);
828 GOTO(out_release_it, rc);
830 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
831 ll_och_fill(sbi->ll_md_exp, &it, och);
833 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
834 GOTO(out_close, rc = -EOPNOTSUPP);
836 /* already get lease, handle lease lock */
837 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
838 if (it.d.lustre.it_lock_mode == 0 ||
839 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
840 /* open lock must return for lease */
841 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
842 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
843 it.d.lustre.it_lock_bits);
844 GOTO(out_close, rc = -EPROTO);
847 ll_intent_release(&it);
/* Error path: close the server openhandle we just obtained... */
851 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
853 CERROR("Close openhandle returned %d\n", rc2);
855 /* cancel open lock */
856 if (it.d.lustre.it_lock_mode != 0) {
857 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
858 it.d.lustre.it_lock_mode);
859 it.d.lustre.it_lock_mode = 0;
862 ll_intent_release(&it);
870 * Release lease and close the file.
871 * It will check if the lease has ever broken.
/*
 * Cancel the lease lock (unless already cancelled by a blocking AST),
 * report via @lease_broken whether the lease was broken before this
 * call, and close the underlying MDS openhandle.
 */
873 int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
876 struct ldlm_lock *lock;
/* Default true: if the lock is already gone, treat the lease as broken. */
877 bool cancelled = true;
880 lock = ldlm_handle2lock(&och->och_lease_handle);
882 lock_res_and_lock(lock);
883 cancelled = ldlm_is_cancel(lock);
884 unlock_res_and_lock(lock);
888 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
889 PFID(&ll_i2info(inode)->lli_fid), cancelled);
892 ldlm_cli_cancel(&och->och_lease_handle, 0);
893 if (lease_broken != NULL)
894 *lease_broken = cancelled;
896 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
902 /* Fills the obdo with the attributes for the lsm */
/*
 * Issue an async OST getattr for every stripe of @lsm and wait for the
 * merged result in @obdo.  @ioepoch tags the request; @sync requests the
 * getattr under a server-side lock (OBD_FL_SRVLOCK).  On success only
 * the OST-authoritative fields (blocks/blksize/times/size/dataversion)
 * are left valid in obdo->o_valid.
 * NOTE(review): listing truncated -- oinfo.oi_oa assignment, the sync
 * guard and error returns are not fully visible.
 */
903 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
904 struct obd_capa *capa, struct obdo *obdo,
905 __u64 ioepoch, int sync)
907 struct ptlrpc_request_set *set;
908 struct obd_info oinfo = { { { 0 } } };
911 LASSERT(lsm != NULL);
915 oinfo.oi_oa->o_oi = lsm->lsm_oi;
916 oinfo.oi_oa->o_mode = S_IFREG;
917 oinfo.oi_oa->o_ioepoch = ioepoch;
918 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
919 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
920 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
921 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
922 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
923 OBD_MD_FLDATAVERSION;
924 oinfo.oi_capa = capa;
926 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
927 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
930 set = ptlrpc_prep_set();
932 CERROR("can't allocate ptlrpc set\n");
935 rc = obd_getattr_async(exp, &oinfo, set);
937 rc = ptlrpc_set_wait(set);
938 ptlrpc_set_destroy(set);
/* Keep only the attributes the OSTs are authoritative for. */
941 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
942 OBD_MD_FLATIME | OBD_MD_FLMTIME |
943 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
944 OBD_MD_FLDATAVERSION);
949 * Performs the getattr on the inode and updates its fields.
950 * If @sync != 0, perform the getattr under the server-side lock.
/*
 * Wrapper over ll_lsm_getattr(): grabs the inode's capability and
 * stripe metadata, fetches OST attributes, refreshes the inode from
 * the returned obdo, and releases the lsm reference.
 */
952 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
953 __u64 ioepoch, int sync)
955 struct obd_capa *capa = ll_mdscapa_get(inode);
956 struct lov_stripe_md *lsm;
959 lsm = ccc_inode_lsm_get(inode);
960 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
961 capa, obdo, ioepoch, sync);
964 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
966 obdo_refresh_inode(inode, obdo, obdo->o_valid);
967 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
968 " blksize %lu\n", POSTID(oi), i_size_read(inode),
969 (unsigned long long)inode->i_blocks,
970 (unsigned long)ll_inode_blksize(inode));
972 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (lli_lvb) with OST attributes from the
 * cl_object layer, keeping the most recent of each timestamp, and
 * update the inode's size/blocks under the inode size lock.
 * NOTE(review): listing truncated -- the lvb declaration and the rc==0
 * guard around the merge are not visible here.
 */
978 struct ll_inode_info *lli = ll_i2info(inode);
979 struct cl_object *obj = lli->lli_clob;
980 struct cl_attr *attr = ccc_env_thread_attr(env);
984 ll_inode_size_lock(inode);
985 /* merge timestamps the most recently obtained from mds with
986 timestamps obtained from osts */
987 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
988 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
989 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
990 inode_init_lvb(inode, &lvb);
992 cl_object_attr_lock(obj);
993 rc = cl_object_attr_get(env, obj, attr);
994 cl_object_attr_unlock(obj);
/* Keep the newest timestamp from either source. */
997 if (lvb.lvb_atime < attr->cat_atime)
998 lvb.lvb_atime = attr->cat_atime;
999 if (lvb.lvb_ctime < attr->cat_ctime)
1000 lvb.lvb_ctime = attr->cat_ctime;
1001 if (lvb.lvb_mtime < attr->cat_mtime)
1002 lvb.lvb_mtime = attr->cat_mtime;
1004 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1005 PFID(&lli->lli_fid), attr->cat_size);
1006 cl_isize_write_nolock(inode, attr->cat_size);
1008 inode->i_blocks = attr->cat_blocks;
1010 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1011 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1012 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1014 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctls: fetch fresh OST attributes for @lsm and
 * copy size/blocks/times into the caller-supplied stat structure.
 * NOTE(review): truncated -- success guard and return not visible.
 */
1019 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1022 struct obdo obdo = { 0 };
1025 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1027 st->st_size = obdo.o_size;
1028 st->st_blocks = obdo.o_blocks;
1029 st->st_mtime = obdo.o_mtime;
1030 st->st_atime = obdo.o_atime;
1031 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io from the file's open flags: non-blocking, append
 * and sync-write behaviour, the backing cl_object, and the lock policy
 * (never for nolock files, mandatory for O_APPEND, maybe otherwise).
 */
1036 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1038 struct inode *inode = file->f_dentry->d_inode;
1040 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1042 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1043 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1044 file->f_flags & O_DIRECT ||
1047 io->ci_obj = ll_i2info(inode)->lli_clob;
1048 io->ci_lockreq = CILR_MAYBE;
1049 if (ll_file_nolock(file)) {
1050 io->ci_lockreq = CILR_NEVER;
1051 io->ci_no_srvlock = 1;
1052 } else if (file->f_flags & O_APPEND) {
1053 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common read/write engine: set up a cl_io for @iot at *@ppos/@count,
 * dispatch on the I/O subtype (normal iovec, sendfile, splice), take
 * lli_write_mutex for non-group-locked writes and lli_trunc_sem for
 * reads, run cl_io_loop(), update *@ppos and the read/write byte stats,
 * and track fd_write_failed.  Restarts short I/O when ci_need_restart
 * is set and nothing was transferred.
 * NOTE(review): listing truncated -- switch case labels, the restart
 * goto and the out label structure are not fully visible.
 */
1058 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1059 struct file *file, enum cl_io_type iot,
1060 loff_t *ppos, size_t count)
1062 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1063 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1068 io = ccc_env_thread_io(env);
1069 ll_io_init(io, file, iot == CIT_WRITE);
1071 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1072 struct vvp_io *vio = vvp_env_io(env);
1073 struct ccc_io *cio = ccc_env_io(env);
1074 int write_mutex_locked = 0;
1076 cio->cui_fd = LUSTRE_FPRIVATE(file);
1077 vio->cui_io_subtype = args->via_io_subtype;
1079 switch (vio->cui_io_subtype) {
1081 cio->cui_iov = args->u.normal.via_iov;
1082 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1083 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1084 cio->cui_iocb = args->u.normal.via_iocb;
/* Serialize writes unless the caller holds a group lock. */
1085 if ((iot == CIT_WRITE) &&
1086 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1087 if (mutex_lock_interruptible(&lli->
1089 GOTO(out, result = -ERESTARTSYS);
1090 write_mutex_locked = 1;
1091 } else if (iot == CIT_READ) {
1092 down_read(&lli->lli_trunc_sem);
1096 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
1097 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
1100 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1101 vio->u.splice.cui_flags = args->u.splice.via_flags;
1104 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1107 result = cl_io_loop(env, io);
1108 if (write_mutex_locked)
1109 mutex_unlock(&lli->lli_write_mutex);
1110 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1111 up_read(&lli->lli_trunc_sem);
1113 /* cl_io_rw_init() handled IO */
1114 result = io->ci_result;
1117 if (io->ci_nob > 0) {
1118 result = io->ci_nob;
1119 *ppos = io->u.ci_wr.wr.crw_pos;
1123 cl_io_fini(env, io);
1124 /* If any bit been read/written (result != 0), we just return
1125 * short read/write instead of restart io. */
1126 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1127 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1128 iot == CIT_READ ? "read" : "write",
1129 file->f_dentry->d_name.name, *ppos, count);
1130 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1134 if (iot == CIT_READ) {
1136 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1137 LPROC_LL_READ_BYTES, result);
1138 } else if (iot == CIT_WRITE) {
1140 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1141 LPROC_LL_WRITE_BYTES, result);
1142 fd->fd_write_failed = false;
1143 } else if (result != -ERESTARTSYS) {
1144 fd->fd_write_failed = true;
1153 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array: reject negative/wrapping lengths, trim
 * *nr_segs at the first inaccessible segment, and return the total
 * byte count in *count.
 * NOTE(review): truncated -- the cnt accumulation, EINVAL/EFAULT
 * returns and loop exit are not fully visible.
 */
1155 static int ll_file_get_iov_count(const struct iovec *iov,
1156 unsigned long *nr_segs, size_t *count)
1161 for (seg = 0; seg < *nr_segs; seg++) {
1162 const struct iovec *iv = &iov[seg];
1165 * If any segment has a negative length, or the cumulative
1166 * length ever wraps negative then return -EINVAL.
1169 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1171 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1176 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio read entry point: validate the iovec, acquire a cl environment,
 * package the iovec/iocb into vvp_io_args and run the generic I/O path
 * as CIT_READ at iocb->ki_pos.
 */
1183 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1184 unsigned long nr_segs, loff_t pos)
1187 struct vvp_io_args *args;
1192 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1196 env = cl_env_get(&refcheck);
1198 return PTR_ERR(env);
1200 args = vvp_env_args(env, IO_NORMAL);
1201 args->u.normal.via_iov = (struct iovec *)iov;
1202 args->u.normal.via_nrsegs = nr_segs;
1203 args->u.normal.via_iocb = iocb;
1205 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1206 &iocb->ki_pos, count);
1207 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) entry point: build a single-segment iovec and a
 * synchronous kiocb in the per-environment scratch area, delegate to
 * ll_file_aio_read(), then propagate the updated position back to *ppos.
 * NOTE(review): listing is elided -- declarations and the final return
 * are missing from this view.
 */
1211 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1215 struct iovec *local_iov;
1216 struct kiocb *kiocb;
1220 env = cl_env_get(&refcheck);
1222 return PTR_ERR(env);
/* vti_local_iov/vti_kiocb are per-env scratch storage, so no allocation
 * is needed for this one-segment synchronous path. */
1224 local_iov = &vvp_env_info(env)->vti_local_iov;
1225 kiocb = &vvp_env_info(env)->vti_kiocb;
1226 local_iov->iov_base = (void __user *)buf;
1227 local_iov->iov_len = count;
1228 init_sync_kiocb(kiocb, file);
1229 kiocb->ki_pos = *ppos;
1230 kiocb->ki_nbytes = count;
1232 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1233 *ppos = kiocb->ki_pos;
1235 cl_env_put(env, &refcheck);
/*
 * AIO write entry point (mirror of ll_file_aio_read): validate the iovec,
 * pack iov/iocb into vvp_io_args (IO_NORMAL) and run the generic client IO
 * path in CIT_WRITE mode, advancing iocb->ki_pos.
 * NOTE(review): listing is elided -- declarations, error checks and the
 * final return are missing from this view.
 */
1240 * Write to a file (through the page cache).
1242 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1243 unsigned long nr_segs, loff_t pos)
1246 struct vvp_io_args *args;
1251 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1255 env = cl_env_get(&refcheck);
1257 return PTR_ERR(env);
1259 args = vvp_env_args(env, IO_NORMAL);
1260 args->u.normal.via_iov = (struct iovec *)iov;
1261 args->u.normal.via_nrsegs = nr_segs;
1262 args->u.normal.via_iocb = iocb;
1264 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1265 &iocb->ki_pos, count);
1266 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) entry point (mirror of ll_file_read): build a
 * single-segment iovec and a synchronous kiocb in the per-environment
 * scratch area, delegate to ll_file_aio_write(), then propagate the
 * updated position back to *ppos.
 * NOTE(review): listing is elided -- declarations and the final return
 * are missing from this view.
 */
1270 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1274 struct iovec *local_iov;
1275 struct kiocb *kiocb;
1279 env = cl_env_get(&refcheck);
1281 return PTR_ERR(env);
1283 local_iov = &vvp_env_info(env)->vti_local_iov;
1284 kiocb = &vvp_env_info(env)->vti_kiocb;
1285 local_iov->iov_base = (void __user *)buf;
1286 local_iov->iov_len = count;
1287 init_sync_kiocb(kiocb, file);
1288 kiocb->ki_pos = *ppos;
1289 kiocb->ki_nbytes = count;
1291 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1292 *ppos = kiocb->ki_pos;
1294 cl_env_put(env, &refcheck);
/*
 * splice_read entry point: pack the pipe and splice flags into the
 * IO_SPLICE variant of vvp_io_args and run the generic client IO path
 * in CIT_READ mode.
 * NOTE(review): listing is elided -- declarations, the cl_env_get()
 * error check and the final return are missing from this view.
 */
1301 * Send file content (through pagecache) somewhere with helper
1303 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1304 struct pipe_inode_info *pipe, size_t count,
1308 struct vvp_io_args *args;
1312 env = cl_env_get(&refcheck);
1314 return PTR_ERR(env);
1316 args = vvp_env_args(env, IO_SPLICE);
1317 args->u.splice.via_pipe = pipe;
1318 args->u.splice.via_flags = flags;
1320 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1321 cl_env_put(env, &refcheck);
/*
 * Recreate the OST objects backing an inode: duplicate the inode's
 * lov_stripe_md, fill an obdo with the target object id / OST index and
 * the OBD_FL_RECREATE_OBJS flag, and call obd_create() under the inode
 * size lock.  The lsm copy is freed afterwards and the lsm reference
 * dropped at 'out'.
 * NOTE(review): listing is elided -- the obdo allocation, several error
 * checks and the return are missing from this view; the o_oi assignment
 * from 'oi' is presumably among the missing lines -- TODO confirm.
 */
1325 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1328 struct obd_export *exp = ll_i2dtexp(inode);
1329 struct obd_trans_info oti = { 0 };
1330 struct obdo *oa = NULL;
1333 struct lov_stripe_md *lsm = NULL, *lsm2;
1339 lsm = ccc_inode_lsm_get(inode);
1340 if (!lsm_has_objects(lsm))
1341 GOTO(out, rc = -ENOENT);
1343 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1344 (lsm->lsm_stripe_count));
1346 OBD_ALLOC_LARGE(lsm2, lsm_size);
1348 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request. */
1351 oa->o_nlink = ost_idx;
1352 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1353 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1354 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1355 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1356 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1357 memcpy(lsm2, lsm, lsm_size);
1358 ll_inode_size_lock(inode);
1359 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1360 ll_inode_size_unlock(inode);
1362 OBD_FREE_LARGE(lsm2, lsm_size);
1365 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy an ll_recreate_obj request from
 * userspace, build an MDT0 ost_id from it and recreate the object via
 * ll_lov_recreate().  Requires CAP_SYS_ADMIN.
 * NOTE(review): listing is elided -- the -EPERM/-EFAULT returns for the
 * capability and copy_from_user() checks are missing from this view.
 */
1370 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1372 struct ll_recreate_obj ucreat;
1375 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1378 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1382 ostid_set_seq_mdt0(&oi);
1383 ostid_set_id(&oi, ucreat.lrc_id);
1384 return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
/*
 * LL_IOC_RECREATE_FID handler: copy a lu_fid from userspace, convert it
 * to an ost_id, derive the OST index from bits 16..31 of the fid
 * sequence, and recreate the object.  Requires CAP_SYS_ADMIN.
 * NOTE(review): listing is elided -- the -EPERM/-EFAULT returns are
 * missing from this view.
 */
1387 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1393 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1396 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1399 fid_to_ostid(&fid, &oi);
1400 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1401 return ll_lov_recreate(inode, &oi, ost_idx);
/*
 * Apply a striping EA to a file by re-opening it with an IT_OPEN intent
 * carrying the lov_user_md, under the inode size lock.  If the inode
 * already has a layout (lsm), the existing one is kept and a debug
 * message is emitted.  The open handle obtained for the intent is
 * released again via ll_release_openhandle().
 * NOTE(review): listing is elided -- the early-return for the existing
 * lsm, several error checks and the returns are missing from this view.
 */
1404 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1405 int flags, struct lov_user_md *lum, int lum_size)
1407 struct lov_stripe_md *lsm = NULL;
1408 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1411 lsm = ccc_inode_lsm_get(inode);
1413 ccc_inode_lsm_put(inode, lsm);
1414 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1419 ll_inode_size_lock(inode);
1420 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1423 rc = oit.d.lustre.it_status;
1425 GOTO(out_req_free, rc);
1427 ll_release_openhandle(file->f_dentry, &oit);
1430 ll_inode_size_unlock(inode);
1431 ll_intent_release(&oit);
1432 ccc_inode_lsm_put(inode, lsm);
/* out_req_free path: drop the enqueue request attached to the intent. */
1435 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping metadata) for @filename under @inode via
 * md_getattr_name(), returning the lov_mds_md buffer, its size and the
 * underlying request (caller is responsible for the request -- TODO
 * confirm ownership convention against callers).  The MDS sends the EA
 * in little endian; on big-endian hosts it is swabbed to host order
 * before being handed to userspace, skipping per-object swabbing for
 * directories (which have no objects).
 * NOTE(review): listing is elided -- error-path returns, the swab-branch
 * braces and the stripe_count arguments to
 * lustre_swab_lov_user_md_objects() are missing from this view.
 */
1439 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1440 struct lov_mds_md **lmmp, int *lmm_size,
1441 struct ptlrpc_request **request)
1443 struct ll_sb_info *sbi = ll_i2sbi(inode);
1444 struct mdt_body *body;
1445 struct lov_mds_md *lmm = NULL;
1446 struct ptlrpc_request *req = NULL;
1447 struct md_op_data *op_data;
1450 rc = ll_get_max_mdsize(sbi, &lmmsize);
1454 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1455 strlen(filename), lmmsize,
1456 LUSTRE_OPC_ANY, NULL);
1457 if (IS_ERR(op_data))
1458 return PTR_ERR(op_data);
1460 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1461 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1462 ll_finish_md_op_data(op_data);
1464 CDEBUG(D_INFO, "md_getattr_name failed "
1465 "on %s: rc %d\n", filename, rc);
1469 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1470 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1472 lmmsize = body->eadatasize;
1474 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1476 GOTO(out, rc = -ENODATA);
1479 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1480 LASSERT(lmm != NULL);
/* Only plain V1/V3 layouts are understood here; anything else is a
 * protocol error. */
1482 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1483 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1484 GOTO(out, rc = -EPROTO);
1488 * This is coming from the MDS, so is probably in
1489 * little endian. We convert it to host endian before
1490 * passing it to userspace.
/* True only on big-endian hosts: LOV_MAGIC differs from its LE form. */
1492 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1495 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1496 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1499 /* if function called for directory - we should
1500 * avoid swab not existent lsm objects */
1501 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1502 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1503 if (S_ISREG(body->mode))
1504 lustre_swab_lov_user_md_objects(
1505 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1507 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1508 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1509 if (S_ISREG(body->mode))
1510 lustre_swab_lov_user_md_objects(
1511 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1518 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one
 * lov_user_ost_data) from userspace into a temporary buffer and apply it
 * via ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS|FMODE_WRITE.
 * Requires CAP_SYS_ADMIN; the buffer is freed on all visible paths.
 * NOTE(review): listing is elided -- the -EPERM/-ENOMEM/-EFAULT returns
 * are missing from this view.
 */
1523 static int ll_lov_setea(struct inode *inode, struct file *file,
1526 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1527 struct lov_user_md *lump;
1528 int lum_size = sizeof(struct lov_user_md) +
1529 sizeof(struct lov_user_ost_data);
1532 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1535 OBD_ALLOC_LARGE(lump, lum_size);
1539 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1540 OBD_FREE_LARGE(lump, lum_size);
1544 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1546 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md, first as
 * the smaller v1 layout and again as v3 if the magic says so, then apply
 * it via ll_lov_setstripe_ea_info().  On success the stripe count in the
 * user's buffer is zeroed, the layout generation is refreshed and the
 * resulting stripe info is echoed back through LL_IOC_LOV_GETSTRIPE.
 * NOTE(review): listing is elided -- -EFAULT returns and the
 * success-branch condition are missing from this view.
 */
1550 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1553 struct lov_user_md_v3 lumv3;
1554 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1555 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1556 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1558 int flags = FMODE_WRITE;
1560 /* first try with v1 which is smaller than v3 */
1561 lum_size = sizeof(struct lov_user_md_v1);
1562 if (copy_from_user(lumv1, lumv1p, lum_size))
1565 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1566 lum_size = sizeof(struct lov_user_md_v3);
1567 if (copy_from_user(&lumv3, lumv3p, lum_size))
1571 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1573 struct lov_stripe_md *lsm;
/* Return value of put_user() is ignored here -- presumably deliberate
 * best-effort; the GETSTRIPE echo below is the authoritative copy-out. */
1576 put_user(0, &lumv1p->lmm_stripe_count);
1578 ll_layout_refresh(inode, &gen);
1579 lsm = ccc_inode_lsm_get(inode);
1580 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1581 0, lsm, (void *)arg);
1582 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: take a reference on the inode's layout
 * and let the LOV obd_iocontrol() copy the stripe info to userspace.
 * NOTE(review): listing is elided -- the no-layout check and the return
 * are missing from this view.
 */
1587 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1589 struct lov_stripe_md *lsm;
1592 lsm = ccc_inode_lsm_get(inode);
1594 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1596 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: take a group (GID-based) extent lock on the
 * file.  fd_flags is checked under lli_lock before and after the
 * (blocking) cl_get_grouplock() call, because another thread may win the
 * race while the lock is being acquired; the loser drops its grouplock.
 * @arg is the group lock gid.
 * NOTE(review): listing is elided -- early returns and the final return
 * are missing from this view.
 */
1600 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1602 struct ll_inode_info *lli = ll_i2info(inode);
1603 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1604 struct ccc_grouplock grouplock;
1607 if (ll_file_nolock(file))
1610 spin_lock(&lli->lli_lock);
1611 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1612 CWARN("group lock already existed with gid %lu\n",
1613 fd->fd_grouplock.cg_gid);
1614 spin_unlock(&lli->lli_lock);
1617 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1618 spin_unlock(&lli->lli_lock);
/* May block unless the file was opened O_NONBLOCK. */
1620 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1621 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1625 spin_lock(&lli->lli_lock);
1626 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1627 spin_unlock(&lli->lli_lock);
1628 CERROR("another thread just won the race\n");
1629 cl_put_grouplock(&grouplock);
1633 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1634 fd->fd_grouplock = grouplock;
1635 spin_unlock(&lli->lli_lock);
1637 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: validate that a group lock with gid @arg
 * is held on this file descriptor, detach it from fd under lli_lock,
 * then release it outside the spinlock via cl_put_grouplock().
 * NOTE(review): listing is elided -- the error returns and the final
 * return are missing from this view.
 */
1641 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1643 struct ll_inode_info *lli = ll_i2info(inode);
1644 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1645 struct ccc_grouplock grouplock;
1647 spin_lock(&lli->lli_lock);
1648 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1649 spin_unlock(&lli->lli_lock);
1650 CWARN("no group lock held\n");
1653 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1655 if (fd->fd_grouplock.cg_gid != arg) {
1656 CWARN("group lock %lu doesn't match current id %lu\n",
1657 arg, fd->fd_grouplock.cg_gid);
1658 spin_unlock(&lli->lli_lock);
/* Copy out and clear under the lock; release after dropping it. */
1662 grouplock = fd->fd_grouplock;
1663 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1664 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1665 spin_unlock(&lli->lli_lock);
1667 cl_put_grouplock(&grouplock);
1668 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * Close the MDS open handle attached to a lookup intent (obtained e.g.
 * by ll_intent_file_open()), if the intent actually carries one.  Also
 * drops the DISP_ENQ_OPEN_REF request reference that would otherwise be
 * consumed by ll_file_open().
 * NOTE(review): listing is elided -- early returns, the och NULL check
 * and the final return are missing from this view.
 */
1673 * Close inode open handle
1675 * \param dentry [in] dentry which contains the inode
1676 * \param it [in,out] intent which contains open info and result
1679 * \retval <0 failure
1681 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1683 struct inode *inode = dentry->d_inode;
1684 struct obd_client_handle *och;
1689 /* Root ? Do nothing. */
1690 if (dentry->d_inode->i_sb->s_root == dentry)
1693 /* No open handle to close? Move away */
1694 if (!it_disposition(it, DISP_OPEN_OPEN))
1697 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1699 OBD_ALLOC(och, sizeof(*och));
1701 GOTO(out, rc = -ENOMEM);
1703 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1705 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1708 /* this one is in place of ll_file_open */
1709 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1710 ptlrpc_req_finished(it->d.lustre.it_data);
1711 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Execute a FIEMAP request against the data export: validate the flags,
 * optionally flush dirty pages (FIEMAP_FLAG_SYNC), build an
 * ll_fiemap_info_key from the inode's layout and size, and let
 * obd_get_info(KEY_FIEMAP) fill the extent array in @fiemap.
 * Multi-stripe files require the caller to understand
 * FIEMAP_FLAG_DEVICE_ORDER, otherwise extents can't be interpreted.
 * NOTE(review): listing is elided -- the no-layout early exit, several
 * error returns and the final return are missing from this view.
 */
1717 * Get size for inode for which FIEMAP mapping is requested.
1718 * Make the FIEMAP get_info call and returns the result.
1720 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1723 struct obd_export *exp = ll_i2dtexp(inode);
1724 struct lov_stripe_md *lsm = NULL;
1725 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1726 int vallen = num_bytes;
1729 /* Checks for fiemap flags */
1730 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags we do not support, per fiemap convention. */
1731 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1735 /* Check for FIEMAP_FLAG_SYNC */
1736 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1737 rc = filemap_fdatawrite(inode->i_mapping);
1742 lsm = ccc_inode_lsm_get(inode);
1746 /* If the stripe_count > 1 and the application does not understand
1747 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1749 if (lsm->lsm_stripe_count > 1 &&
1750 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1751 GOTO(out, rc = -EOPNOTSUPP);
1753 fm_key.oa.o_oi = lsm->lsm_oi;
1754 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1756 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1757 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1758 /* If filesize is 0, then there would be no objects for mapping */
1759 if (fm_key.oa.o_size == 0) {
1760 fiemap->fm_mapped_extents = 0;
1764 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1766 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1769 CERROR("obd_get_info failed: rc = %d\n", rc);
1772 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: copy a getinfo_fid2path header from
 * userspace to learn gf_pathlen, allocate an output buffer of that size,
 * run the request through the MD export's obd_iocontrol(), and copy the
 * resolved path back to userspace.  Permitted for CAP_DAC_READ_SEARCH or
 * when the mount allows user fid2path.
 * NOTE(review): listing is elided -- the gfin free, several error
 * returns and the final return are missing from this view.
 */
1776 int ll_fid2path(struct inode *inode, void *arg)
1778 struct obd_export *exp = ll_i2mdexp(inode);
1779 struct getinfo_fid2path *gfout, *gfin;
1782 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1783 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1786 /* Need to get the buflen */
1787 OBD_ALLOC_PTR(gfin);
1790 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* gf_pathlen comes straight from userspace; presumably bounded by the
 * ioctl caller or the MDC -- TODO confirm an upper limit is enforced. */
1795 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1796 OBD_ALLOC(gfout, outsize);
1797 if (gfout == NULL) {
1801 memcpy(gfout, gfin, sizeof(*gfout));
1804 /* Call mdc_iocontrol */
1805 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1809 if (copy_to_user(arg, gfout, outsize))
1813 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size a kernel fiemap buffer from the
 * user-supplied fm_extent_count, copy the request (and, when extents are
 * requested, the first extent, which seeds end_offset/device), run
 * ll_do_fiemap(), and copy header plus mapped extents back to userspace.
 * NOTE(review): extent_count comes from userspace and the num_bytes
 * multiplication is not visibly range-checked here -- an overflow /
 * huge-allocation guard may live in the elided lines; TODO confirm.
 * NOTE(review): listing is elided -- error returns and the final return
 * are missing from this view.
 */
1817 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1819 struct ll_user_fiemap *fiemap_s;
1820 size_t num_bytes, ret_bytes;
1821 unsigned int extent_count;
1824 /* Get the extent count so we can calculate the size of
1825 * required fiemap buffer */
1826 if (get_user(extent_count,
1827 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1829 num_bytes = sizeof(*fiemap_s) + (extent_count *
1830 sizeof(struct ll_fiemap_extent));
1832 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1833 if (fiemap_s == NULL)
1836 /* get the fiemap value */
1837 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1839 GOTO(error, rc = -EFAULT);
1841 /* If fm_extent_count is non-zero, read the first extent since
1842 * it is used to calculate end_offset and device from previous
1845 if (copy_from_user(&fiemap_s->fm_extents[0],
1846 (char __user *)arg + sizeof(*fiemap_s),
1847 sizeof(struct ll_fiemap_extent)))
1848 GOTO(error, rc = -EFAULT);
1851 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1855 ret_bytes = sizeof(struct ll_user_fiemap);
1857 if (extent_count != 0)
1858 ret_bytes += (fiemap_s->fm_mapped_extents *
1859 sizeof(struct ll_fiemap_extent));
1861 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1865 OBD_FREE_LARGE(fiemap_s, num_bytes);
/*
 * Read the file's data version from the OSTs via ll_lsm_getattr().  A
 * file with no stripe objects is considered version 0.  @extent_lock
 * requests server-side extent locking; pass 0 when the caller already
 * holds OST object group locks.
 * NOTE(review): listing is elided -- the obdo free, intermediate error
 * checks and the final return are missing from this view.
 */
1870 * Read the data_version for inode.
1872 * This value is computed using stripe object version on OST.
1873 * Version is computed using server side locking.
1875 * @param extent_lock Take extent lock. Not needed if a process is already
1876 * holding the OST object group locks.
1878 int ll_data_version(struct inode *inode, __u64 *data_version,
1881 struct lov_stripe_md *lsm = NULL;
1882 struct ll_sb_info *sbi = ll_i2sbi(inode);
1883 struct obdo *obdo = NULL;
1886 /* If no stripe, we consider version is 0. */
1887 lsm = ccc_inode_lsm_get(inode);
1888 if (!lsm_has_objects(lsm)) {
1890 CDEBUG(D_INODE, "No object for inode\n");
1894 OBD_ALLOC_PTR(obdo);
1896 GOTO(out, rc = -ENOMEM);
1898 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1900 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1903 *data_version = obdo->o_data_version;
1908 ccc_inode_lsm_put(inode, lsm);
/*
 * Trigger an HSM release for @inode: take a write lease opened with
 * MDS_OPEN_RELEASE, grab the latest data_version, merge attributes from
 * the OST lvb into the inode, then close the open handle with the
 * release flag so the MDT can free the OST objects.  The lease lock
 * handle itself is released inside mdc_hsm_release_pack().  On the error
 * path the lease (if obtained) is closed via ll_lease_close().
 * NOTE(review): listing is elided -- intermediate error checks and the
 * final return are missing from this view.
 */
1913 * Trigger a HSM release request for the provided inode.
1915 int ll_hsm_release(struct inode *inode)
1917 struct cl_env_nest nest;
1919 struct obd_client_handle *och = NULL;
1920 __u64 data_version = 0;
1924 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1925 ll_get_fsname(inode->i_sb, NULL, 0),
1926 PFID(&ll_i2info(inode)->lli_fid));
1928 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1930 GOTO(out, rc = PTR_ERR(och));
1932 /* Grab latest data_version and [am]time values */
1933 rc = ll_data_version(inode, &data_version, 1);
1937 env = cl_env_nested_get(&nest);
1939 GOTO(out, rc = PTR_ERR(env));
1941 ll_merge_lvb(env, inode);
1942 cl_env_nested_put(&nest, env);
1944 /* Release the file.
1945 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1946 * we still need it to pack l_remote_handle to MDT. */
1947 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1953 if (och != NULL && !IS_ERR(och)) /* close the file */
1954 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): both inodes, saved [am]time
 * attrs for restoring after the swap, the data versions to verify, and
 * whether each side's data version must be checked.
 * NOTE(review): listing is elided -- the dv1/dv2 member declarations are
 * missing from this view (they are referenced by ll_swap_layouts()).
 */
1959 struct ll_swap_stack {
1960 struct iattr ia1, ia2;
1962 struct inode *inode1, *inode2;
1963 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * layouts of two regular files on the same filesystem.  Flow:
 *   1. validate (regular file, write permission, same superblock);
 *   2. order the pair by fid to avoid lock-ordering deadlocks, swapping
 *      the per-side dv/check flags along with the inodes;
 *   3. if a gid was supplied, take group locks on both files to flush
 *      dirty cache;
 *   4. optionally save mtime/atime so they can be restored afterwards;
 *   5. verify the data versions have not changed (if requested);
 *   6. send the swap through md_op_data / obd_iocontrol() on the MD
 *      export;
 *   7. drop the group locks and restore the saved times, applying each
 *      saved iattr to the *other* inode since the layouts were swapped.
 * NOTE(review): listing is elided -- several declarations (gid, dv,
 * rc1), error checks, GOTO labels and the final free/return are missing
 * from this view.
 */
1966 static int ll_swap_layouts(struct file *file1, struct file *file2,
1967 struct lustre_swap_layouts *lsl)
1969 struct mdc_swap_layouts msl;
1970 struct md_op_data *op_data;
1973 struct ll_swap_stack *llss = NULL;
1976 OBD_ALLOC_PTR(llss);
1980 llss->inode1 = file1->f_dentry->d_inode;
1981 llss->inode2 = file2->f_dentry->d_inode;
1983 if (!S_ISREG(llss->inode2->i_mode))
1984 GOTO(free, rc = -EINVAL);
1986 if (inode_permission(llss->inode1, MAY_WRITE) ||
1987 inode_permission(llss->inode2, MAY_WRITE))
1988 GOTO(free, rc = -EPERM);
1990 if (llss->inode2->i_sb != llss->inode1->i_sb)
1991 GOTO(free, rc = -EXDEV);
1993 /* we use 2 bool because it is easier to swap than 2 bits */
1994 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1995 llss->check_dv1 = true;
1997 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1998 llss->check_dv2 = true;
2000 /* we cannot use lsl->sl_dvX directly because we may swap them */
2001 llss->dv1 = lsl->sl_dv1;
2002 llss->dv2 = lsl->sl_dv2;
2004 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2005 if (rc == 0) /* same file, done! */
/* Canonical ordering by fid: all per-side state swaps together. */
2008 if (rc < 0) { /* sequentialize it */
2009 swap(llss->inode1, llss->inode2);
2011 swap(llss->dv1, llss->dv2);
2012 swap(llss->check_dv1, llss->check_dv2);
2016 if (gid != 0) { /* application asks to flush dirty cache */
2017 rc = ll_get_grouplock(llss->inode1, file1, gid);
2021 rc = ll_get_grouplock(llss->inode2, file2, gid);
2023 ll_put_grouplock(llss->inode1, file1, gid);
2028 /* to be able to restore mtime and atime after swap
2029 * we need to first save them */
2031 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2032 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2033 llss->ia1.ia_atime = llss->inode1->i_atime;
2034 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2035 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2036 llss->ia2.ia_atime = llss->inode2->i_atime;
2037 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2040 /* ultimate check, before swaping the layouts we check if
2041 * dataversion has changed (if requested) */
2042 if (llss->check_dv1) {
2043 rc = ll_data_version(llss->inode1, &dv, 0);
2046 if (dv != llss->dv1)
2047 GOTO(putgl, rc = -EAGAIN);
2050 if (llss->check_dv2) {
2051 rc = ll_data_version(llss->inode2, &dv, 0);
2054 if (dv != llss->dv2)
2055 GOTO(putgl, rc = -EAGAIN);
2058 /* struct md_op_data is used to send the swap args to the mdt
2059 * only flags is missing, so we use struct mdc_swap_layouts
2060 * through the md_op_data->op_data */
2061 /* flags from user space have to be converted before they are send to
2062 * server, no flag is sent today, they are only used on the client */
2065 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2066 0, LUSTRE_OPC_ANY, &msl);
2067 if (IS_ERR(op_data))
2068 GOTO(free, rc = PTR_ERR(op_data));
2070 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2071 sizeof(*op_data), op_data, NULL);
2072 ll_finish_md_op_data(op_data);
2076 ll_put_grouplock(llss->inode2, file2, gid);
2077 ll_put_grouplock(llss->inode1, file1, gid);
2080 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2084 /* clear useless flags */
2085 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2086 llss->ia1.ia_valid &= ~ATTR_MTIME;
2087 llss->ia2.ia_valid &= ~ATTR_MTIME;
2090 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2091 llss->ia1.ia_valid &= ~ATTR_ATIME;
2092 llss->ia2.ia_valid &= ~ATTR_ATIME;
2095 /* update time if requested */
/* ia2 goes to inode1 (and ia1 to inode2): the layouts were exchanged,
 * so each file keeps the times that belong with its new data. */
2097 if (llss->ia2.ia_valid != 0) {
2098 mutex_lock(&llss->inode1->i_mutex);
2099 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2100 mutex_unlock(&llss->inode1->i_mutex);
2103 if (llss->ia1.ia_valid != 0) {
2106 mutex_lock(&llss->inode2->i_mutex);
2107 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2108 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Main ioctl dispatcher for regular files on the Lustre client.  After a
 * stats tally and a bail-out for tty ioctls (asm-ppc declares TCGETS as
 * type 't'), a switch routes each command either to a local helper
 * (striping, group locks, fiemap, fid2path, data version, HSM state,
 * leases, layout swap) or down to obd_iocontrol() on the appropriate
 * export.  The default case at the bottom first offers the command to
 * registered ll_iocontrol_call() handlers, then forwards it to the data
 * export.
 * NOTE(review): listing is heavily elided -- 'switch (cmd)', many break
 * statements, error returns, closing braces and several declarations
 * (flags, rc, mode, lease_broken, hus/hss/hca allocations, err) are
 * missing from this view.
 */
2120 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2122 struct inode *inode = file->f_dentry->d_inode;
2123 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2126 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2127 inode->i_generation, inode, cmd);
2128 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2130 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2131 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2135 case LL_IOC_GETFLAGS:
2136 /* Get the current value of the file flags */
2137 return put_user(fd->fd_flags, (int *)arg);
2138 case LL_IOC_SETFLAGS:
2139 case LL_IOC_CLRFLAGS:
2140 /* Set or clear specific file flags */
2141 /* XXX This probably needs checks to ensure the flags are
2142 * not abused, and to handle any flag side effects.
2144 if (get_user(flags, (int *) arg))
2147 if (cmd == LL_IOC_SETFLAGS) {
/* LL_FILE_IGNORE_LOCK only makes sense for O_DIRECT I/O. */
2148 if ((flags & LL_FILE_IGNORE_LOCK) &&
2149 !(file->f_flags & O_DIRECT)) {
2150 CERROR("%s: unable to disable locking on "
2151 "non-O_DIRECT file\n", current->comm);
2155 fd->fd_flags |= flags;
2157 fd->fd_flags &= ~flags;
2160 case LL_IOC_LOV_SETSTRIPE:
2161 return ll_lov_setstripe(inode, file, arg);
2162 case LL_IOC_LOV_SETEA:
2163 return ll_lov_setea(inode, file, arg);
2164 case LL_IOC_LOV_SWAP_LAYOUTS: {
2166 struct lustre_swap_layouts lsl;
2168 if (copy_from_user(&lsl, (char *)arg,
2169 sizeof(struct lustre_swap_layouts)))
/* Both fds must be writable for a layout swap. */
2172 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2175 file2 = fget(lsl.sl_fd);
2180 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2181 rc = ll_swap_layouts(file, file2, &lsl);
2185 case LL_IOC_LOV_GETSTRIPE:
2186 return ll_lov_getstripe(inode, arg);
2187 case LL_IOC_RECREATE_OBJ:
2188 return ll_lov_recreate_obj(inode, arg);
2189 case LL_IOC_RECREATE_FID:
2190 return ll_lov_recreate_fid(inode, arg);
2191 case FSFILT_IOC_FIEMAP:
2192 return ll_ioctl_fiemap(inode, arg);
2193 case FSFILT_IOC_GETFLAGS:
2194 case FSFILT_IOC_SETFLAGS:
2195 return ll_iocontrol(inode, file, cmd, arg);
2196 case FSFILT_IOC_GETVERSION_OLD:
2197 case FSFILT_IOC_GETVERSION:
2198 return put_user(inode->i_generation, (int *)arg);
2199 case LL_IOC_GROUP_LOCK:
2200 return ll_get_grouplock(inode, file, arg);
2201 case LL_IOC_GROUP_UNLOCK:
2202 return ll_put_grouplock(inode, file, arg);
2203 case IOC_OBD_STATFS:
2204 return ll_obd_statfs(inode, (void *)arg);
2206 /* We need to special case any other ioctls we want to handle,
2207 * to send them to the MDS/OST as appropriate and to properly
2208 * network encode the arg field.
2209 case FSFILT_IOC_SETVERSION_OLD:
2210 case FSFILT_IOC_SETVERSION:
2212 case LL_IOC_FLUSHCTX:
2213 return ll_flush_ctx(inode);
2214 case LL_IOC_PATH2FID: {
2215 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2216 sizeof(struct lu_fid)))
2221 case OBD_IOC_FID2PATH:
2222 return ll_fid2path(inode, (void *)arg);
2223 case LL_IOC_DATA_VERSION: {
2224 struct ioc_data_version idv;
2227 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2230 rc = ll_data_version(inode, &idv.idv_version,
2231 !(idv.idv_flags & LL_DV_NOFLUSH));
2233 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2239 case LL_IOC_GET_MDTIDX: {
2242 mdtidx = ll_get_mdt_idx(inode);
2246 if (put_user((int)mdtidx, (int*)arg))
2251 case OBD_IOC_GETDTNAME:
2252 case OBD_IOC_GETMDNAME:
2253 return ll_get_obd_name(inode, cmd, arg);
2254 case LL_IOC_HSM_STATE_GET: {
2255 struct md_op_data *op_data;
2256 struct hsm_user_state *hus;
2263 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2264 LUSTRE_OPC_ANY, hus);
2265 if (IS_ERR(op_data)) {
2267 return PTR_ERR(op_data);
2270 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2273 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2276 ll_finish_md_op_data(op_data);
2280 case LL_IOC_HSM_STATE_SET: {
2281 struct md_op_data *op_data;
2282 struct hsm_state_set *hss;
2288 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2293 /* Non-root users are forbidden to set or clear flags which are
2294 * NOT defined in HSM_USER_MASK. */
2295 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2296 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2301 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2302 LUSTRE_OPC_ANY, hss);
2303 if (IS_ERR(op_data)) {
2305 return PTR_ERR(op_data);
2308 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2311 ll_finish_md_op_data(op_data);
2316 case LL_IOC_HSM_ACTION: {
2317 struct md_op_data *op_data;
2318 struct hsm_current_action *hca;
2325 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2326 LUSTRE_OPC_ANY, hca);
2327 if (IS_ERR(op_data)) {
2329 return PTR_ERR(op_data);
2332 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2335 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2338 ll_finish_md_op_data(op_data);
2342 case LL_IOC_SET_LEASE: {
2343 struct ll_inode_info *lli = ll_i2info(inode);
2344 struct obd_client_handle *och = NULL;
/* A lease mode must match the file's open mode. */
2350 if (!(file->f_mode & FMODE_WRITE))
2355 if (!(file->f_mode & FMODE_READ))
/* Mode 0 (unlock): detach and close any lease held on this fd. */
2360 mutex_lock(&lli->lli_och_mutex);
2361 if (fd->fd_lease_och != NULL) {
2362 och = fd->fd_lease_och;
2363 fd->fd_lease_och = NULL;
2365 mutex_unlock(&lli->lli_och_mutex);
2368 mode = och->och_flags &
2369 (FMODE_READ|FMODE_WRITE);
2370 rc = ll_lease_close(och, inode, &lease_broken);
2371 if (rc == 0 && lease_broken)
2377 /* return the type of lease or error */
2378 return rc < 0 ? rc : (int)mode;
2383 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2385 /* apply for lease */
2386 och = ll_lease_open(inode, file, mode, 0);
2388 return PTR_ERR(och);
2391 mutex_lock(&lli->lli_och_mutex);
2392 if (fd->fd_lease_och == NULL) {
2393 fd->fd_lease_och = och;
2396 mutex_unlock(&lli->lli_och_mutex);
2398 /* impossible now that only excl is supported for now */
2399 ll_lease_close(och, inode, &lease_broken);
2404 case LL_IOC_GET_LEASE: {
2405 struct ll_inode_info *lli = ll_i2info(inode);
2406 struct ldlm_lock *lock = NULL;
2409 mutex_lock(&lli->lli_och_mutex);
2410 if (fd->fd_lease_och != NULL) {
2411 struct obd_client_handle *och = fd->fd_lease_och;
2413 lock = ldlm_handle2lock(&och->och_lease_handle);
2415 lock_res_and_lock(lock);
/* Lease is still valid only while its DLM lock is uncancelled. */
2416 if (!ldlm_is_cancel(lock))
2417 rc = och->och_flags &
2418 (FMODE_READ | FMODE_WRITE);
2419 unlock_res_and_lock(lock);
2420 ldlm_lock_put(lock);
2423 mutex_unlock(&lli->lli_och_mutex);
/* default: try registered handlers first, then the data export. */
2431 ll_iocontrol_call(inode, file, cmd, arg, &err))
2434 return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA a glimpse RPC
 * refreshes the cached size first, then the generic VFS helper performs
 * the actual seek against ll_file_maxbytes() and the (possibly updated)
 * EOF.
 * NOTE(review): listing is elided -- the glimpse error check and the
 * final return are missing from this view; the computed 'retval' before
 * the CDEBUG appears to be used only for tracing here.
 */
2441 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2443 struct inode *inode = file->f_dentry->d_inode;
2444 loff_t retval, eof = 0;
2446 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2447 (origin == SEEK_CUR) ? file->f_pos : 0);
2448 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2449 inode->i_ino, inode->i_generation, inode, retval, retval,
2451 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2453 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2454 retval = ll_glimpse_size(inode);
2457 eof = i_size_read(inode);
2460 retval = generic_file_llseek_size(file, offset, origin,
2461 ll_file_maxbytes(inode), eof);
/*
 * flush (close(2)-time) handler: report any async writeback error
 * recorded on the inode or its cl_object, clearing the stored rc so the
 * error is delivered only once.  If the error was already reported to
 * the application (fd_write_failed) it is not repeated here.
 * NOTE(review): listing is elided -- the combining of 'err' into 'rc'
 * and the fd_write_failed reset are missing from this view.
 */
2465 int ll_flush(struct file *file, fl_owner_t id)
2467 struct inode *inode = file->f_dentry->d_inode;
2468 struct ll_inode_info *lli = ll_i2info(inode);
2469 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2472 LASSERT(!S_ISDIR(inode->i_mode));
2474 /* catch async errors that were recorded back when async writeback
2475 * failed for pages in this mapping. */
2476 rc = lli->lli_async_rc;
2477 lli->lli_async_rc = 0;
2478 err = lov_read_and_clear_async_rc(lli->lli_clob);
2482 /* The application has been told write failure already.
2483 * Do not report failure again. */
2484 if (fd->fd_write_failed)
2486 return rc ? -EIO : 0;
/*
 * Run a CIT_FSYNC cl_io over [start, end] of @inode with the requested
 * fsync mode (NONE/LOCAL/DISCARD/ALL).  On success returns the number of
 * pages written (fi_nr_written); on failure the io result.
 * NOTE(review): listing is elided -- fi_end assignment, the capa put,
 * the result<0 guard before substituting fi_nr_written and the final
 * return are missing from this view.
 */
2490 * Called to make sure a portion of file has been written out.
2491 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2493 * Return how many pages have been written.
2495 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2496 enum cl_fsync_mode mode, int ignore_layout)
2498 struct cl_env_nest nest;
2501 struct obd_capa *capa = NULL;
2502 struct cl_fsync_io *fio;
2505 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2506 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2509 env = cl_env_nested_get(&nest);
2511 return PTR_ERR(env);
2513 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2515 io = ccc_env_thread_io(env);
2516 io->ci_obj = cl_i2info(inode)->lli_clob;
2517 io->ci_ignore_layout = ignore_layout;
2519 /* initialize parameters for sync */
2520 fio = &io->u.ci_fsync;
2521 fio->fi_capa = capa;
2522 fio->fi_start = start;
2524 fio->fi_fid = ll_inode2fid(inode);
2525 fio->fi_mode = mode;
2526 fio->fi_nr_written = 0;
2528 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2529 result = cl_io_loop(env, io);
2531 result = io->ci_result;
2533 result = fio->fi_nr_written;
2534 cl_io_fini(env, io);
2535 cl_env_nested_put(&nest, env);
/*
 * fsync entry point: write-and-wait the page range, fold in any recorded
 * async writeback errors, sync the metadata via md_sync() on the MD
 * export, and for datasync on regular files also run an OST-side sync
 * over the whole object, updating fd_write_failed accordingly.  All of
 * this happens under i_mutex.
 * NOTE(review): listing is elided -- error combining, the md_sync
 * argument list tail, the cl_sync_file_range() mode argument and the
 * final return are missing from this view.
 */
2543 * When dentry is provided (the 'else' case), *file->f_dentry may be
2544 * null and dentry must be used directly rather than pulled from
2545 * *file->f_dentry as is done otherwise.
2548 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2550 struct dentry *dentry = file->f_dentry;
2551 struct inode *inode = dentry->d_inode;
2552 struct ll_inode_info *lli = ll_i2info(inode);
2553 struct ptlrpc_request *req;
2554 struct obd_capa *oc;
2557 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2558 inode->i_generation, inode);
2559 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2561 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2562 mutex_lock(&inode->i_mutex);
2564 /* catch async errors that were recorded back when async writeback
2565 * failed for pages in this mapping. */
2566 if (!S_ISDIR(inode->i_mode)) {
2567 err = lli->lli_async_rc;
2568 lli->lli_async_rc = 0;
2571 err = lov_read_and_clear_async_rc(lli->lli_clob);
2576 oc = ll_mdscapa_get(inode);
2577 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2583 ptlrpc_req_finished(req);
2585 if (datasync && S_ISREG(inode->i_mode)) {
2586 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2588 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2590 if (rc == 0 && err < 0)
2593 fd->fd_write_failed = true;
2595 fd->fd_write_failed = false;
2598 mutex_unlock(&inode->i_mutex);
/*
 * Handle flock()/fcntl() byte-range and whole-file locks by translating the
 * VFS file_lock into an LDLM flock enqueue against the MDS, then mirroring
 * the result into the local VFS lock tables.  On local bookkeeping failure
 * the server lock is rolled back with an LCK_NL (unlock) enqueue.
 */
2602 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2604 struct inode *inode = file->f_dentry->d_inode;
2605 struct ll_sb_info *sbi = ll_i2sbi(inode);
2606 struct ldlm_enqueue_info einfo = {
2607 .ei_type = LDLM_FLOCK,
2608 .ei_cb_cp = ldlm_flock_completion_ast,
2609 .ei_cbdata = file_lock,
2611 struct md_op_data *op_data;
2612 struct lustre_handle lockh = {0};
2613 ldlm_policy_data_t flock = {{0}};
2618 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2619 inode->i_ino, file_lock);
2621 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2623 if (file_lock->fl_flags & FL_FLOCK) {
2624 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2625 /* flocks are whole-file locks */
2626 flock.l_flock.end = OFFSET_MAX;
2627 /* For flocks owner is determined by the local file descriptor */
2628 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2629 } else if (file_lock->fl_flags & FL_POSIX) {
2630 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2631 flock.l_flock.start = file_lock->fl_start;
2632 flock.l_flock.end = file_lock->fl_end;
2636 flock.l_flock.pid = file_lock->fl_pid;
2638 /* Somewhat ugly workaround for svc lockd.
2639 * lockd installs custom fl_lmops->lm_compare_owner that checks
2640 * for the fl_owner to be the same (which it always is on local node
2641 * I guess between lockd processes) and then compares pid.
2642 * As such we assign pid to the owner field to make it all work,
2643 * conflict with normal locks is unlikely since pid space and
2644 * pointer space for current->files are not intersecting */
2645 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2646 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map the VFS lock type onto an LDLM lock mode */
2648 switch (file_lock->fl_type) {
2650 einfo.ei_mode = LCK_PR;
2653 /* An unlock request may or may not have any relation to
2654 * existing locks so we may not be able to pass a lock handle
2655 * via a normal ldlm_lock_cancel() request. The request may even
2656 * unlock a byte range in the middle of an existing lock. In
2657 * order to process an unlock request we need all of the same
2658 * information that is given with a normal read or write record
2659 * lock request. To avoid creating another ldlm unlock (cancel)
2660 * message we'll treat a LCK_NL flock request as an unlock. */
2661 einfo.ei_mode = LCK_NL;
2664 einfo.ei_mode = LCK_PW;
2667 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2668 file_lock->fl_type);
2683 flags = LDLM_FL_BLOCK_NOWAIT;
2689 flags = LDLM_FL_TEST_LOCK;
2690 /* Save the old mode so that if the mode in the lock changes we
2691 * can decrement the appropriate reader or writer refcount. */
2692 file_lock->fl_type = einfo.ei_mode;
2695 CERROR("unknown fcntl lock command: %d\n", cmd);
2699 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2700 LUSTRE_OPC_ANY, NULL);
2701 if (IS_ERR(op_data))
2702 return PTR_ERR(op_data);
2704 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2705 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2706 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
/* take (or release/test) the lock on the MDS */
2708 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2709 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* reflect the server result into the local VFS lock state */
2711 if ((file_lock->fl_flags & FL_FLOCK) &&
2712 (rc == 0 || file_lock->fl_type == F_UNLCK))
2713 rc2 = flock_lock_file_wait(file, file_lock);
2714 if ((file_lock->fl_flags & FL_POSIX) &&
2715 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2716 !(flags & LDLM_FL_TEST_LOCK))
2717 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed: undo the server-side lock */
2719 if (rc2 && file_lock->fl_type != F_UNLCK) {
2720 einfo.ei_mode = LCK_NL;
2721 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2722 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2726 ll_finish_md_op_data(op_data);
2731 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2737 * test if some locks matching bits and l_req_mode are acquired
2738 * - bits can be in different locks
2739 * - if found clear the common lock bits in *bits
2740 * - the bits not found, are kept in *bits
2742 * \param bits [IN] searched lock bits
2743 * \param l_req_mode [IN] searched lock mode
2744 * \retval boolean, true iff all bits are found
/*
 * Test whether MD inodebits locks covering *bits are cached locally (see the
 * doxygen comment above).  Matched bits are cleared from *bits; unmatched
 * bits remain.  Uses LDLM_FL_TEST_LOCK so no reference is taken on matches.
 */
2746 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2748 struct lustre_handle lockh;
2749 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode"; otherwise match the requested mode only */
2750 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2751 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2759 fid = &ll_i2info(inode)->lli_fid;
2760 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2761 ldlm_lockname[mode]);
2763 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested bit individually; bits may live in different locks */
2764 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2765 policy.l_inodebits.bits = *bits & (1 << i);
2766 if (policy.l_inodebits.bits == 0)
2769 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2770 &policy, mode, &lockh)) {
2771 struct ldlm_lock *lock;
2773 lock = ldlm_handle2lock(&lockh);
2776 ~(lock->l_policy_data.l_inodebits.bits);
2777 LDLM_LOCK_PUT(lock);
2779 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and take a reference on) a cached MD inodebits lock covering @bits.
 * Returns the matched lock mode (0 if none); the handle is stored in @lockh.
 */
2786 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2787 struct lustre_handle *lockh, __u64 flags,
2790 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2794 fid = &ll_i2info(inode)->lli_fid;
2795 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2797 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2798 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: -ENOENT on a
 * now-unlinked inode is not an error for the caller; other failures are
 * logged with the inode's FID.
 */
2803 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2805 /* Already unlinked. Just update nlink and return success */
2806 if (rc == -ENOENT) {
2808 /* This path cannot be hit for regular files unless in
2809 * case of obscure races, so no need to validate size.
2811 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2813 } else if (rc != 0) {
2814 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2815 ll_get_fsname(inode->i_sb, NULL, 0),
2816 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes against the MDS for the lock bits
 * in @ibits.  With OBD_CONNECT_ATTRFID it performs a by-FID intent getattr
 * (md_intent_lock); otherwise, if no covering MD lock is cached, it issues
 * a plain md_getattr and refreshes the inode from the reply.
 */
2822 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2825 struct inode *inode = dentry->d_inode;
2826 struct ptlrpc_request *req = NULL;
2827 struct obd_export *exp;
2830 LASSERT(inode != NULL);
2832 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2833 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2835 exp = ll_i2mdexp(inode);
2837 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2838 * But under CMD case, it caused some lock issues, should be fixed
2839 * with new CMD ibits lock. See bug 12718 */
2840 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2841 struct lookup_intent oit = { .it_op = IT_GETATTR };
2842 struct md_op_data *op_data;
2844 if (ibits == MDS_INODELOCK_LOOKUP)
2845 oit.it_op = IT_LOOKUP;
2847 /* Call getattr by fid, so do not provide name at all. */
2848 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2849 dentry->d_inode, NULL, 0, 0,
2850 LUSTRE_OPC_ANY, NULL);
2851 if (IS_ERR(op_data))
2852 return PTR_ERR(op_data);
/* M_CHECK_STALE makes the MDS verify this isn't a stale inode */
2854 oit.it_create_mode |= M_CHECK_STALE;
2855 rc = md_intent_lock(exp, op_data, NULL, 0,
2856 /* we are not interested in name
2859 ll_md_blocking_ast, 0);
2860 ll_finish_md_op_data(op_data);
2861 oit.it_create_mode &= ~M_CHECK_STALE;
2863 rc = ll_inode_revalidate_fini(inode, rc);
2867 rc = ll_revalidate_it_finish(req, &oit, dentry);
2869 ll_intent_release(&oit);
2873 /* Unlinked? Unhash dentry, so it is not picked up later by
2874 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2875 here to preserve get_cwd functionality on 2.6.
2877 if (!dentry->d_inode->i_nlink)
2878 d_lustre_invalidate(dentry, 0);
2880 ll_lookup_finish_locks(&oit, dentry);
2881 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2882 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2883 obd_valid valid = OBD_MD_FLGETATTR;
2884 struct md_op_data *op_data;
/* regular files may carry striping EAs; size the reply buffer for them */
2887 if (S_ISREG(inode->i_mode)) {
2888 rc = ll_get_max_mdsize(sbi, &ealen);
2891 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2894 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2895 0, ealen, LUSTRE_OPC_ANY,
2897 if (IS_ERR(op_data))
2898 return PTR_ERR(op_data);
2900 op_data->op_valid = valid;
2901 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2902 * capa for this inode. Because we only keep capas of dirs
2904 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2905 ll_finish_md_op_data(op_data);
2907 rc = ll_inode_revalidate_fini(inode, rc);
/* refresh the in-core inode from the getattr reply */
2911 rc = ll_prep_inode(&inode, req, NULL, NULL);
2914 ptlrpc_req_finished(req);
/*
 * Revalidate inode attributes and then bring the size/timestamps up to
 * date: non-regular files copy times from the cached LVB, regular files
 * glimpse the OSTs unless an HSM restore is in progress (in which case the
 * MDT already supplied the correct size and a glimpse would block).
 */
2918 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2921 struct inode *inode = dentry->d_inode;
2924 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2928 /* if object isn't regular file, don't validate size */
2929 if (!S_ISREG(inode->i_mode)) {
2930 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2931 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2932 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2934 /* In case of restore, the MDT has the right size and has
2935 * already send it back without granting the layout lock,
2936 * inode is up-to-date so glimpse is useless.
2937 * Also to glimpse we need the layout, in case of a running
2938 * restore the MDT holds the layout lock so the glimpse will
2939 * block up to the end of restore (getattr will block)
2941 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
2942 rc = ll_glimpse_size(inode);
/*
 * Intent-aware getattr: revalidate the inode (UPDATE|LOOKUP bits), then
 * fill *stat from the refreshed in-core inode.  With a 32-bit API client
 * the inode number is built from the FID instead of i_ino.
 */
2947 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2948 struct lookup_intent *it, struct kstat *stat)
2950 struct inode *inode = de->d_inode;
2951 struct ll_sb_info *sbi = ll_i2sbi(inode);
2952 struct ll_inode_info *lli = ll_i2info(inode);
2955 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2956 MDS_INODELOCK_LOOKUP);
2957 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2962 stat->dev = inode->i_sb->s_dev;
2963 if (ll_need_32bit_api(sbi))
2964 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2966 stat->ino = inode->i_ino;
2967 stat->mode = inode->i_mode;
2968 stat->nlink = inode->i_nlink;
2969 stat->uid = inode->i_uid;
2970 stat->gid = inode->i_gid;
2971 stat->rdev = inode->i_rdev;
2972 stat->atime = inode->i_atime;
2973 stat->mtime = inode->i_mtime;
2974 stat->ctime = inode->i_ctime;
2975 stat->blksize = 1 << inode->i_blkbits;
2977 stat->size = i_size_read(inode);
2978 stat->blocks = inode->i_blocks;
/* VFS ->getattr entry point: delegate to ll_getattr_it() with a GETATTR intent. */
2982 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2984 struct lookup_intent it = { .it_op = IT_GETATTR };
2986 return ll_getattr_it(mnt, de, &it, stat);
/*
 * VFS ->fiemap handler: marshal fieinfo into a ll_user_fiemap buffer sized
 * for fi_extents_max extents, run ll_do_fiemap(), and copy the mapped
 * extents back to the caller's buffer.
 */
2989 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2990 __u64 start, __u64 len)
2994 struct ll_user_fiemap *fiemap;
2995 unsigned int extent_count = fieinfo->fi_extents_max;
2997 num_bytes = sizeof(*fiemap) + (extent_count *
2998 sizeof(struct ll_fiemap_extent));
2999 OBD_ALLOC_LARGE(fiemap, num_bytes);
3004 fiemap->fm_flags = fieinfo->fi_flags;
3005 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3006 fiemap->fm_start = start;
3007 fiemap->fm_length = len;
/* only the first extent is seeded from the caller (continuation support) */
3008 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3009 sizeof(struct ll_fiemap_extent));
3011 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3013 fieinfo->fi_flags = fiemap->fm_flags;
3014 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3015 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3016 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3018 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * VFS ->get_acl: return a referenced copy of the cached POSIX ACL.
 * lli_lock protects lli_posix_acl against concurrent update.
 */
3022 struct posix_acl * ll_get_acl(struct inode *inode, int type)
3024 struct ll_inode_info *lli = ll_i2info(inode);
3025 struct posix_acl *acl = NULL;
3027 spin_lock(&lli->lli_lock);
3028 /* VFS' acl_permission_check->check_acl will release the refcount */
3029 acl = posix_acl_dup(lli->lli_posix_acl);
3030 spin_unlock(&lli->lli_lock);
/*
 * VFS ->permission: revalidate the root inode if needed, defer to the
 * remote-permission path for RMT_CLIENT mounts, otherwise fall through to
 * generic_permission().  Refuses MAY_NOT_BLOCK (RCU walk) where defined,
 * since revalidation may sleep.
 */
3036 int ll_inode_permission(struct inode *inode, int mask)
3040 #ifdef MAY_NOT_BLOCK
3041 if (mask & MAY_NOT_BLOCK)
3045 /* as root inode are NOT getting validated in lookup operation,
3046 * need to do it before permission check. */
3048 if (inode == inode->i_sb->s_root->d_inode) {
3049 struct lookup_intent it = { .it_op = IT_LOOKUP };
3051 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3052 MDS_INODELOCK_LOOKUP);
3057 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3058 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* remote client mounts check permission against the MDS instead */
3060 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3061 return lustre_check_remote_perm(inode, mask);
3063 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3064 rc = generic_permission(inode, mask);
/* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock methods, so the kernel
 * falls back to node-local POSIX/flock semantics. */
3070 struct file_operations ll_file_operations = {
3071 .read = ll_file_read,
3072 .aio_read = ll_file_aio_read,
3073 .write = ll_file_write,
3074 .aio_write = ll_file_aio_write,
3075 .unlocked_ioctl = ll_file_ioctl,
3076 .open = ll_file_open,
3077 .release = ll_file_release,
3078 .mmap = ll_file_mmap,
3079 .llseek = ll_file_seek,
3080 .splice_read = ll_file_splice_read,
/* file_operations for -o flock mounts: cluster-coherent locking via
 * ll_file_flock() for both flock() and fcntl() locks. */
3085 struct file_operations ll_file_operations_flock = {
3086 .read = ll_file_read,
3087 .aio_read = ll_file_aio_read,
3088 .write = ll_file_write,
3089 .aio_write = ll_file_aio_write,
3090 .unlocked_ioctl = ll_file_ioctl,
3091 .open = ll_file_open,
3092 .release = ll_file_release,
3093 .mmap = ll_file_mmap,
3094 .llseek = ll_file_seek,
3095 .splice_read = ll_file_splice_read,
3098 .flock = ll_file_flock,
3099 .lock = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
3103 struct file_operations ll_file_operations_noflock = {
3104 .read = ll_file_read,
3105 .aio_read = ll_file_aio_read,
3106 .write = ll_file_write,
3107 .aio_write = ll_file_aio_write,
3108 .unlocked_ioctl = ll_file_ioctl,
3109 .open = ll_file_open,
3110 .release = ll_file_release,
3111 .mmap = ll_file_mmap,
3112 .llseek = ll_file_seek,
3113 .splice_read = ll_file_splice_read,
3116 .flock = ll_file_noflock,
3117 .lock = ll_file_noflock
/* inode_operations for regular files. */
3120 struct inode_operations ll_file_inode_operations = {
3121 .setattr = ll_setattr,
3122 .getattr = ll_getattr,
3123 .permission = ll_inode_permission,
3124 .setxattr = ll_setxattr,
3125 .getxattr = ll_getxattr,
3126 .listxattr = ll_listxattr,
3127 .removexattr = ll_removexattr,
3128 .fiemap = ll_fiemap,
3129 .get_acl = ll_get_acl,
3132 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers: a list of llioc_data
 * entries protected by a read/write semaphore. */
3133 static struct llioc_ctl_data {
3134 struct rw_semaphore ioc_sem;
3135 struct list_head ioc_head;
3137 __RWSEM_INITIALIZER(llioc.ioc_sem),
3138 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered ioctl handler: callback plus the array of command numbers
 * it serves (iocd_cmd is a trailing variable-length array). */
3143 struct list_head iocd_list;
3144 unsigned int iocd_size;
3145 llioc_callback_t iocd_cb;
3146 unsigned int iocd_count;
3147 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler for @count command numbers in @cmd.
 * Returns an opaque cookie (the allocated llioc_data) used later by
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation failure.
 */
3150 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3153 struct llioc_data *in_data = NULL;
3155 if (cb == NULL || cmd == NULL ||
3156 count > LLIOC_MAX_CMD || count < 0)
3159 size = sizeof(*in_data) + count * sizeof(unsigned int);
3160 OBD_ALLOC(in_data, size);
3161 if (in_data == NULL)
3164 memset(in_data, 0, sizeof(*in_data));
3165 in_data->iocd_size = size;
3166 in_data->iocd_cb = cb;
3167 in_data->iocd_count = count;
3168 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the write lock */
3170 down_write(&llioc.ioc_sem);
3171 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3172 up_write(&llioc.ioc_sem);
/*
 * Unregister a handler previously returned by ll_iocontrol_register().
 * @magic is the registration cookie; warns if it is not found.
 */
3177 void ll_iocontrol_unregister(void *magic)
3179 struct llioc_data *tmp;
3184 down_write(&llioc.ioc_sem);
3185 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3187 unsigned int size = tmp->iocd_size;
3189 list_del(&tmp->iocd_list);
/* drop the lock before freeing; the entry is already unlinked */
3190 up_write(&llioc.ioc_sem);
3192 OBD_FREE(tmp, size);
3196 up_write(&llioc.ioc_sem);
3198 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
/* Exported so external modules can hook additional ioctl numbers. */
3201 EXPORT_SYMBOL(ll_iocontrol_register);
3202 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch @cmd to registered dynamic ioctl handlers, stopping at the first
 * one that returns LLIOC_STOP.  The handler's status is passed back through
 * *rcp (rc defaults to -EINVAL when no handler claims the command).
 */
3204 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3205 unsigned int cmd, unsigned long arg, int *rcp)
3207 enum llioc_iter ret = LLIOC_CONT;
3208 struct llioc_data *data;
3209 int rc = -EINVAL, i;
3211 down_read(&llioc.ioc_sem);
3212 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3213 for (i = 0; i < data->iocd_count; i++) {
3214 if (cmd != data->iocd_cmd[i])
3217 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3221 if (ret == LLIOC_STOP)
3224 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object for @inode via
 * cl_conf_set().  For OBJECT_CONF_SET the layout lock is only allowed to
 * match after the layout has been applied, so other threads never see a
 * stale layout.
 */
3231 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3233 struct ll_inode_info *lli = ll_i2info(inode);
3234 struct cl_env_nest nest;
3238 if (lli->lli_clob == NULL)
3241 env = cl_env_nested_get(&nest);
3243 return PTR_ERR(env);
3245 result = cl_conf_set(env, lli->lli_clob, conf);
3246 cl_env_nested_put(&nest, env);
3248 if (conf->coc_opc == OBJECT_CONF_SET) {
3249 struct ldlm_lock *lock = conf->coc_lock;
3251 LASSERT(lock != NULL);
3252 LASSERT(ldlm_has_layout(lock));
3254 /* it can only be allowed to match after layout is
3255 * applied to inode otherwise false layout would be
3256 * seen. Applying layout should happen before dropping
3257 * the intent lock. */
3258 ldlm_lock_allow_match(lock);
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
3265 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3268 struct ll_sb_info *sbi = ll_i2sbi(inode);
3269 struct obd_capa *oc;
3270 struct ptlrpc_request *req;
3271 struct mdt_body *body;
3277 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3278 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3279 lock->l_lvb_data, lock->l_lvb_len);
/* nothing to do if the lock already carries a ready LVB */
3281 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3284 /* if layout lock was granted right away, the layout is returned
3285 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3286 * blocked and then granted via completion ast, we have to fetch
3287 * layout here. Please note that we can't use the LVB buffer in
3288 * completion AST because it doesn't have a large enough buffer */
3289 oc = ll_mdscapa_get(inode);
3290 rc = ll_get_max_mdsize(sbi, &lmmsize);
3292 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3293 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3299 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3300 if (body == NULL || body->eadatasize > lmmsize)
3301 GOTO(out, rc = -EPROTO);
3303 lmmsize = body->eadatasize;
3304 if (lmmsize == 0) /* empty layout */
3307 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3309 GOTO(out, rc = -EFAULT);
/* copy the LOV EA into a fresh LVB buffer attached to the lock */
3311 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3312 if (lvbdata == NULL)
3313 GOTO(out, rc = -ENOMEM);
3315 memcpy(lvbdata, lmm, lmmsize);
3316 lock_res_and_lock(lock);
3317 if (lock->l_lvb_data != NULL)
3318 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3320 lock->l_lvb_data = lvbdata;
3321 lock->l_lvb_len = lmmsize;
3322 unlock_res_and_lock(lock);
3325 ptlrpc_req_finished(req);
* Apply the layout to the inode. Layout lock is held and will be released
/*
 * Takes the lock handle from ll_layout_refresh(), fetches the layout into
 * the lock's LVB if needed, unpacks it, configures the cl_object, returns
 * the layout generation through *gen, and finally drops the lock reference.
 * If the object is still busy (-EBUSY) it waits for in-flight IO via an
 * OBJECT_CONF_WAIT configuration.
 */
3333 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3334 struct inode *inode, __u32 *gen, bool reconf)
3336 struct ll_inode_info *lli = ll_i2info(inode);
3337 struct ll_sb_info *sbi = ll_i2sbi(inode);
3338 struct ldlm_lock *lock;
3339 struct lustre_md md = { NULL };
3340 struct cl_object_conf conf;
3343 bool wait_layout = false;
3345 LASSERT(lustre_handle_is_used(lockh));
3347 lock = ldlm_handle2lock(lockh);
3348 LASSERT(lock != NULL);
3349 LASSERT(ldlm_has_layout(lock));
3351 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3352 inode, PFID(&lli->lli_fid), reconf);
3354 /* in case this is a caching lock and reinstate with new inode */
3355 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3357 lock_res_and_lock(lock);
3358 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3359 unlock_res_and_lock(lock);
3360 /* checking lvb_ready is racy but this is okay. The worst case is
3361 * that multi processes may configure the file on the same time. */
3362 if (lvb_ready || !reconf) {
3365 /* layout_gen must be valid if layout lock is not
3366 * cancelled and stripe has already set */
3367 *gen = lli->lli_layout_gen;
3373 rc = ll_layout_fetch(inode, lock);
3377 /* for layout lock, lmm is returned in lock's lvb.
3378 * lvb_data is immutable if the lock is held so it's safe to access it
3379 * without res lock. See the description in ldlm_lock_decref_internal()
3380 * for the condition to free lvb_data of layout lock */
3381 if (lock->l_lvb_data != NULL) {
3382 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3383 lock->l_lvb_data, lock->l_lvb_len);
3385 *gen = LL_LAYOUT_GEN_EMPTY;
3387 *gen = md.lsm->lsm_layout_gen;
3390 CERROR("%s: file "DFID" unpackmd error: %d\n",
3391 ll_get_fsname(inode->i_sb, NULL, 0),
3392 PFID(&lli->lli_fid), rc);
3398 /* set layout to file. Unlikely this will fail as old layout was
3399 * surely eliminated */
3400 memset(&conf, 0, sizeof(conf));
3401 conf.coc_opc = OBJECT_CONF_SET;
3402 conf.coc_inode = inode;
3403 conf.coc_lock = lock;
3404 conf.u.coc_md = &md;
3405 rc = ll_layout_conf(inode, &conf);
3408 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3410 /* refresh layout failed, need to wait */
3411 wait_layout = rc == -EBUSY;
3414 LDLM_LOCK_PUT(lock);
3415 ldlm_lock_decref(lockh, mode);
3417 /* wait for IO to complete if it's still being used. */
3419 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3420 ll_get_fsname(inode->i_sb, NULL, 0),
3421 inode, PFID(&lli->lli_fid));
3423 memset(&conf, 0, sizeof(conf));
3424 conf.coc_opc = OBJECT_CONF_WAIT;
3425 conf.coc_inode = inode;
3426 rc = ll_layout_conf(inode, &conf);
3430 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3431 PFID(&lli->lli_fid), rc);
3437 * This function checks if there exists a LAYOUT lock on the client side,
3438 * or enqueues it if it doesn't have one in cache.
3440 * This function will not hold layout lock so it may be revoked any time after
3441 * this function returns. Any operations that depend on the layout should be redone
3444 * This function should be called before lov_io_init() to get an uptodate
3445 * layout version, the caller should save the version number and after IO
3446 * is finished, this function should be called again to verify that layout
3447 * is not changed during IO time.
/*
 * Ensure a LAYOUT lock is held (see the doxygen comment above) and return
 * the current layout generation through *gen.  Fast path matches a cached
 * lock; slow path serializes on lli_layout_mutex and enqueues an IT_LAYOUT
 * intent on the MDS, retrying under the mutex as needed.
 */
3449 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3451 struct ll_inode_info *lli = ll_i2info(inode);
3452 struct ll_sb_info *sbi = ll_i2sbi(inode);
3453 struct md_op_data *op_data;
3454 struct lookup_intent it;
3455 struct lustre_handle lockh;
3457 struct ldlm_enqueue_info einfo = {
3458 .ei_type = LDLM_IBITS,
3460 .ei_cb_bl = ll_md_blocking_ast,
3461 .ei_cb_cp = ldlm_completion_ast,
3465 *gen = lli->lli_layout_gen;
/* nothing to do when the server does not grant layout locks */
3466 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3470 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3471 LASSERT(S_ISREG(inode->i_mode));
3473 /* mostly layout lock is caching on the local side, so try to match
3474 * it before grabbing layout lock mutex. */
3475 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3476 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3477 if (mode != 0) { /* hit cached lock */
3478 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3482 /* better hold lli_layout_mutex to try again otherwise
3483 * it will have starvation problem. */
3486 /* take layout lock mutex to enqueue layout lock exclusively. */
3487 mutex_lock(&lli->lli_layout_mutex);
3490 /* try again. Maybe somebody else has done this. */
3491 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3492 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3493 if (mode != 0) { /* hit cached lock */
3494 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3498 mutex_unlock(&lli->lli_layout_mutex);
3502 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3503 0, 0, LUSTRE_OPC_ANY, NULL);
3504 if (IS_ERR(op_data)) {
3505 mutex_unlock(&lli->lli_layout_mutex);
3506 return PTR_ERR(op_data);
3509 /* have to enqueue one */
3510 memset(&it, 0, sizeof(it));
3511 it.it_op = IT_LAYOUT;
3512 lockh.cookie = 0ULL;
3514 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3515 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3516 PFID(&lli->lli_fid));
3518 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* the intent reply request is no longer needed once the lock is granted */
3520 if (it.d.lustre.it_data != NULL)
3521 ptlrpc_req_finished(it.d.lustre.it_data);
3522 it.d.lustre.it_data = NULL;
3524 ll_finish_md_op_data(op_data);
/* transfer lock ownership out of the intent before dropping it */
3526 mode = it.d.lustre.it_lock_mode;
3527 it.d.lustre.it_lock_mode = 0;
3528 ll_intent_drop_lock(&it);
3531 /* set lock data in case this is a new lock */
3532 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3533 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3537 mutex_unlock(&lli->lli_layout_mutex);
3543 * This function sends a restore request to the MDT
3545 int ll_layout_restore(struct inode *inode)
3547 struct hsm_user_request *hur;
3550 len = sizeof(struct hsm_user_request) +
3551 sizeof(struct hsm_user_item);
3552 OBD_ALLOC(hur, len);
3556 hur->hur_request.hr_action = HUA_RESTORE;
3557 hur->hur_request.hr_archive_id = 0;
3558 hur->hur_request.hr_flags = 0;
3559 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3560 sizeof(hur->hur_user_item[0].hui_fid));
3561 hur->hur_user_item[0].hui_extent.length = -1;
3562 hur->hur_request.hr_itemcount = 1;
3563 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,