4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate a per-open ll_file_data from ll_file_data_slab (__GFP_IO so the
 * allocation cannot recurse into filesystem IO) and reset its write-failed
 * flag.  NOTE(review): the allocation-failure check and return statement are
 * not visible in this chunk — presumably returns NULL on slab failure.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 	struct ll_file_data *fd;
57 	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
60 	fd->fd_write_failed = false;
/* Return a ll_file_data obtained from ll_file_data_get() to the slab. */
64 static void ll_file_data_put(struct ll_file_data *fd)
67 	OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes into @op_data for an MDS request:
 * fid, mode, a/m/ctime, size, block count, ext-style inode flags, the
 * current IO epoch, the MDS capability, and the open handle @fh.
 */
70 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
71 			  struct lustre_handle *fh)
73 	op_data->op_fid1 = ll_i2info(inode)->lli_fid;
74 	op_data->op_attr.ia_mode = inode->i_mode;
75 	op_data->op_attr.ia_atime = inode->i_atime;
76 	op_data->op_attr.ia_mtime = inode->i_mtime;
77 	op_data->op_attr.ia_ctime = inode->i_ctime;
78 	op_data->op_attr.ia_size = i_size_read(inode);
79 	op_data->op_attr_blocks = inode->i_blocks;
80 	((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
81 					ll_inode_to_ext_flags(inode->i_flags);
82 	op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
84 	op_data->op_handle = *fh;
85 	op_data->op_capa1 = ll_mdscapa_get(inode);
/* Tell the MDS that local data was modified so it refreshes attributes. */
87 	if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
88 		op_data->op_bias |= MDS_DATA_MODIFIED;
92 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Prepare @op_data for closing @och on @inode: mark all timestamps (and,
 * for SOM-capable regular files opened for write, size/blocks) as valid,
 * close the IO epoch, and pack current inode attributes plus the open
 * handle into @op_data.
 */
95 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
96 			     struct obd_client_handle *och)
98 	op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
99 					ATTR_MTIME | ATTR_MTIME_SET |
100 					ATTR_CTIME | ATTR_CTIME_SET;
102 	if (!(och->och_flags & FMODE_WRITE))
/* Size-on-MDS not supported or not a regular file: send size from client. */
105 	if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
106 		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* NOTE(review): &och (pointer-to-pointer) lets ll_ioepoch_close clear the
 * caller's handle pointer — confirm against ll_ioepoch_close prototype. */
108 	ll_ioepoch_close(inode, op_data, &och, 0);
111 	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
112 	ll_prep_md_op_data(op_data, inode, NULL, NULL,
113 			   0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS_CLOSE for open handle @och on @inode via @md_exp.
 * Performs the Size-on-MDS update if the MDS requests it, clears the
 * LLIF_DATA_MODIFIED flag once the MDS has acknowledged it, destroys
 * OST objects named in the close reply, and invalidates @och.
 * NOTE(review): several error-path branches are not visible in this chunk.
 */
116 static int ll_close_inode_openhandle(struct obd_export *md_exp,
118 				     struct obd_client_handle *och)
120 	struct obd_export *exp = ll_i2mdexp(inode);
121 	struct md_op_data *op_data;
122 	struct ptlrpc_request *req = NULL;
123 	struct obd_device *obd = class_exp2obd(exp);
129 	 * XXX: in case of LMV, is this correct to access
132 		CERROR("Invalid MDC connection handle "LPX64"\n",
133 		       ll_i2mdexp(inode)->exp_handle.h_cookie);
137 	OBD_ALLOC_PTR(op_data);
139 		GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
141 	ll_prepare_close(inode, op_data, och);
142 	epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
143 	rc = md_close(md_exp, op_data, och->och_mod, &req);
145 		/* This close must have the epoch closed. */
146 		LASSERT(epoch_close);
147 		/* MDS has instructed us to obtain Size-on-MDS attribute from
148 		 * OSTs and send setattr back to MDS. */
149 		rc = ll_som_update(inode, op_data);
151 			CERROR("inode %lu mdc Size-on-MDS update failed: "
152 			       "rc = %d\n", inode->i_ino, rc);
156 		CERROR("inode %lu mdc close failed: rc = %d\n",
160 	/* DATA_MODIFIED flag was successfully sent on close, cancel data
161 	 * modification flag. */
162 	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
163 		struct ll_inode_info *lli = ll_i2info(inode);
165 		spin_lock(&lli->lli_lock);
166 		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
167 		spin_unlock(&lli->lli_lock);
170 	ll_finish_md_op_data(op_data);
173 		rc = ll_objects_destroy(req, inode);
175 			CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* Epoch still open on a SOM write handle: defer to DONE_WRITING path. */
180 	if (exp_connect_som(exp) && !epoch_close &&
181 	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
182 		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
184 		md_clear_open_replay_data(md_exp, och);
185 		/* Free @och if it is not waiting for DONE_WRITING. */
186 		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
189 	if (req) /* This is close request */
190 		ptlrpc_req_finished(req);
/*
 * Drop the MDS open handle of kind @flags (write/exec/read) for @inode
 * if it has no remaining local users; otherwise leave it cached.
 * Called with no och mutex held; takes lli_och_mutex internally.
 */
194 int ll_md_real_close(struct inode *inode, int flags)
196 	struct ll_inode_info *lli = ll_i2info(inode);
197 	struct obd_client_handle **och_p;
198 	struct obd_client_handle *och;
/* Select the per-mode handle slot and its use count. */
202 	if (flags & FMODE_WRITE) {
203 		och_p = &lli->lli_mds_write_och;
204 		och_usecount = &lli->lli_open_fd_write_count;
205 	} else if (flags & FMODE_EXEC) {
206 		och_p = &lli->lli_mds_exec_och;
207 		och_usecount = &lli->lli_open_fd_exec_count;
209 		LASSERT(flags & FMODE_READ);
210 		och_p = &lli->lli_mds_read_och;
211 		och_usecount = &lli->lli_open_fd_read_count;
214 	mutex_lock(&lli->lli_och_mutex);
215 	if (*och_usecount) { /* There are still users of this handle, so
217 		mutex_unlock(&lli->lli_och_mutex);
222 	mutex_unlock(&lli->lli_och_mutex);
224 	if (och) { /* There might be a race and somebody has freed this och
226 		rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: release group lock and lease if held, close
 * any private open handle, drop this fd's reference on the shared per-mode
 * open count, and really close on the MDS only when no matching OPEN lock
 * is cached (md_lock_match with LDLM_FL_TEST_LOCK).  Frees the fd's
 * ll_file_data and the MDS capability at the end.
 */
233 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
236 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
237 	struct ll_inode_info *lli = ll_i2info(inode);
240 	/* clear group lock, if present */
241 	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
242 		ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
244 	if (fd->fd_lease_och != NULL) {
247 		/* Usually the lease is not released when the
248 		 * application crashed, we need to release here. */
249 		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
250 		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
251 		       PFID(&lli->lli_fid), rc, lease_broken);
253 		fd->fd_lease_och = NULL;
/* A private (non-shared) open handle, e.g. one swapped in by lease code. */
256 	if (fd->fd_och != NULL) {
257 		rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och);
262 	/* Let's see if we have good enough OPEN lock on the file and if
263 	   we can skip talking to MDS */
264 	if (file->f_dentry->d_inode) { /* Can this ever be false? */
266 		int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
267 		struct lustre_handle lockh;
268 		struct inode *inode = file->f_dentry->d_inode;
269 		ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
271 		mutex_lock(&lli->lli_och_mutex);
272 		if (fd->fd_omode & FMODE_WRITE) {
274 			LASSERT(lli->lli_open_fd_write_count);
275 			lli->lli_open_fd_write_count--;
276 		} else if (fd->fd_omode & FMODE_EXEC) {
278 			LASSERT(lli->lli_open_fd_exec_count);
279 			lli->lli_open_fd_exec_count--;
282 			LASSERT(lli->lli_open_fd_read_count);
283 			lli->lli_open_fd_read_count--;
285 		mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock: must really close the handle on the MDS. */
287 		if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
288 				   LDLM_IBITS, &policy, lockmode,
290 			rc = ll_md_real_close(file->f_dentry->d_inode,
294 		CERROR("Releasing a file %p with negative dentry %p. Name %s",
295 		       file, file->f_dentry, file->f_dentry->d_name.name);
299 	LUSTRE_FPRIVATE(file) = NULL;
300 	ll_file_data_put(fd);
301 	ll_capa_close(inode);
306 /* While this returns an error code, fput() the caller does not, so we need
307 * to make every effort to clean up all of our state here. Also, applications
308 * rarely check close errors and even if an error is returned they will not
309 * re-try the close call.
/*
 * VFS ->release() for Lustre files.  Tears down remote-client ACL state on
 * the root inode, stops the statahead thread if this fd started it, clears
 * pending async write errors on regular files, then closes the MDS handle
 * via ll_md_close().  The root directory fd is released without an MDS
 * close.  Note the file-level comment above: close errors are best-effort.
 */
311 int ll_file_release(struct inode *inode, struct file *file)
313 	struct ll_file_data *fd;
314 	struct ll_sb_info *sbi = ll_i2sbi(inode);
315 	struct ll_inode_info *lli = ll_i2info(inode);
318 	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
319 	       inode->i_generation, inode);
321 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client mode: drop this process's remote ACL session on the root. */
322 	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
323 	    inode == inode->i_sb->s_root->d_inode) {
324 		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
327 		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
328 			fd->fd_flags &= ~LL_FILE_RMTACL;
329 			rct_del(&sbi->ll_rct, current_pid());
330 			et_search_free(&sbi->ll_et, current_pid());
335 	if (inode->i_sb->s_root != file->f_dentry)
336 		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
337 	fd = LUSTRE_FPRIVATE(file);
340 	/* The last ref on @file, maybe not the owner pid of statahead.
341 	 * Different processes can open the same dir, "ll_opendir_key" means:
342 	 * it is me that should stop the statahead thread. */
343 	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
344 	    lli->lli_opendir_pid != 0)
345 		ll_stop_statahead(inode, lli->lli_opendir_key);
347 	if (inode->i_sb->s_root == file->f_dentry) {
348 		LUSTRE_FPRIVATE(file) = NULL;
349 		ll_file_data_put(fd);
/* Fold any async write error into the inode before closing. */
353 	if (!S_ISDIR(inode->i_mode)) {
354 		lov_read_and_clear_async_rc(lli->lli_clob);
355 		lli->lli_async_rc = 0;
358 	rc = ll_md_close(sbi->ll_md_exp, inode, file);
360 	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
361 		libcfs_debug_dumplog();
/*
 * Send an IT_OPEN intent to the MDS for @file (optionally carrying stripe
 * data @lmm/@lmmsize), set up the resulting inode and lock data, and drop
 * the intent's request/lock references before returning.
 */
366 static int ll_intent_file_open(struct file *file, void *lmm,
367 			       int lmmsize, struct lookup_intent *itp)
369 	struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
370 	struct dentry *parent = file->f_dentry->d_parent;
371 	const char *name = file->f_dentry->d_name.name;
372 	const int len = file->f_dentry->d_name.len;
373 	struct md_op_data *op_data;
374 	struct ptlrpc_request *req;
375 	__u32 opc = LUSTRE_OPC_ANY;
381 	/* Usually we come here only for NFSD, and we want open lock.
382 	   But we can also get here with pre 2.6.15 patchless kernels, and in
383 	   that case that lock is also ok */
384 	/* We can also get here if there was cached open handle in revalidate_it
385 	 * but it disappeared while we were getting from there to ll_file_open.
386 	 * But this means this file was closed and immediately opened which
387 	 * makes a good candidate for using OPEN lock */
388 	/* If lmmsize & lmm are not 0, we are just setting stripe info
389 	 * parameters. No need for the open lock */
390 	if (lmm == NULL && lmmsize == 0) {
391 		itp->it_flags |= MDS_OPEN_LOCK;
392 		if (itp->it_flags & FMODE_WRITE)
393 			opc = LUSTRE_OPC_CREATE;
396 	op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
397 				      file->f_dentry->d_inode, name, len,
400 		return PTR_ERR(op_data);
402 	itp->it_flags |= MDS_OPEN_BY_FID;
403 	rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
404 			    0 /*unused */, &req, ll_md_blocking_ast, 0);
405 	ll_finish_md_op_data(op_data);
407 		/* reason for keep own exit path - don't flood the log
408 		 * with messages with -ESTALE errors.
410 		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
411 		    it_open_error(DISP_OPEN_OPEN, itp))
413 		ll_release_openhandle(file->f_dentry, itp);
417 	if (it_disposition(itp, DISP_LOOKUP_NEG))
418 		GOTO(out, rc = -ENOENT);
420 	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
421 		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
422 		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
426 	rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
427 	if (!rc && itp->d.lustre.it_lock_mode)
428 		ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
/* Release the intent's request reference and any lock it pinned. */
432 	ptlrpc_req_finished(itp->d.lustre.it_data);
433 	it_clear_disposition(itp, DISP_ENQ_COMPLETE);
434 	ll_intent_drop_lock(itp);
440 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
441 * not believe attributes if a few ioepoch holders exist. Attributes for
442 * previous ioepoch if new one is opened are also skipped by MDS.
/*
 * Record a newly obtained @ioepoch on the inode (see block comment above:
 * no locking needed — MDS ignores attributes from stale/concurrent epochs).
 * A zero or unchanged epoch is a no-op.
 */
444 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
446 	if (ioepoch && lli->lli_ioepoch != ioepoch) {
447 		lli->lli_ioepoch = ioepoch;
448 		CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
449 		       ioepoch, PFID(&lli->lli_fid));
/*
 * Populate @och from the MDT_BODY in the intent's reply (open handle, fid,
 * lease lock handle, open flags) and register the open for replay.
 * Returns the result of md_set_open_replay_data().
 */
453 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
454 		       struct obd_client_handle *och)
456 	struct ptlrpc_request *req = it->d.lustre.it_data;
457 	struct mdt_body *body;
459 	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
460 	och->och_fh = body->handle;
461 	och->och_fid = body->fid1;
462 	och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
463 	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
464 	och->och_flags = it->it_flags;
466 	return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish the client-side part of an open: if @och is given, fill it from
 * the intent reply and open the IO epoch from the MDT body; then attach
 * @fd as the file's private data, init readahead state, and remember the
 * open mode.  Asserts the file has no private data yet.
 */
469 int ll_local_open(struct file *file, struct lookup_intent *it,
470 		  struct ll_file_data *fd, struct obd_client_handle *och)
472 	struct inode *inode = file->f_dentry->d_inode;
473 	struct ll_inode_info *lli = ll_i2info(inode);
475 	LASSERT(!LUSTRE_FPRIVATE(file));
480 		struct ptlrpc_request *req = it->d.lustre.it_data;
481 		struct mdt_body *body;
484 		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
488 		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
489 		ll_ioepoch_open(lli, body->ioepoch);
492 	LUSTRE_FPRIVATE(file) = fd;
493 	ll_readahead_init(inode, &fd->fd_ras);
494 	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
498 /* Open a file, and (for the very first open) create objects on the OSTs at
499 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
500 * creation or open until ll_lov_setstripe() ioctl is called.
502 * If we already have the stripe MD locally then we don't request it in
503 * md_open(), by passing a lmm_size = 0.
505 * It is up to the application to ensure no other processes open this file
506 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
507 * used. We might be able to avoid races of that sort by getting lli_open_sem
508 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
509 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() for Lustre (see the block comment above for the
 * O_LOV_DELAY_CREATE semantics).  Reuses a cached per-mode MDS open handle
 * when one exists; otherwise sends an IT_OPEN intent and records the new
 * handle.  Directory opens may also register this fd as the statahead
 * owner.  NOTE(review): several error/cleanup branches fall between the
 * visible lines of this chunk.
 */
511 int ll_file_open(struct inode *inode, struct file *file)
513 	struct ll_inode_info *lli = ll_i2info(inode);
514 	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
515 					  .it_flags = file->f_flags };
516 	struct obd_client_handle **och_p = NULL;
517 	__u64 *och_usecount = NULL;
518 	struct ll_file_data *fd;
519 	int rc = 0, opendir_set = 0;
521 	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
522 	       inode->i_generation, inode, file->f_flags);
524 	it = file->private_data; /* XXX: compat macro */
525 	file->private_data = NULL; /* prevent ll_local_open assertion */
527 	fd = ll_file_data_get();
529 		GOTO(out_openerr, rc = -ENOMEM);
/* First opener of this directory becomes the statahead owner. */
532 	if (S_ISDIR(inode->i_mode)) {
533 		spin_lock(&lli->lli_sa_lock);
534 		if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
535 		    lli->lli_opendir_pid == 0) {
536 			lli->lli_opendir_key = fd;
537 			lli->lli_opendir_pid = current_pid();
540 		spin_unlock(&lli->lli_sa_lock);
/* Root directory: no MDS open needed, just attach the fd. */
543 	if (inode->i_sb->s_root == file->f_dentry) {
544 		LUSTRE_FPRIVATE(file) = fd;
/* No intent from lookup: build our own open intent from f_flags. */
548 	if (!it || !it->d.lustre.it_disposition) {
549 		/* Convert f_flags into access mode. We cannot use file->f_mode,
550 		 * because everything but O_ACCMODE mask was stripped from
552 		if ((oit.it_flags + 1) & O_ACCMODE)
554 		if (file->f_flags & O_TRUNC)
555 			oit.it_flags |= FMODE_WRITE;
557 		/* kernel only call f_op->open in dentry_open. filp_open calls
558 		 * dentry_open after call to open_namei that checks permissions.
559 		 * Only nfsd_open call dentry_open directly without checking
560 		 * permissions and because of that this code below is safe. */
561 		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
562 			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
564 		/* We do not want O_EXCL here, presumably we opened the file
565 		 * already? XXX - NFS implications? */
566 		oit.it_flags &= ~O_EXCL;
568 		/* bug20584, if "it_flags" contains O_CREAT, the file will be
569 		 * created if necessary, then "IT_CREAT" should be set to keep
570 		 * consistent with it */
571 		if (oit.it_flags & O_CREAT)
572 			oit.it_op |= IT_CREAT;
578 	/* Let's see if we have file open on MDS already. */
579 	if (it->it_flags & FMODE_WRITE) {
580 		och_p = &lli->lli_mds_write_och;
581 		och_usecount = &lli->lli_open_fd_write_count;
582 	} else if (it->it_flags & FMODE_EXEC) {
583 		och_p = &lli->lli_mds_exec_och;
584 		och_usecount = &lli->lli_open_fd_exec_count;
586 		och_p = &lli->lli_mds_read_och;
587 		och_usecount = &lli->lli_open_fd_read_count;
590 	mutex_lock(&lli->lli_och_mutex);
591 	if (*och_p) { /* Open handle is present */
592 		if (it_disposition(it, DISP_OPEN_OPEN)) {
593 			/* Well, there's extra open request that we do not need,
594 			   let's close it somehow. This will decref request. */
595 			rc = it_open_error(DISP_OPEN_OPEN, it);
597 				mutex_unlock(&lli->lli_och_mutex);
598 				GOTO(out_openerr, rc);
601 			ll_release_openhandle(file->f_dentry, it);
605 		rc = ll_local_open(file, it, fd, NULL);
608 			mutex_unlock(&lli->lli_och_mutex);
609 			GOTO(out_openerr, rc);
612 		LASSERT(*och_usecount == 0);
613 		if (!it->d.lustre.it_disposition) {
614 			/* We cannot just request lock handle now, new ELC code
615 			   means that one of other OPEN locks for this file
616 			   could be cancelled, and since blocking ast handler
617 			   would attempt to grab och_mutex as well, that would
618 			   result in a deadlock */
619 			mutex_unlock(&lli->lli_och_mutex);
620 			it->it_create_mode |= M_CHECK_STALE;
621 			rc = ll_intent_file_open(file, NULL, 0, it);
622 			it->it_create_mode &= ~M_CHECK_STALE;
624 				GOTO(out_openerr, rc);
628 		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
630 			GOTO(out_och_free, rc = -ENOMEM);
634 		/* md_intent_lock() didn't get a request ref if there was an
635 		 * open error, so don't do cleanup on the request here
637 		/* XXX (green): Should not we bail out on any error here, not
638 		 * just open error? */
639 		rc = it_open_error(DISP_OPEN_OPEN, it);
641 			GOTO(out_och_free, rc);
643 		LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
645 		rc = ll_local_open(file, it, fd, *och_p);
647 			GOTO(out_och_free, rc);
649 	mutex_unlock(&lli->lli_och_mutex);
652 	/* Must do this outside lli_och_mutex lock to prevent deadlock where
653 	   different kind of OPEN lock for this same inode gets cancelled
654 	   by ldlm_cancel_lru */
655 	if (!S_ISREG(inode->i_mode))
656 		GOTO(out_och_free, rc);
660 	if (!lli->lli_has_smd) {
661 		if (file->f_flags & O_LOV_DELAY_CREATE ||
662 		    !(file->f_mode & FMODE_WRITE)) {
663 			CDEBUG(D_INODE, "object creation was delayed\n");
664 			GOTO(out_och_free, rc);
667 	file->f_flags &= ~O_LOV_DELAY_CREATE;
668 	GOTO(out_och_free, rc);
/* Error cleanup: free the never-used handle slot and undo statahead/fd. */
672 		if (och_p && *och_p) {
673 			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
674 			*och_p = NULL; /* OBD_FREE writes some magic there */
677 		mutex_unlock(&lli->lli_och_mutex);
680 		if (opendir_set != 0)
681 			ll_stop_statahead(inode, lli->lli_opendir_key);
683 			ll_file_data_put(fd);
685 		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
688 	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
689 		ptlrpc_req_finished(it->d.lustre.it_data);
690 		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING, asynchronously cancel
 * the lock (breaking the lease).  Unlike ll_md_blocking_ast() this does not
 * touch open handles — see the flag comments in ll_lease_open().
 * NOTE(review): LDLM_CB_CANCELING body is not visible in this chunk.
 */
696 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
697 			struct ldlm_lock_desc *desc, void *data, int flag)
700 	struct lustre_handle lockh;
703 	case LDLM_CB_BLOCKING:
704 		ldlm_lock2handle(lock, &lockh);
705 		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
707 			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
711 	case LDLM_CB_CANCELING:
719 * Acquire a lease and open the file.
/*
 * Acquire a lease on @inode and open the file (see comment above).
 * @fmode must be exactly FMODE_READ or FMODE_WRITE.  When @file is given
 * and is the sole opener, its existing open handle is passed to the MDT
 * (op_handle) so the lease attaches to the same owner.  Returns the new
 * obd_client_handle, or ERR_PTR on failure.
 */
721 struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
724 	struct lookup_intent it = { .it_op = IT_OPEN };
725 	struct ll_sb_info *sbi = ll_i2sbi(inode);
726 	struct md_op_data *op_data;
727 	struct ptlrpc_request *req;
728 	struct lustre_handle old_handle = { 0 };
729 	struct obd_client_handle *och = NULL;
733 	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
734 		return ERR_PTR(-EINVAL);
737 		struct ll_inode_info *lli = ll_i2info(inode);
738 		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
739 		struct obd_client_handle **och_p;
/* Requested mode must match the file's mode; exec opens can't lease. */
742 		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
743 			return ERR_PTR(-EPERM);
745 		/* Get the openhandle of the file */
747 		mutex_lock(&lli->lli_och_mutex);
748 		if (fd->fd_lease_och != NULL) {
749 			mutex_unlock(&lli->lli_och_mutex);
753 		if (fd->fd_och == NULL) {
754 			if (file->f_mode & FMODE_WRITE) {
755 				LASSERT(lli->lli_mds_write_och != NULL);
756 				och_p = &lli->lli_mds_write_och;
757 				och_usecount = &lli->lli_open_fd_write_count;
759 				LASSERT(lli->lli_mds_read_och != NULL);
760 				och_p = &lli->lli_mds_read_och;
761 				och_usecount = &lli->lli_open_fd_read_count;
763 			if (*och_usecount == 1) {
770 		mutex_unlock(&lli->lli_och_mutex);
771 		if (rc < 0) /* more than 1 opener */
774 		LASSERT(fd->fd_och != NULL);
775 		old_handle = fd->fd_och->och_fh;
780 		return ERR_PTR(-ENOMEM);
782 	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
783 					LUSTRE_OPC_ANY, NULL);
785 		GOTO(out, rc = PTR_ERR(op_data));
787 	/* To tell the MDT this openhandle is from the same owner */
788 	op_data->op_handle = old_handle;
790 	it.it_flags = fmode | MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
791 	rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
792 				ll_md_blocking_lease_ast,
793 	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
794 	 * it can be cancelled which may mislead applications that the lease is
796 	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
797 	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
798 	 * doesn't deal with openhandle, so normal openhandle will be leaked. */
799 				LDLM_FL_NO_LRU | LDLM_FL_EXCL);
800 	ll_finish_md_op_data(op_data);
802 		ptlrpc_req_finished(req);
803 	it_clear_disposition(&it, DISP_ENQ_COMPLETE);
806 		GOTO(out_release_it, rc);
808 	if (it_disposition(&it, DISP_LOOKUP_NEG))
809 		GOTO(out_release_it, rc = -ENOENT);
811 	rc = it_open_error(DISP_OPEN_OPEN, &it);
813 		GOTO(out_release_it, rc);
815 	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
816 	ll_och_fill(sbi->ll_md_exp, &it, och);
818 	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
819 		GOTO(out_close, rc = -EOPNOTSUPP);
821 	/* already get lease, handle lease lock */
822 	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
823 	if (it.d.lustre.it_lock_mode == 0 ||
824 	    it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
825 		/* open lock must return for lease */
826 		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
827 			PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
828 			it.d.lustre.it_lock_bits);
829 		GOTO(out_close, rc = -EPROTO);
832 	ll_intent_release(&it);
/* Error path: close the openhandle we just got and cancel the open lock. */
836 	rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och);
838 		CERROR("Close openhandle returned %d\n", rc2);
840 	/* cancel open lock */
841 	if (it.d.lustre.it_lock_mode != 0) {
842 		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
843 						it.d.lustre.it_lock_mode);
844 		it.d.lustre.it_lock_mode = 0;
847 	ll_intent_release(&it);
852 EXPORT_SYMBOL(ll_lease_open);
855 * Release lease and close the file.
856 * It will check if the lease has ever broken.
/*
 * Release the lease in @och and close the file (see comment above).
 * Reports via @lease_broken whether the lease lock had already been
 * cancelled (lease broken) before we released it.
 */
858 int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
861 	struct ldlm_lock *lock;
862 	bool cancelled = true;
865 	lock = ldlm_handle2lock(&och->och_lease_handle);
867 		lock_res_and_lock(lock);
868 		cancelled = ldlm_is_cancel(lock);
869 		unlock_res_and_lock(lock);
873 	CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
874 		PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Lease still intact: cancel it ourselves before closing the handle. */
877 		ldlm_cli_cancel(&och->och_lease_handle, 0);
878 	if (lease_broken != NULL)
879 		*lease_broken = cancelled;
881 	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och);
884 EXPORT_SYMBOL(ll_lease_close);
886 /* Fills the obdo with the attributes for the lsm */
/*
 * Fetch OST attributes for @lsm into @obdo via an async getattr request
 * set and wait for completion.  @ioepoch tags the request; @sync requests
 * a server-side lock (OBD_FL_SRVLOCK).  On success the o_valid mask is
 * narrowed to the OST-authoritative fields (size/blocks/times/dataversion).
 */
887 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
888 			  struct obd_capa *capa, struct obdo *obdo,
889 			  __u64 ioepoch, int sync)
891 	struct ptlrpc_request_set *set;
892 	struct obd_info oinfo = { { { 0 } } };
895 	LASSERT(lsm != NULL);
899 	oinfo.oi_oa->o_oi = lsm->lsm_oi;
900 	oinfo.oi_oa->o_mode = S_IFREG;
901 	oinfo.oi_oa->o_ioepoch = ioepoch;
902 	oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
903 			       OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
904 			       OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
905 			       OBD_MD_FLMTIME | OBD_MD_FLCTIME |
906 			       OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
907 			       OBD_MD_FLDATAVERSION;
908 	oinfo.oi_capa = capa;
/* @sync: take the attributes under a server-side lock. */
910 		oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
911 		oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
914 	set = ptlrpc_prep_set();
916 		CERROR("can't allocate ptlrpc set\n");
919 		rc = obd_getattr_async(exp, &oinfo, set);
921 			rc = ptlrpc_set_wait(set);
922 		ptlrpc_set_destroy(set);
925 		oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
926 					 OBD_MD_FLATIME | OBD_MD_FLMTIME |
927 					 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
928 					 OBD_MD_FLDATAVERSION);
933 * Performs the getattr on the inode and updates its fields.
934 * If @sync != 0, perform the getattr under the server-side lock.
/*
 * Getattr on the inode's stripe objects and refresh the inode's fields
 * from the result (see comment above: @sync takes the server-side lock).
 * Takes/releases an lsm reference and the MDS capability around the call.
 */
936 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
937 		     __u64 ioepoch, int sync)
939 	struct obd_capa *capa = ll_mdscapa_get(inode);
940 	struct lov_stripe_md *lsm;
943 	lsm = ccc_inode_lsm_get(inode);
944 	rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
945 			    capa, obdo, ioepoch, sync);
948 		struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
950 		obdo_refresh_inode(inode, obdo, obdo->o_valid);
951 		CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
952 		       " blksize %lu\n", POSTID(oi), i_size_read(inode),
953 		       (unsigned long long)inode->i_blocks,
954 		       (unsigned long)ll_inode_blksize(inode));
956 	ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (lli_lvb) with attributes obtained from
 * the OSTs via the cl_object layer, keeping the most recent of each, and
 * update the inode's size/blocks/times under the inode size lock.
 */
960 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
962 	struct ll_inode_info *lli = ll_i2info(inode);
963 	struct cl_object *obj = lli->lli_clob;
964 	struct cl_attr *attr = ccc_env_thread_attr(env);
968 	ll_inode_size_lock(inode);
969 	/* merge timestamps the most recently obtained from mds with
970 	   timestamps obtained from osts */
971 	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
972 	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
973 	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
974 	inode_init_lvb(inode, &lvb);
976 	cl_object_attr_lock(obj);
977 	rc = cl_object_attr_get(env, obj, attr);
978 	cl_object_attr_unlock(obj);
/* Keep whichever timestamp (MDS vs OST) is newer. */
981 		if (lvb.lvb_atime < attr->cat_atime)
982 			lvb.lvb_atime = attr->cat_atime;
983 		if (lvb.lvb_ctime < attr->cat_ctime)
984 			lvb.lvb_ctime = attr->cat_ctime;
985 		if (lvb.lvb_mtime < attr->cat_mtime)
986 			lvb.lvb_mtime = attr->cat_mtime;
988 		CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
989 		       PFID(&lli->lli_fid), attr->cat_size);
990 		cl_isize_write_nolock(inode, attr->cat_size);
992 		inode->i_blocks = attr->cat_blocks;
994 		LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
995 		LTIME_S(inode->i_atime) = lvb.lvb_atime;
996 		LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
998 	ll_inode_size_unlock(inode);
/*
 * Glimpse ioctl helper: getattr on @lsm's OST objects (no epoch, no
 * server lock) and copy size/blocks/times from the obdo into @st.
 */
1003 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1006 	struct obdo obdo = { 0 };
1009 	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1011 		st->st_size = obdo.o_size;
1012 		st->st_blocks = obdo.o_blocks;
1013 		st->st_mtime = obdo.o_mtime;
1014 		st->st_atime = obdo.o_atime;
1015 		st->st_ctime = obdo.o_ctime;
/*
 * Initialize @io for a read or write on @file: nonblock/append/sync flags
 * from f_flags, the cl_object to operate on, and the lock-request policy
 * (never for nolock files, mandatory for O_APPEND, "maybe" otherwise).
 */
1020 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1022 	struct inode *inode = file->f_dentry->d_inode;
1024 	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1026 		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1027 		io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1028 				      file->f_flags & O_DIRECT ||
1031 	io->ci_obj = ll_i2info(inode)->lli_clob;
1032 	io->ci_lockreq = CILR_MAYBE;
1033 	if (ll_file_nolock(file)) {
1034 		io->ci_lockreq = CILR_NEVER;
1035 		io->ci_no_srvlock = 1;
1036 	} else if (file->f_flags & O_APPEND) {
1037 		io->ci_lockreq = CILR_MANDATORY;
/*
 * Common IO entry point for read/write/sendfile/splice: set up a cl_io,
 * dispatch per-subtype arguments, serialize writes against truncate via
 * lli_write_mutex / lli_trunc_sem, run the cl_io loop, advance *ppos, and
 * account read/write byte statistics.  Restarts short IO when the cl_io
 * layer requests it and nothing was transferred yet.
 */
1042 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1043 		   struct file *file, enum cl_io_type iot,
1044 		   loff_t *ppos, size_t count)
1046 	struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1047 	struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
1052 	io = ccc_env_thread_io(env);
1053 	ll_io_init(io, file, iot == CIT_WRITE);
1055 	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1056 		struct vvp_io *vio = vvp_env_io(env);
1057 		struct ccc_io *cio = ccc_env_io(env);
1058 		int write_mutex_locked = 0;
1060 		cio->cui_fd  = LUSTRE_FPRIVATE(file);
1061 		vio->cui_io_subtype = args->via_io_subtype;
1063 		switch (vio->cui_io_subtype) {
/* Normal iovec-based IO: writes take lli_write_mutex (unless group-locked),
 * reads take the truncate semaphore shared. */
1065 			cio->cui_iov = args->u.normal.via_iov;
1066 			cio->cui_nrsegs = args->u.normal.via_nrsegs;
1067 			cio->cui_tot_nrsegs = cio->cui_nrsegs;
1068 			cio->cui_iocb = args->u.normal.via_iocb;
1069 			if ((iot == CIT_WRITE) &&
1070 			    !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1071 				if (mutex_lock_interruptible(&lli->
1073 					GOTO(out, result = -ERESTARTSYS);
1074 				write_mutex_locked = 1;
1075 			} else if (iot == CIT_READ) {
1076 				down_read(&lli->lli_trunc_sem);
1080 			vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
1081 			vio->u.sendfile.cui_target = args->u.sendfile.via_target;
1084 			vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1085 			vio->u.splice.cui_flags = args->u.splice.via_flags;
1088 			CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1091 		result = cl_io_loop(env, io);
1092 		if (write_mutex_locked)
1093 			mutex_unlock(&lli->lli_write_mutex);
1094 		else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1095 			up_read(&lli->lli_trunc_sem);
1097 		/* cl_io_rw_init() handled IO */
1098 		result = io->ci_result;
1101 	if (io->ci_nob > 0) {
1102 		result = io->ci_nob;
1103 		*ppos = io->u.ci_wr.wr.crw_pos;
1107 	cl_io_fini(env, io);
1108 	/* If any bit been read/written (result != 0), we just return
1109 	 * short read/write instead of restart io. */
1110 	if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1111 		CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1112 		       iot == CIT_READ ? "read" : "write",
1113 		       file->f_dentry->d_name.name, *ppos, count);
1114 		LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1118 	if (iot == CIT_READ) {
1120 			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1121 					   LPROC_LL_READ_BYTES, result);
1122 	} else if (iot == CIT_WRITE) {
1124 			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1125 					   LPROC_LL_WRITE_BYTES, result);
1126 			fd->fd_write_failed = false;
1127 		} else if (result != -ERESTARTSYS) {
1128 			fd->fd_write_failed = true;
1137 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count (see comment
 * above: copied from __generic_file_aio_write_nolock).  Truncates
 * *nr_segs at the first inaccessible segment; rejects negative lengths
 * and cumulative overflow with -EINVAL.
 */
1139 static int ll_file_get_iov_count(const struct iovec *iov,
1140 				 unsigned long *nr_segs, size_t *count)
1145 	for (seg = 0; seg < *nr_segs; seg++) {
1146 		const struct iovec *iv = &iov[seg];
1149 		 * If any segment has a negative length, or the cumulative
1150 		 * length ever wraps negative then return -EINVAL.
1153 		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1155 		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1160 		cnt -= iv->iov_len;   /* This segment is no good */
/*
 * ->aio_read(): validate the iovec, grab a cl environment, pack args into
 * the per-thread vvp_io_args, and run ll_file_io_generic(CIT_READ).
 */
1167 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1168 				unsigned long nr_segs, loff_t pos)
1171 	struct vvp_io_args *args;
1176 	result = ll_file_get_iov_count(iov, &nr_segs, &count);
1180 	env = cl_env_get(&refcheck);
1182 		return PTR_ERR(env);
1184 	args = vvp_env_args(env, IO_NORMAL);
1185 	args->u.normal.via_iov = (struct iovec *)iov;
1186 	args->u.normal.via_nrsegs = nr_segs;
1187 	args->u.normal.via_iocb = iocb;
1189 	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1190 				    &iocb->ki_pos, count);
1191 	cl_env_put(env, &refcheck);
/*
 * Synchronous ->read(): build a one-segment iovec and a sync kiocb in the
 * per-thread vvp env and delegate to ll_file_aio_read(), then propagate
 * the updated position back to *ppos.
 */
1195 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1199 	struct iovec *local_iov;
1200 	struct kiocb *kiocb;
1204 	env = cl_env_get(&refcheck);
1206 		return PTR_ERR(env);
1208 	local_iov = &vvp_env_info(env)->vti_local_iov;
1209 	kiocb = &vvp_env_info(env)->vti_kiocb;
1210 	local_iov->iov_base = (void __user *)buf;
1211 	local_iov->iov_len = count;
1212 	init_sync_kiocb(kiocb, file);
1213 	kiocb->ki_pos = *ppos;
1214 	kiocb->ki_nbytes = count;
1216 	result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1217 	*ppos = kiocb->ki_pos;
1219 	cl_env_put(env, &refcheck);
1224 * Write to a file (through the page cache).
/*
 * Async write entry point (aio_write file operation); mirror image of
 * ll_file_aio_read() but drives the generic IO path with CIT_WRITE.
 */
1226 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1227                                  unsigned long nr_segs, loff_t pos)
1230         struct vvp_io_args *args;
1235         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1239         env = cl_env_get(&refcheck);
1241                 return PTR_ERR(env);
1243         args = vvp_env_args(env, IO_NORMAL);
         /* cast away const: the args struct stores a mutable pointer */
1244         args->u.normal.via_iov = (struct iovec *)iov;
1245         args->u.normal.via_nrsegs = nr_segs;
1246         args->u.normal.via_iocb = iocb;
1248         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1249                                     &iocb->ki_pos, count);
1250         cl_env_put(env, &refcheck);
/*
 * Synchronous write file operation.  Same single-iovec/sync-kiocb setup as
 * ll_file_read(), delegating to ll_file_aio_write().
 */
1254 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1258         struct iovec *local_iov;
1259         struct kiocb *kiocb;
1263         env = cl_env_get(&refcheck);
1265                 return PTR_ERR(env);
1267         local_iov = &vvp_env_info(env)->vti_local_iov;
1268         kiocb = &vvp_env_info(env)->vti_kiocb;
1269         local_iov->iov_base = (void __user *)buf;
1270         local_iov->iov_len = count;
1271         init_sync_kiocb(kiocb, file);
1272         kiocb->ki_pos = *ppos;
1273         kiocb->ki_nbytes = count;
1275         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1276         *ppos = kiocb->ki_pos;
1278         cl_env_put(env, &refcheck);
1285 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read file operation: read file content (through the page cache)
 * into a pipe.  Uses the IO_SPLICE flavour of vvp_io_args and the same
 * generic CIT_READ path as the normal read entry points.
 */
1287 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1288                                    struct pipe_inode_info *pipe, size_t count,
1292         struct vvp_io_args *args;
1296         env = cl_env_get(&refcheck);
1298                 return PTR_ERR(env);
1300         args = vvp_env_args(env, IO_SPLICE);
1301         args->u.splice.via_pipe = pipe;
1302         args->u.splice.via_flags = flags;
1304         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1305         cl_env_put(env, &refcheck);
/*
 * Ask the data export to recreate the OST object(s) of this inode on a
 * given OST index.  Copies the current stripe md, marks the obdo with
 * OBD_FL_RECREATE_OBJS and calls obd_create() under the inode size lock.
 * Returns 0 or a negative errno (-ENOENT when the inode has no objects).
 */
1309 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1312         struct obd_export *exp = ll_i2dtexp(inode);
1313         struct obd_trans_info oti = { 0 };
1314         struct obdo *oa = NULL;
1317         struct lov_stripe_md *lsm = NULL, *lsm2;
1323         lsm = ccc_inode_lsm_get(inode);
1324         if (!lsm_has_objects(lsm))
1325                 GOTO(out, rc = -ENOENT);
1327         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1328                    (lsm->lsm_stripe_count));
1330         OBD_ALLOC_LARGE(lsm2, lsm_size);
1332                 GOTO(out, rc = -ENOMEM);
         /* o_nlink carries the target OST index for the recreate request */
1335         oa->o_nlink = ost_idx;
1336         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1337         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1338         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1339                                    OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1340         obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1341         memcpy(lsm2, lsm, lsm_size);
1342         ll_inode_size_lock(inode);
1343         rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1344         ll_inode_size_unlock(inode);
1346         OBD_FREE_LARGE(lsm2, lsm_size);
1349         ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ ioctl handler: copy a ll_recreate_obj request from
 * userspace and recreate the object identified by its id on the requested
 * OST index.  Requires CAP_SYS_ADMIN.
 */
1354 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1356         struct ll_recreate_obj ucreat;
1359         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1362         if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1366         ostid_set_seq_mdt0(&oi);
1367         ostid_set_id(&oi, ucreat.lrc_id);
1368         return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
/*
 * LL_IOC_RECREATE_FID ioctl handler: copy a lu_fid from userspace, convert
 * it to an ost_id and recreate the object; the OST index is derived from
 * the fid sequence.  Requires CAP_SYS_ADMIN.
 */
1371 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1377         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1380         if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1383         fid_to_ostid(&fid, &oi);
         /* OST index encoded in bits 16..31 of the fid sequence */
1384         ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1385         return ll_lov_recreate(inode, &oi, ost_idx);
/*
 * Set the striping EA on an inode by re-opening it with an IT_OPEN intent
 * carrying the lov_user_md.  Fails early (without error) if the inode
 * already has a stripe md.  The open handle obtained for the operation is
 * released before returning; runs under the inode size lock.
 */
1388 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1389                       int flags, struct lov_user_md *lum, int lum_size)
1391         struct lov_stripe_md *lsm = NULL;
1392         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1395         lsm = ccc_inode_lsm_get(inode);
1397                 ccc_inode_lsm_put(inode, lsm);
1398                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1403         ll_inode_size_lock(inode);
1404         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1407         rc = oit.d.lustre.it_status;
1409                 GOTO(out_req_free, rc);
1411         ll_release_openhandle(file->f_dentry, &oit);
1414         ll_inode_size_unlock(inode);
1415         ll_intent_release(&oit);
1416         ccc_inode_lsm_put(inode, lsm);
1419         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping info) for @filename via md_getattr_name().
 * On success *lmmp points into the reply buffer (caller keeps *request
 * alive and must finish it), *lmm_size is the EA size.  If the local
 * LOV_MAGIC differs from its little-endian form the EA is byte-swapped
 * to host endianness before being handed back for userspace.
 * Returns 0, -ENODATA when no EA is present, -EPROTO on a bad magic.
 */
1423 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1424                              struct lov_mds_md **lmmp, int *lmm_size,
1425                              struct ptlrpc_request **request)
1427         struct ll_sb_info *sbi = ll_i2sbi(inode);
1428         struct mdt_body *body;
1429         struct lov_mds_md *lmm = NULL;
1430         struct ptlrpc_request *req = NULL;
1431         struct md_op_data *op_data;
1434         rc = ll_get_max_mdsize(sbi, &lmmsize);
1438         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1439                                      strlen(filename), lmmsize,
1440                                      LUSTRE_OPC_ANY, NULL);
1441         if (IS_ERR(op_data))
1442                 return PTR_ERR(op_data);
1444         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1445         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1446         ll_finish_md_op_data(op_data);
1448                 CDEBUG(D_INFO, "md_getattr_name failed "
1449                        "on %s: rc %d\n", filename, rc);
1453         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1454         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1456         lmmsize = body->eadatasize;
1458         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1460                 GOTO(out, rc = -ENODATA);
1463         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1464         LASSERT(lmm != NULL);
1466         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1467             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1468                 GOTO(out, rc = -EPROTO);
1472          * This is coming from the MDS, so is probably in
1473          * little endian.  We convert it to host endian before
1474          * passing it to userspace.
1476         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1479                 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1480                 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1483                 /* if function called for directory - we should
1484                  * avoid swab not existent lsm objects */
1485                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1486                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
                         /* only regular files carry real object entries */
1487                         if (S_ISREG(body->mode))
1488                                 lustre_swab_lov_user_md_objects(
1489                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1491                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1492                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1493                         if (S_ISREG(body->mode))
1494                                 lustre_swab_lov_user_md_objects(
1495                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1502         *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA ioctl handler: copy a lov_user_md (plus one ost_data
 * entry) from userspace and apply it with MDS_OPEN_HAS_OBJS|FMODE_WRITE.
 * Requires CAP_SYS_ADMIN; the temporary buffer is freed on all paths.
 */
1507 static int ll_lov_setea(struct inode *inode, struct file *file,
1510         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1511         struct lov_user_md  *lump;
1512         int                  lum_size = sizeof(struct lov_user_md) +
1513                                         sizeof(struct lov_user_ost_data);
1516         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1519         OBD_ALLOC_LARGE(lump, lum_size);
1523         if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1524                 OBD_FREE_LARGE(lump, lum_size);
1528         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1530         OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl handler: read the user's lov_user_md (first
 * as v1, re-read as v3 when the magic says so), set the stripe EA, then
 * refresh the layout and echo the resulting stripe info back to the
 * caller's buffer via LL_IOC_LOV_GETSTRIPE.
 */
1534 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1537         struct lov_user_md_v3 lumv3;
1538         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1539         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1540         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1542         int flags = FMODE_WRITE;
1544         /* first try with v1 which is smaller than v3 */
1545         lum_size = sizeof(struct lov_user_md_v1);
1546         if (copy_from_user(lumv1, lumv1p, lum_size))
1549         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1550                 lum_size = sizeof(struct lov_user_md_v3);
1551                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1555         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1557                 struct lov_stripe_md *lsm;
         /* zero the user's stripe_count so GETSTRIPE refills the struct */
1560                 put_user(0, &lumv1p->lmm_stripe_count);
1562                 ll_layout_refresh(inode, &gen);
1563                 lsm = ccc_inode_lsm_get(inode);
1564                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1565                                    0, lsm, (void *)arg);
1566                 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE ioctl handler: grab the inode's stripe md and let
 * the data export iocontrol copy the striping description to userspace.
 */
1571 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1573         struct lov_stripe_md *lsm;
1576         lsm = ccc_inode_lsm_get(inode);
1578                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1580         ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: acquire a group (GID-based) extent lock for
 * this file descriptor.  Rejects a second grouplock on the same fd, takes
 * the cl-level grouplock outside lli_lock, then re-checks under the lock
 * for a racing acquirer before publishing it in the file data.
 */
1584 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1586         struct ll_inode_info   *lli = ll_i2info(inode);
1587         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1588         struct ccc_grouplock    grouplock;
1591         if (ll_file_nolock(file))
1594         spin_lock(&lli->lli_lock);
1595         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1596                 CWARN("group lock already existed with gid %lu\n",
1597                       fd->fd_grouplock.cg_gid);
1598                 spin_unlock(&lli->lli_lock);
1601         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1602         spin_unlock(&lli->lli_lock);
         /* cl_get_grouplock() may block, so it runs outside lli_lock */
1604         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1605                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1609         spin_lock(&lli->lli_lock);
1610         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1611                 spin_unlock(&lli->lli_lock);
1612                 CERROR("another thread just won the race\n");
1613                 cl_put_grouplock(&grouplock);
1617         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1618         fd->fd_grouplock = grouplock;
1619         spin_unlock(&lli->lli_lock);
1621         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the fd's group lock if it is held
 * and its gid matches @arg.  The grouplock is copied out and cleared under
 * lli_lock; the actual cl_put_grouplock() happens after dropping it.
 */
1625 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1627         struct ll_inode_info   *lli = ll_i2info(inode);
1628         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1629         struct ccc_grouplock    grouplock;
1631         spin_lock(&lli->lli_lock);
1632         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1633                 spin_unlock(&lli->lli_lock);
1634                 CWARN("no group lock held\n");
1637         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1639         if (fd->fd_grouplock.cg_gid != arg) {
1640                 CWARN("group lock %lu doesn't match current id %lu\n",
1641                       arg, fd->fd_grouplock.cg_gid);
1642                 spin_unlock(&lli->lli_lock);
1646         grouplock = fd->fd_grouplock;
1647         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1648         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1649         spin_unlock(&lli->lli_lock);
1651         cl_put_grouplock(&grouplock);
1652         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1657 * Close inode open handle
1659 * \param dentry [in] dentry which contains the inode
1660 * \param it [in,out] intent which contains open info and result
1663 * \retval <0 failure
/*
 * Close the MDS open handle carried by a lookup intent (see the \param
 * block above).  No-op for the filesystem root or when the intent holds
 * no DISP_OPEN_OPEN disposition; otherwise fills an obd_client_handle
 * from the intent and closes it, releasing any enqueued open reference.
 */
1665 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1667         struct inode *inode = dentry->d_inode;
1668         struct obd_client_handle *och;
1673         /* Root ? Do nothing. */
1674         if (dentry->d_inode->i_sb->s_root == dentry)
1677         /* No open handle to close? Move away */
1678         if (!it_disposition(it, DISP_OPEN_OPEN))
1681         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1683         OBD_ALLOC(och, sizeof(*och));
1685                 GOTO(out, rc = -ENOMEM);
1687         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1689         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1692         /* this one is in place of ll_file_open */
1693         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1694                 ptlrpc_req_finished(it->d.lustre.it_data);
1695                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1701 * Get size for inode for which FIEMAP mapping is requested.
1702 * Make the FIEMAP get_info call and returns the result.
/*
 * Run the FIEMAP request against the data export: validate the flags,
 * optionally flush dirty pages (FIEMAP_FLAG_SYNC), and pass the fiemap
 * plus object id down via obd_get_info(KEY_FIEMAP).  Striped files need
 * FIEMAP_FLAG_DEVICE_ORDER or the extents cannot be interpreted.
 */
1704 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1707         struct obd_export *exp = ll_i2dtexp(inode);
1708         struct lov_stripe_md *lsm = NULL;
1709         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1710         int vallen = num_bytes;
1713         /* Checks for fiemap flags */
1714         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
          /* report back the unsupported flags we stripped */
1715                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1719         /* Check for FIEMAP_FLAG_SYNC */
1720         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1721                 rc = filemap_fdatawrite(inode->i_mapping);
1726         lsm = ccc_inode_lsm_get(inode);
1730         /* If the stripe_count > 1 and the application does not understand
1731          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1733         if (lsm->lsm_stripe_count > 1 &&
1734             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1735                 GOTO(out, rc = -EOPNOTSUPP);
1737         fm_key.oa.o_oi = lsm->lsm_oi;
1738         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1740         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1741         obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1742         /* If filesize is 0, then there would be no objects for mapping */
1743         if (fm_key.oa.o_size == 0) {
1744                 fiemap->fm_mapped_extents = 0;
1748         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1750         rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1753                 CERROR("obd_get_info failed: rc = %d\n", rc);
1756         ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH ioctl handler: read a getinfo_fid2path header from
 * userspace to learn the path buffer length, allocate a reply of that
 * size, resolve the fid via the MD export iocontrol and copy the result
 * back.  Needs CAP_DAC_READ_SEARCH unless LL_SBI_USER_FID2PATH is set.
 */
1760 int ll_fid2path(struct inode *inode, void *arg)
1762         struct obd_export *exp = ll_i2mdexp(inode);
1763         struct getinfo_fid2path *gfout, *gfin;
1766         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1767             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1770         /* Need to get the buflen */
1771         OBD_ALLOC_PTR(gfin);
1774         if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1779         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1780         OBD_ALLOC(gfout, outsize);
1781         if (gfout == NULL) {
1785         memcpy(gfout, gfin, sizeof(*gfout));
1788         /* Call mdc_iocontrol */
1789         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1793         if (copy_to_user(arg, gfout, outsize))
1797         OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP ioctl handler: size the fiemap buffer from the user's
 * fm_extent_count, copy the request (and first extent, used for continued
 * mappings) in, run ll_do_fiemap() and copy the mapped extents back out.
 */
1801 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1803         struct ll_user_fiemap *fiemap_s;
1804         size_t num_bytes, ret_bytes;
1805         unsigned int extent_count;
1808         /* Get the extent count so we can calculate the size of
1809          * required fiemap buffer */
1810         if (get_user(extent_count,
1811             &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1813         num_bytes = sizeof(*fiemap_s) + (extent_count *
1814                                          sizeof(struct ll_fiemap_extent));
1816         OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1817         if (fiemap_s == NULL)
1820         /* get the fiemap value */
1821         if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1823                 GOTO(error, rc = -EFAULT);
1825         /* If fm_extent_count is non-zero, read the first extent since
1826          * it is used to calculate end_offset and device from previous
1829                 if (copy_from_user(&fiemap_s->fm_extents[0],
1830                     (char __user *)arg + sizeof(*fiemap_s),
1831                     sizeof(struct ll_fiemap_extent)))
1832                         GOTO(error, rc = -EFAULT);
1835         rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1839         ret_bytes = sizeof(struct ll_user_fiemap);
1841         if (extent_count != 0)
1842                 ret_bytes += (fiemap_s->fm_mapped_extents *
1843                                  sizeof(struct ll_fiemap_extent));
1845         if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1849         OBD_FREE_LARGE(fiemap_s, num_bytes);
1854 * Read the data_version for inode.
1856 * This value is computed using stripe object version on OST.
1857 * Version is computed using server side locking.
1859 * @param extent_lock Take extent lock. Not needed if a process is already
1860 * holding the OST object group locks.
/*
 * Compute the inode's data version from its stripe objects via
 * ll_lsm_getattr() (see the comment block above for @extent_lock
 * semantics).  A stripe-less inode yields version 0.
 */
1862 int ll_data_version(struct inode *inode, __u64 *data_version,
1865         struct lov_stripe_md    *lsm = NULL;
1866         struct ll_sb_info       *sbi = ll_i2sbi(inode);
1867         struct obdo             *obdo = NULL;
1870         /* If no stripe, we consider version is 0. */
1871         lsm = ccc_inode_lsm_get(inode);
1872         if (!lsm_has_objects(lsm)) {
1874                 CDEBUG(D_INODE, "No object for inode\n");
1878         OBD_ALLOC_PTR(obdo);
1880                 GOTO(out, rc = -ENOMEM);
1882         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1884                 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1887                         *data_version = obdo->o_data_version;
1892         ccc_inode_lsm_put(inode, lsm);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped,
 * saved mtime/atime attrs for restoration, and the data-version check
 * requests/values (dv fields live in the elided lines).
 */
1896 struct ll_swap_stack {
1897         struct iattr             ia1, ia2;
1899         struct inode            *inode1, *inode2;
1900         bool                     check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the layouts
 * of two regular files on the same filesystem.  Steps: permission and
 * same-sb checks; order the pair by fid (swapping the per-file state to
 * match); optionally take group locks to flush dirty cache; save
 * mtime/atime if they must survive the swap; verify data versions if
 * requested; send the swap to the MDT via obd_iocontrol; drop the group
 * locks and restore the saved timestamps under i_mutex.
 */
1903 static int ll_swap_layouts(struct file *file1, struct file *file2,
1904                            struct lustre_swap_layouts *lsl)
1906         struct mdc_swap_layouts  msl;
1907         struct md_op_data       *op_data;
1910         struct ll_swap_stack    *llss = NULL;
1913         OBD_ALLOC_PTR(llss);
1917         llss->inode1 = file1->f_dentry->d_inode;
1918         llss->inode2 = file2->f_dentry->d_inode;
1920         if (!S_ISREG(llss->inode2->i_mode))
1921                 GOTO(free, rc = -EINVAL);
1923         if (inode_permission(llss->inode1, MAY_WRITE) ||
1924             inode_permission(llss->inode2, MAY_WRITE))
1925                 GOTO(free, rc = -EPERM);
1927         if (llss->inode2->i_sb != llss->inode1->i_sb)
1928                 GOTO(free, rc = -EXDEV);
1930         /* we use 2 bool because it is easier to swap than 2 bits */
1931         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1932                 llss->check_dv1 = true;
1934         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1935                 llss->check_dv2 = true;
1937         /* we cannot use lsl->sl_dvX directly because we may swap them */
1938         llss->dv1 = lsl->sl_dv1;
1939         llss->dv2 = lsl->sl_dv2;
1941         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1942         if (rc == 0) /* same file, done! */
         /* canonical ordering avoids lock-ordering deadlocks between peers */
1945         if (rc < 0) { /* sequentialize it */
1946                 swap(llss->inode1, llss->inode2);
1948                 swap(llss->dv1, llss->dv2);
1949                 swap(llss->check_dv1, llss->check_dv2);
1953         if (gid != 0) { /* application asks to flush dirty cache */
1954                 rc = ll_get_grouplock(llss->inode1, file1, gid);
1958                 rc = ll_get_grouplock(llss->inode2, file2, gid);
1960                         ll_put_grouplock(llss->inode1, file1, gid);
1965         /* to be able to restore mtime and atime after swap
1966          * we need to first save them */
1968             (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1969                 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1970                 llss->ia1.ia_atime = llss->inode1->i_atime;
1971                 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1972                 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1973                 llss->ia2.ia_atime = llss->inode2->i_atime;
1974                 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1977         /* ultimate check, before swaping the layouts we check if
1978          * dataversion has changed (if requested) */
1979         if (llss->check_dv1) {
1980                 rc = ll_data_version(llss->inode1, &dv, 0);
1983                 if (dv != llss->dv1)
1984                         GOTO(putgl, rc = -EAGAIN);
1987         if (llss->check_dv2) {
1988                 rc = ll_data_version(llss->inode2, &dv, 0);
1991                 if (dv != llss->dv2)
1992                         GOTO(putgl, rc = -EAGAIN);
1995         /* struct md_op_data is used to send the swap args to the mdt
1996          * only flags is missing, so we use struct mdc_swap_layouts
1997          * through the md_op_data->op_data */
1998         /* flags from user space have to be converted before they are send to
1999          * server, no flag is sent today, they are only used on the client */
2002         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2003                                      0, LUSTRE_OPC_ANY, &msl);
2004         if (IS_ERR(op_data))
2005                 GOTO(free, rc = PTR_ERR(op_data));
2007         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2008                            sizeof(*op_data), op_data, NULL);
2009         ll_finish_md_op_data(op_data);
2013                 ll_put_grouplock(llss->inode2, file2, gid);
2014                 ll_put_grouplock(llss->inode1, file1, gid);
2017         /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2021         /* clear useless flags */
2022         if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2023                 llss->ia1.ia_valid &= ~ATTR_MTIME;
2024                 llss->ia2.ia_valid &= ~ATTR_MTIME;
2027         if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2028                 llss->ia1.ia_valid &= ~ATTR_ATIME;
2029                 llss->ia2.ia_valid &= ~ATTR_ATIME;
2032         /* update time if requested */
         /* note: ia2 attrs are applied to inode1 (and vice versa) because
          * the layouts — and with them the data timestamps — were swapped */
2034         if (llss->ia2.ia_valid != 0) {
2035                 mutex_lock(&llss->inode1->i_mutex);
2036                 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2037                 mutex_unlock(&llss->inode1->i_mutex);
2040         if (llss->ia1.ia_valid != 0) {
2043                 mutex_lock(&llss->inode2->i_mutex);
2044                 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2045                 mutex_unlock(&llss->inode2->i_mutex);
/*
 * unlocked_ioctl entry point for regular files: dispatch Lustre-specific
 * ioctls (striping, layout swap, fiemap, group locks, fid/path mapping,
 * data version, HSM state/actions, leases, ...) and fall through to the
 * data export's obd_iocontrol for anything unrecognized.
 */
2057 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2059         struct inode            *inode = file->f_dentry->d_inode;
2060         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
2063         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2064                inode->i_generation, inode, cmd);
2065         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2067         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2068         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2072         case LL_IOC_GETFLAGS:
2073                 /* Get the current value of the file flags */
2074                 return put_user(fd->fd_flags, (int *)arg);
2075         case LL_IOC_SETFLAGS:
2076         case LL_IOC_CLRFLAGS:
2077                 /* Set or clear specific file flags */
2078                 /* XXX This probably needs checks to ensure the flags are
2079                  * not abused, and to handle any flag side effects.
2081                 if (get_user(flags, (int *) arg))
2084                 if (cmd == LL_IOC_SETFLAGS) {
2085                         if ((flags & LL_FILE_IGNORE_LOCK) &&
2086                             !(file->f_flags & O_DIRECT)) {
2087                                 CERROR("%s: unable to disable locking on "
2088                                        "non-O_DIRECT file\n", current->comm);
2092                         fd->fd_flags |= flags;
2094                         fd->fd_flags &= ~flags;
2097         case LL_IOC_LOV_SETSTRIPE:
2098                 return ll_lov_setstripe(inode, file, arg);
2099         case LL_IOC_LOV_SETEA:
2100                 return ll_lov_setea(inode, file, arg);
2101         case LL_IOC_LOV_SWAP_LAYOUTS: {
2103                 struct lustre_swap_layouts lsl;
2105                 if (copy_from_user(&lsl, (char *)arg,
2106                                    sizeof(struct lustre_swap_layouts)))
          /* both fds must be writable for a layout swap */
2109                 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2112                 file2 = fget(lsl.sl_fd);
2117                 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2118                         rc = ll_swap_layouts(file, file2, &lsl);
2122         case LL_IOC_LOV_GETSTRIPE:
2123                 return ll_lov_getstripe(inode, arg);
2124         case LL_IOC_RECREATE_OBJ:
2125                 return ll_lov_recreate_obj(inode, arg);
2126         case LL_IOC_RECREATE_FID:
2127                 return ll_lov_recreate_fid(inode, arg);
2128         case FSFILT_IOC_FIEMAP:
2129                 return ll_ioctl_fiemap(inode, arg);
2130         case FSFILT_IOC_GETFLAGS:
2131         case FSFILT_IOC_SETFLAGS:
2132                 return ll_iocontrol(inode, file, cmd, arg);
2133         case FSFILT_IOC_GETVERSION_OLD:
2134         case FSFILT_IOC_GETVERSION:
2135                 return put_user(inode->i_generation, (int *)arg);
2136         case LL_IOC_GROUP_LOCK:
2137                 return ll_get_grouplock(inode, file, arg);
2138         case LL_IOC_GROUP_UNLOCK:
2139                 return ll_put_grouplock(inode, file, arg);
2140         case IOC_OBD_STATFS:
2141                 return ll_obd_statfs(inode, (void *)arg);
2143         /* We need to special case any other ioctls we want to handle,
2144          * to send them to the MDS/OST as appropriate and to properly
2145          * network encode the arg field.
2146         case FSFILT_IOC_SETVERSION_OLD:
2147         case FSFILT_IOC_SETVERSION:
2149         case LL_IOC_FLUSHCTX:
2150                 return ll_flush_ctx(inode);
2151         case LL_IOC_PATH2FID: {
2152                 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2153                                  sizeof(struct lu_fid)))
2158         case OBD_IOC_FID2PATH:
2159                 return ll_fid2path(inode, (void *)arg);
2160         case LL_IOC_DATA_VERSION: {
2161                 struct ioc_data_version idv;
2164                 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2167                 rc = ll_data_version(inode, &idv.idv_version,
2168                                 !(idv.idv_flags & LL_DV_NOFLUSH));
2170                 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2176         case LL_IOC_GET_MDTIDX: {
2179                 mdtidx = ll_get_mdt_idx(inode);
2183                 if (put_user((int)mdtidx, (int*)arg))
2188         case OBD_IOC_GETDTNAME:
2189         case OBD_IOC_GETMDNAME:
2190                 return ll_get_obd_name(inode, cmd, arg);
2191         case LL_IOC_HSM_STATE_GET: {
2192                 struct md_op_data       *op_data;
2193                 struct hsm_user_state   *hus;
2200                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2201                                              LUSTRE_OPC_ANY, hus);
2202                 if (IS_ERR(op_data)) {
2204                         return PTR_ERR(op_data);
2207                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2210                 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2213                 ll_finish_md_op_data(op_data);
2217         case LL_IOC_HSM_STATE_SET: {
2218                 struct md_op_data       *op_data;
2219                 struct hsm_state_set    *hss;
2225                 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2230                 /* Non-root users are forbidden to set or clear flags which are
2231                  * NOT defined in HSM_USER_MASK. */
2232                 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2233                     && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2238                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2239                                              LUSTRE_OPC_ANY, hss);
2240                 if (IS_ERR(op_data)) {
2242                         return PTR_ERR(op_data);
2245                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2248                 ll_finish_md_op_data(op_data);
2253         case LL_IOC_HSM_ACTION: {
2254                 struct md_op_data               *op_data;
2255                 struct hsm_current_action       *hca;
2262                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2263                                              LUSTRE_OPC_ANY, hca);
2264                 if (IS_ERR(op_data)) {
2266                         return PTR_ERR(op_data);
2269                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2272                 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2275                 ll_finish_md_op_data(op_data);
2279         case LL_IOC_SET_LEASE: {
2280                 struct ll_inode_info *lli = ll_i2info(inode);
2281                 struct obd_client_handle *och = NULL;
2287                         if (!(file->f_mode & FMODE_WRITE))
2292                         if (!(file->f_mode & FMODE_READ))
2297                         mutex_lock(&lli->lli_och_mutex);
2298                         if (fd->fd_lease_och != NULL) {
2299                                 och = fd->fd_lease_och;
2300                                 fd->fd_lease_och = NULL;
2302                         mutex_unlock(&lli->lli_och_mutex);
2305                                 mode = och->och_flags &
2306                                        (FMODE_READ|FMODE_WRITE);
2307                                 rc = ll_lease_close(och, inode, &lease_broken);
2308                                 if (rc == 0 && lease_broken)
2314                         /* return the type of lease or error */
2315                         return rc < 0 ? rc : (int)mode;
2320                 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2322                 /* apply for lease */
2323                 och = ll_lease_open(inode, file, mode);
2325                         return PTR_ERR(och);
2328                 mutex_lock(&lli->lli_och_mutex);
2329                 if (fd->fd_lease_och == NULL) {
2330                         fd->fd_lease_och = och;
2333                 mutex_unlock(&lli->lli_och_mutex);
2335                         /* impossible now that only excl is supported for now */
2336                         ll_lease_close(och, inode, &lease_broken);
2341         case LL_IOC_GET_LEASE: {
2342                 struct ll_inode_info *lli = ll_i2info(inode);
2343                 struct ldlm_lock *lock = NULL;
2346                 mutex_lock(&lli->lli_och_mutex);
2347                 if (fd->fd_lease_och != NULL) {
2348                         struct obd_client_handle *och = fd->fd_lease_och;
2350                         lock = ldlm_handle2lock(&och->och_lease_handle);
2352                                 lock_res_and_lock(lock);
2353                                 if (!ldlm_is_cancel(lock))
2354                                         rc = och->och_flags &
2355                                              (FMODE_READ | FMODE_WRITE);
2356                                 unlock_res_and_lock(lock);
2357                                 ldlm_lock_put(lock);
2360                 mutex_unlock(&lli->lli_och_mutex);
2368                      ll_iocontrol_call(inode, file, cmd, arg, &err))
2371                 return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek file operation.  For SEEK_END/SEEK_HOLE/SEEK_DATA the file size
 * must be current, so glimpse it from the OSTs first; then let the
 * generic helper do the actual seek bounded by the client's max file size.
 */
2378 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2380         struct inode *inode = file->f_dentry->d_inode;
2381         loff_t retval, eof = 0;
2383         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2384                            (origin == SEEK_CUR) ? file->f_pos : 0);
2385         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2386                inode->i_ino, inode->i_generation, inode, retval, retval,
2388         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2390         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2391                 retval = ll_glimpse_size(inode);
2394                 eof = i_size_read(inode);
2397         retval = generic_file_llseek_size(file, offset, origin,
2398                                           ll_file_maxbytes(inode), eof);
/*
 * flush file operation (called on close(2)).  Collects async writeback
 * errors recorded on the inode and its cl object and reports -EIO once;
 * errors already reported through fd_write_failed are not repeated.
 */
2402 int ll_flush(struct file *file, fl_owner_t id)
2404         struct inode *inode = file->f_dentry->d_inode;
2405         struct ll_inode_info *lli = ll_i2info(inode);
2406         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2409         LASSERT(!S_ISDIR(inode->i_mode));
2411         /* catch async errors that were recorded back when async writeback
2412          * failed for pages in this mapping. */
2413         rc = lli->lli_async_rc;
2414         lli->lli_async_rc = 0;
2415         err = lov_read_and_clear_async_rc(lli->lli_clob);
2419         /* The application has been told write failure already.
2420          * Do not report failure again. */
2421         if (fd->fd_write_failed)
2423         return rc ? -EIO : 0;
2427 * Called to make sure a portion of file has been written out.
2428 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2430 * Return how many pages have been written.
/*
 * Sync a byte range of the inode through the cl_io CIT_FSYNC machinery
 * (see the comment block above).  Returns the number of pages written on
 * success, or a negative errno; @mode selects none/local/discard/all.
 */
2432 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2433                        enum cl_fsync_mode mode, int ignore_layout)
2435         struct cl_env_nest nest;
2438         struct obd_capa *capa = NULL;
2439         struct cl_fsync_io *fio;
2442         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2443             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2446         env = cl_env_nested_get(&nest);
2448                 return PTR_ERR(env);
2450         capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2452         io = ccc_env_thread_io(env);
2453         io->ci_obj = cl_i2info(inode)->lli_clob;
2454         io->ci_ignore_layout = ignore_layout;
2456         /* initialize parameters for sync */
2457         fio = &io->u.ci_fsync;
2458         fio->fi_capa = capa;
2459         fio->fi_start = start;
2461         fio->fi_fid = ll_inode2fid(inode);
2462         fio->fi_mode = mode;
2463         fio->fi_nr_written = 0;
2465         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2466                 result = cl_io_loop(env, io);
2468                 result = io->ci_result;
2470                 result = fio->fi_nr_written;
2471         cl_io_fini(env, io);
2472         cl_env_nested_put(&nest, env);
2480 * When dentry is provided (the 'else' case), *file->f_dentry may be
2481 * null and dentry must be used directly rather than pulled from
2482 * *file->f_dentry as is done otherwise.
/*
 * fsync file operation: write and wait the page cache range, collect any
 * recorded async writeback errors, sync metadata to the MDS via md_sync(),
 * and for datasync on regular files push dirty data to the OSTs through
 * cl_sync_file_range(), updating fd_write_failed accordingly.
 */
2485 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2487         struct dentry *dentry = file->f_dentry;
2488         struct inode *inode = dentry->d_inode;
2489         struct ll_inode_info *lli = ll_i2info(inode);
2490         struct ptlrpc_request *req;
2491         struct obd_capa *oc;
2494         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2495                inode->i_generation, inode);
2496         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2498         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2499         mutex_lock(&inode->i_mutex);
2501         /* catch async errors that were recorded back when async writeback
2502          * failed for pages in this mapping. */
2503         if (!S_ISDIR(inode->i_mode)) {
2504                 err = lli->lli_async_rc;
2505                 lli->lli_async_rc = 0;
2508                 err = lov_read_and_clear_async_rc(lli->lli_clob);
2513         oc = ll_mdscapa_get(inode);
2514         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2520                 ptlrpc_req_finished(req);
2522         if (datasync && S_ISREG(inode->i_mode)) {
2523                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2525                 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2527                 if (rc == 0 && err < 0)
2530                         fd->fd_write_failed = true;
2532                         fd->fd_write_failed = false;
2535         mutex_unlock(&inode->i_mutex);
/*
 * lock file operation: translate a VFS flock/POSIX lock request into an
 * LDLM_FLOCK enqueue to the MDS.  LCK_NL encodes unlock (see the long
 * comment inline); on success the lock is mirrored into the local VFS
 * lock tables, and a failed local install triggers a compensating unlock
 * enqueue so client and server state stay consistent.
 */
2539 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2541         struct inode *inode = file->f_dentry->d_inode;
2542         struct ll_sb_info *sbi = ll_i2sbi(inode);
2543         struct ldlm_enqueue_info einfo = {
2544                 .ei_type        = LDLM_FLOCK,
2545                 .ei_cb_cp       = ldlm_flock_completion_ast,
2546                 .ei_cbdata      = file_lock,
2548         struct md_op_data *op_data;
2549         struct lustre_handle lockh = {0};
2550         ldlm_policy_data_t flock = {{0}};
2555         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2556                inode->i_ino, file_lock);
2558         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2560         if (file_lock->fl_flags & FL_FLOCK) {
2561                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2562                 /* flocks are whole-file locks */
2563                 flock.l_flock.end = OFFSET_MAX;
2564                 /* For flocks owner is determined by the local file desctiptor*/
2565                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2566         } else if (file_lock->fl_flags & FL_POSIX) {
2567                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2568                 flock.l_flock.start = file_lock->fl_start;
2569                 flock.l_flock.end = file_lock->fl_end;
2573         flock.l_flock.pid = file_lock->fl_pid;
2575         /* Somewhat ugly workaround for svc lockd.
2576          * lockd installs custom fl_lmops->lm_compare_owner that checks
2577          * for the fl_owner to be the same (which it always is on local node
2578          * I guess between lockd processes) and then compares pid.
2579          * As such we assign pid to the owner field to make it all work,
2580          * conflict with normal locks is unlikely since pid space and
2581          * pointer space for current->files are not intersecting */
2582         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2583                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2585         switch (file_lock->fl_type) {
2587                 einfo.ei_mode = LCK_PR;
2590                 /* An unlock request may or may not have any relation to
2591                  * existing locks so we may not be able to pass a lock handle
2592                  * via a normal ldlm_lock_cancel() request. The request may even
2593                  * unlock a byte range in the middle of an existing lock. In
2594                  * order to process an unlock request we need all of the same
2595                  * information that is given with a normal read or write record
2596                  * lock request. To avoid creating another ldlm unlock (cancel)
2597                  * message we'll treat a LCK_NL flock request as an unlock. */
2598                 einfo.ei_mode = LCK_NL;
2601                 einfo.ei_mode = LCK_PW;
2604                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2605                         file_lock->fl_type);
2620                 flags = LDLM_FL_BLOCK_NOWAIT;
2626                 flags = LDLM_FL_TEST_LOCK;
2627                 /* Save the old mode so that if the mode in the lock changes we
2628                  * can decrement the appropriate reader or writer refcount. */
2629                 file_lock->fl_type = einfo.ei_mode;
2632                 CERROR("unknown fcntl lock command: %d\n", cmd);
2636         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2637                                      LUSTRE_OPC_ANY, NULL);
2638         if (IS_ERR(op_data))
2639                 return PTR_ERR(op_data);
2641         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2642                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2643                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2645         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2646                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
         /* mirror the server-granted lock into the local VFS lock tables */
2648         if ((file_lock->fl_flags & FL_FLOCK) &&
2649             (rc == 0 || file_lock->fl_type == F_UNLCK))
2650                 rc2  = flock_lock_file_wait(file, file_lock);
2651         if ((file_lock->fl_flags & FL_POSIX) &&
2652             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2653             !(flags & LDLM_FL_TEST_LOCK))
2654                 rc2  = posix_lock_file_wait(file, file_lock);
2656         if (rc2 && file_lock->fl_type != F_UNLCK) {
2657                 einfo.ei_mode = LCK_NL;
2658                 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2659                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2663         ll_finish_md_op_data(op_data);
2668 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2674 * test if some locks matching bits and l_req_mode are acquired
2675 * - bits can be in different locks
2676 * - if found clear the common lock bits in *bits
2677 * - the bits not found, are kept in *bits
2679 * \param bits [IN] searched lock bits [IN]
2680 * \param l_req_mode [IN] searched lock mode
2681 * \retval boolean, true iff all bits are found
2683 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2685 struct lustre_handle lockh;
2686 ldlm_policy_data_t policy;
2687 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2688 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2696 fid = &ll_i2info(inode)->lli_fid;
2697 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2698 ldlm_lockname[mode]);
2700 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2701 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2702 policy.l_inodebits.bits = *bits & (1 << i);
2703 if (policy.l_inodebits.bits == 0)
2706 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2707 &policy, mode, &lockh)) {
2708 struct ldlm_lock *lock;
2710 lock = ldlm_handle2lock(&lockh);
2713 ~(lock->l_policy_data.l_inodebits.bits);
2714 LDLM_LOCK_PUT(lock);
2716 *bits &= ~policy.l_inodebits.bits;
2723 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2724 struct lustre_handle *lockh, __u64 flags)
2726 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2730 fid = &ll_i2info(inode)->lli_fid;
2731 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2733 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2734 fid, LDLM_IBITS, &policy,
2735 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2739 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2741 /* Already unlinked. Just update nlink and return success */
2742 if (rc == -ENOENT) {
2744 /* This path cannot be hit for regular files unless in
2745 * case of obscure races, so no need to validate size.
2747 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2749 } else if (rc != 0) {
2750 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2751 ll_get_fsname(inode->i_sb, NULL, 0),
2752 PFID(ll_inode2fid(inode)), rc);
2758 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2761 struct inode *inode = dentry->d_inode;
2762 struct ptlrpc_request *req = NULL;
2763 struct obd_export *exp;
2766 LASSERT(inode != NULL);
2768 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2769 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2771 exp = ll_i2mdexp(inode);
2773 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2774 * But under CMD case, it caused some lock issues, should be fixed
2775 * with new CMD ibits lock. See bug 12718 */
2776 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2777 struct lookup_intent oit = { .it_op = IT_GETATTR };
2778 struct md_op_data *op_data;
2780 if (ibits == MDS_INODELOCK_LOOKUP)
2781 oit.it_op = IT_LOOKUP;
2783 /* Call getattr by fid, so do not provide name at all. */
2784 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2785 dentry->d_inode, NULL, 0, 0,
2786 LUSTRE_OPC_ANY, NULL);
2787 if (IS_ERR(op_data))
2788 return PTR_ERR(op_data);
2790 oit.it_create_mode |= M_CHECK_STALE;
2791 rc = md_intent_lock(exp, op_data, NULL, 0,
2792 /* we are not interested in name
2795 ll_md_blocking_ast, 0);
2796 ll_finish_md_op_data(op_data);
2797 oit.it_create_mode &= ~M_CHECK_STALE;
2799 rc = ll_inode_revalidate_fini(inode, rc);
2803 rc = ll_revalidate_it_finish(req, &oit, dentry);
2805 ll_intent_release(&oit);
2809 /* Unlinked? Unhash dentry, so it is not picked up later by
2810 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2811 here to preserve get_cwd functionality on 2.6.
2813 if (!dentry->d_inode->i_nlink)
2814 d_lustre_invalidate(dentry, 0);
2816 ll_lookup_finish_locks(&oit, dentry);
2817 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2818 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2819 obd_valid valid = OBD_MD_FLGETATTR;
2820 struct md_op_data *op_data;
2823 if (S_ISREG(inode->i_mode)) {
2824 rc = ll_get_max_mdsize(sbi, &ealen);
2827 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2830 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2831 0, ealen, LUSTRE_OPC_ANY,
2833 if (IS_ERR(op_data))
2834 return PTR_ERR(op_data);
2836 op_data->op_valid = valid;
2837 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2838 * capa for this inode. Because we only keep capas of dirs
2840 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2841 ll_finish_md_op_data(op_data);
2843 rc = ll_inode_revalidate_fini(inode, rc);
2847 rc = ll_prep_inode(&inode, req, NULL, NULL);
2850 ptlrpc_req_finished(req);
2854 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2857 struct inode *inode = dentry->d_inode;
2860 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2864 /* if object isn't regular file, don't validate size */
2865 if (!S_ISREG(inode->i_mode)) {
2866 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2867 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2868 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2870 /* In case of restore, the MDT has the right size and has
2871 * already send it back without granting the layout lock,
2872 * inode is up-to-date so glimpse is useless.
2873 * Also to glimpse we need the layout, in case of a running
2874 * restore the MDT holds the layout lock so the glimpse will
2875 * block up to the end of restore (getattr will block)
2877 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
2878 rc = ll_glimpse_size(inode);
2883 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2884 struct lookup_intent *it, struct kstat *stat)
2886 struct inode *inode = de->d_inode;
2887 struct ll_sb_info *sbi = ll_i2sbi(inode);
2888 struct ll_inode_info *lli = ll_i2info(inode);
2891 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2892 MDS_INODELOCK_LOOKUP);
2893 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2898 stat->dev = inode->i_sb->s_dev;
2899 if (ll_need_32bit_api(sbi))
2900 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2902 stat->ino = inode->i_ino;
2903 stat->mode = inode->i_mode;
2904 stat->nlink = inode->i_nlink;
2905 stat->uid = inode->i_uid;
2906 stat->gid = inode->i_gid;
2907 stat->rdev = inode->i_rdev;
2908 stat->atime = inode->i_atime;
2909 stat->mtime = inode->i_mtime;
2910 stat->ctime = inode->i_ctime;
2911 stat->blksize = 1 << inode->i_blkbits;
2913 stat->size = i_size_read(inode);
2914 stat->blocks = inode->i_blocks;
2918 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2920 struct lookup_intent it = { .it_op = IT_GETATTR };
2922 return ll_getattr_it(mnt, de, &it, stat);
2925 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2926 __u64 start, __u64 len)
2930 struct ll_user_fiemap *fiemap;
2931 unsigned int extent_count = fieinfo->fi_extents_max;
2933 num_bytes = sizeof(*fiemap) + (extent_count *
2934 sizeof(struct ll_fiemap_extent));
2935 OBD_ALLOC_LARGE(fiemap, num_bytes);
2940 fiemap->fm_flags = fieinfo->fi_flags;
2941 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2942 fiemap->fm_start = start;
2943 fiemap->fm_length = len;
2944 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2945 sizeof(struct ll_fiemap_extent));
2947 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2949 fieinfo->fi_flags = fiemap->fm_flags;
2950 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2951 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2952 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2954 OBD_FREE_LARGE(fiemap, num_bytes);
2958 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2960 struct ll_inode_info *lli = ll_i2info(inode);
2961 struct posix_acl *acl = NULL;
2963 spin_lock(&lli->lli_lock);
2964 /* VFS' acl_permission_check->check_acl will release the refcount */
2965 acl = posix_acl_dup(lli->lli_posix_acl);
2966 spin_unlock(&lli->lli_lock);
2972 int ll_inode_permission(struct inode *inode, int mask)
2976 #ifdef MAY_NOT_BLOCK
2977 if (mask & MAY_NOT_BLOCK)
2981 /* as root inode are NOT getting validated in lookup operation,
2982 * need to do it before permission check. */
2984 if (inode == inode->i_sb->s_root->d_inode) {
2985 struct lookup_intent it = { .it_op = IT_LOOKUP };
2987 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2988 MDS_INODELOCK_LOOKUP);
2993 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2994 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2996 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2997 return lustre_check_remote_perm(inode, mask);
2999 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3000 rc = generic_permission(inode, mask);
3005 /* -o localflock - only provides locally consistent flock locks */
3006 struct file_operations ll_file_operations = {
3007 .read = ll_file_read,
3008 .aio_read = ll_file_aio_read,
3009 .write = ll_file_write,
3010 .aio_write = ll_file_aio_write,
3011 .unlocked_ioctl = ll_file_ioctl,
3012 .open = ll_file_open,
3013 .release = ll_file_release,
3014 .mmap = ll_file_mmap,
3015 .llseek = ll_file_seek,
3016 .splice_read = ll_file_splice_read,
3021 struct file_operations ll_file_operations_flock = {
3022 .read = ll_file_read,
3023 .aio_read = ll_file_aio_read,
3024 .write = ll_file_write,
3025 .aio_write = ll_file_aio_write,
3026 .unlocked_ioctl = ll_file_ioctl,
3027 .open = ll_file_open,
3028 .release = ll_file_release,
3029 .mmap = ll_file_mmap,
3030 .llseek = ll_file_seek,
3031 .splice_read = ll_file_splice_read,
3034 .flock = ll_file_flock,
3035 .lock = ll_file_flock
3038 /* These are for -o noflock - to return ENOSYS on flock calls */
3039 struct file_operations ll_file_operations_noflock = {
3040 .read = ll_file_read,
3041 .aio_read = ll_file_aio_read,
3042 .write = ll_file_write,
3043 .aio_write = ll_file_aio_write,
3044 .unlocked_ioctl = ll_file_ioctl,
3045 .open = ll_file_open,
3046 .release = ll_file_release,
3047 .mmap = ll_file_mmap,
3048 .llseek = ll_file_seek,
3049 .splice_read = ll_file_splice_read,
3052 .flock = ll_file_noflock,
3053 .lock = ll_file_noflock
3056 struct inode_operations ll_file_inode_operations = {
3057 .setattr = ll_setattr,
3058 .getattr = ll_getattr,
3059 .permission = ll_inode_permission,
3060 .setxattr = ll_setxattr,
3061 .getxattr = ll_getxattr,
3062 .listxattr = ll_listxattr,
3063 .removexattr = ll_removexattr,
3064 .fiemap = ll_fiemap,
3065 .get_acl = ll_get_acl,
3068 /* dynamic ioctl number support routins */
3069 static struct llioc_ctl_data {
3070 struct rw_semaphore ioc_sem;
3071 struct list_head ioc_head;
3073 __RWSEM_INITIALIZER(llioc.ioc_sem),
3074 LIST_HEAD_INIT(llioc.ioc_head)
3079 struct list_head iocd_list;
3080 unsigned int iocd_size;
3081 llioc_callback_t iocd_cb;
3082 unsigned int iocd_count;
3083 unsigned int iocd_cmd[0];
3086 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3089 struct llioc_data *in_data = NULL;
3091 if (cb == NULL || cmd == NULL ||
3092 count > LLIOC_MAX_CMD || count < 0)
3095 size = sizeof(*in_data) + count * sizeof(unsigned int);
3096 OBD_ALLOC(in_data, size);
3097 if (in_data == NULL)
3100 memset(in_data, 0, sizeof(*in_data));
3101 in_data->iocd_size = size;
3102 in_data->iocd_cb = cb;
3103 in_data->iocd_count = count;
3104 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3106 down_write(&llioc.ioc_sem);
3107 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3108 up_write(&llioc.ioc_sem);
3113 void ll_iocontrol_unregister(void *magic)
3115 struct llioc_data *tmp;
3120 down_write(&llioc.ioc_sem);
3121 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3123 unsigned int size = tmp->iocd_size;
3125 list_del(&tmp->iocd_list);
3126 up_write(&llioc.ioc_sem);
3128 OBD_FREE(tmp, size);
3132 up_write(&llioc.ioc_sem);
3134 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3137 EXPORT_SYMBOL(ll_iocontrol_register);
3138 EXPORT_SYMBOL(ll_iocontrol_unregister);
3140 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3141 unsigned int cmd, unsigned long arg, int *rcp)
3143 enum llioc_iter ret = LLIOC_CONT;
3144 struct llioc_data *data;
3145 int rc = -EINVAL, i;
3147 down_read(&llioc.ioc_sem);
3148 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3149 for (i = 0; i < data->iocd_count; i++) {
3150 if (cmd != data->iocd_cmd[i])
3153 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3157 if (ret == LLIOC_STOP)
3160 up_read(&llioc.ioc_sem);
3167 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3169 struct ll_inode_info *lli = ll_i2info(inode);
3170 struct cl_env_nest nest;
3174 if (lli->lli_clob == NULL)
3177 env = cl_env_nested_get(&nest);
3179 return PTR_ERR(env);
3181 result = cl_conf_set(env, lli->lli_clob, conf);
3182 cl_env_nested_put(&nest, env);
3184 if (conf->coc_opc == OBJECT_CONF_SET) {
3185 struct ldlm_lock *lock = conf->coc_lock;
3187 LASSERT(lock != NULL);
3188 LASSERT(ldlm_has_layout(lock));
3190 /* it can only be allowed to match after layout is
3191 * applied to inode otherwise false layout would be
3192 * seen. Applying layout shoud happen before dropping
3193 * the intent lock. */
3194 ldlm_lock_allow_match(lock);
3200 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3201 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3204 struct ll_sb_info *sbi = ll_i2sbi(inode);
3205 struct obd_capa *oc;
3206 struct ptlrpc_request *req;
3207 struct mdt_body *body;
3213 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3214 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3215 lock->l_lvb_data, lock->l_lvb_len);
3217 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3220 /* if layout lock was granted right away, the layout is returned
3221 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3222 * blocked and then granted via completion ast, we have to fetch
3223 * layout here. Please note that we can't use the LVB buffer in
3224 * completion AST because it doesn't have a large enough buffer */
3225 oc = ll_mdscapa_get(inode);
3226 rc = ll_get_max_mdsize(sbi, &lmmsize);
3228 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3229 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3235 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3236 if (body == NULL || body->eadatasize > lmmsize)
3237 GOTO(out, rc = -EPROTO);
3239 lmmsize = body->eadatasize;
3240 if (lmmsize == 0) /* empty layout */
3243 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3245 GOTO(out, rc = -EFAULT);
3247 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3248 if (lvbdata == NULL)
3249 GOTO(out, rc = -ENOMEM);
3251 memcpy(lvbdata, lmm, lmmsize);
3252 lock_res_and_lock(lock);
3253 if (lock->l_lvb_data != NULL)
3254 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3256 lock->l_lvb_data = lvbdata;
3257 lock->l_lvb_len = lmmsize;
3258 unlock_res_and_lock(lock);
3261 ptlrpc_req_finished(req);
3266 * Apply the layout to the inode. Layout lock is held and will be released
3269 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3270 struct inode *inode, __u32 *gen, bool reconf)
3272 struct ll_inode_info *lli = ll_i2info(inode);
3273 struct ll_sb_info *sbi = ll_i2sbi(inode);
3274 struct ldlm_lock *lock;
3275 struct lustre_md md = { NULL };
3276 struct cl_object_conf conf;
3279 bool wait_layout = false;
3281 LASSERT(lustre_handle_is_used(lockh));
3283 lock = ldlm_handle2lock(lockh);
3284 LASSERT(lock != NULL);
3285 LASSERT(ldlm_has_layout(lock));
3287 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3288 inode, PFID(&lli->lli_fid), reconf);
3290 /* in case this is a caching lock and reinstate with new inode */
3291 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3293 lock_res_and_lock(lock);
3294 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3295 unlock_res_and_lock(lock);
3296 /* checking lvb_ready is racy but this is okay. The worst case is
3297 * that multi processes may configure the file on the same time. */
3298 if (lvb_ready || !reconf) {
3301 /* layout_gen must be valid if layout lock is not
3302 * cancelled and stripe has already set */
3303 *gen = lli->lli_layout_gen;
3309 rc = ll_layout_fetch(inode, lock);
3313 /* for layout lock, lmm is returned in lock's lvb.
3314 * lvb_data is immutable if the lock is held so it's safe to access it
3315 * without res lock. See the description in ldlm_lock_decref_internal()
3316 * for the condition to free lvb_data of layout lock */
3317 if (lock->l_lvb_data != NULL) {
3318 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3319 lock->l_lvb_data, lock->l_lvb_len);
3321 *gen = LL_LAYOUT_GEN_EMPTY;
3323 *gen = md.lsm->lsm_layout_gen;
3326 CERROR("%s: file "DFID" unpackmd error: %d\n",
3327 ll_get_fsname(inode->i_sb, NULL, 0),
3328 PFID(&lli->lli_fid), rc);
3334 /* set layout to file. Unlikely this will fail as old layout was
3335 * surely eliminated */
3336 memset(&conf, 0, sizeof(conf));
3337 conf.coc_opc = OBJECT_CONF_SET;
3338 conf.coc_inode = inode;
3339 conf.coc_lock = lock;
3340 conf.u.coc_md = &md;
3341 rc = ll_layout_conf(inode, &conf);
3344 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3346 /* refresh layout failed, need to wait */
3347 wait_layout = rc == -EBUSY;
3350 LDLM_LOCK_PUT(lock);
3351 ldlm_lock_decref(lockh, mode);
3353 /* wait for IO to complete if it's still being used. */
3355 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3356 ll_get_fsname(inode->i_sb, NULL, 0),
3357 inode, PFID(&lli->lli_fid));
3359 memset(&conf, 0, sizeof(conf));
3360 conf.coc_opc = OBJECT_CONF_WAIT;
3361 conf.coc_inode = inode;
3362 rc = ll_layout_conf(inode, &conf);
3366 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3367 PFID(&lli->lli_fid), rc);
3373 * This function checks if there exists a LAYOUT lock on the client side,
3374 * or enqueues it if it doesn't have one in cache.
3376 * This function will not hold layout lock so it may be revoked any time after
3377 * this function returns. Any operations depend on layout should be redone
3380 * This function should be called before lov_io_init() to get an uptodate
3381 * layout version, the caller should save the version number and after IO
3382 * is finished, this function should be called again to verify that layout
3383 * is not changed during IO time.
3385 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3387 struct ll_inode_info *lli = ll_i2info(inode);
3388 struct ll_sb_info *sbi = ll_i2sbi(inode);
3389 struct md_op_data *op_data;
3390 struct lookup_intent it;
3391 struct lustre_handle lockh;
3393 struct ldlm_enqueue_info einfo = {
3394 .ei_type = LDLM_IBITS,
3396 .ei_cb_bl = ll_md_blocking_ast,
3397 .ei_cb_cp = ldlm_completion_ast,
3401 *gen = lli->lli_layout_gen;
3402 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3406 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3407 LASSERT(S_ISREG(inode->i_mode));
3409 /* mostly layout lock is caching on the local side, so try to match
3410 * it before grabbing layout lock mutex. */
3411 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3412 if (mode != 0) { /* hit cached lock */
3413 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3417 /* better hold lli_layout_mutex to try again otherwise
3418 * it will have starvation problem. */
3421 /* take layout lock mutex to enqueue layout lock exclusively. */
3422 mutex_lock(&lli->lli_layout_mutex);
3425 /* try again. Maybe somebody else has done this. */
3426 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3427 if (mode != 0) { /* hit cached lock */
3428 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3432 mutex_unlock(&lli->lli_layout_mutex);
3436 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3437 0, 0, LUSTRE_OPC_ANY, NULL);
3438 if (IS_ERR(op_data)) {
3439 mutex_unlock(&lli->lli_layout_mutex);
3440 return PTR_ERR(op_data);
3443 /* have to enqueue one */
3444 memset(&it, 0, sizeof(it));
3445 it.it_op = IT_LAYOUT;
3446 lockh.cookie = 0ULL;
3448 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3449 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3450 PFID(&lli->lli_fid));
3452 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3454 if (it.d.lustre.it_data != NULL)
3455 ptlrpc_req_finished(it.d.lustre.it_data);
3456 it.d.lustre.it_data = NULL;
3458 ll_finish_md_op_data(op_data);
3460 mode = it.d.lustre.it_lock_mode;
3461 it.d.lustre.it_lock_mode = 0;
3462 ll_intent_drop_lock(&it);
3465 /* set lock data in case this is a new lock */
3466 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3467 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3471 mutex_unlock(&lli->lli_layout_mutex);
3477 * This function send a restore request to the MDT
3479 int ll_layout_restore(struct inode *inode)
3481 struct hsm_user_request *hur;
3484 len = sizeof(struct hsm_user_request) +
3485 sizeof(struct hsm_user_item);
3486 OBD_ALLOC(hur, len);
3490 hur->hur_request.hr_action = HUA_RESTORE;
3491 hur->hur_request.hr_archive_id = 0;
3492 hur->hur_request.hr_flags = 0;
3493 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3494 sizeof(hur->hur_user_item[0].hui_fid));
3495 hur->hur_user_item[0].hui_extent.length = -1;
3496 hur->hur_request.hr_itemcount = 1;
3497 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,