4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
60 fd->fd_write_failed = false;
64 static void ll_file_data_put(struct ll_file_data *fd)
67 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
70 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
71 struct lustre_handle *fh)
73 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
74 op_data->op_attr.ia_mode = inode->i_mode;
75 op_data->op_attr.ia_atime = inode->i_atime;
76 op_data->op_attr.ia_mtime = inode->i_mtime;
77 op_data->op_attr.ia_ctime = inode->i_ctime;
78 op_data->op_attr.ia_size = i_size_read(inode);
79 op_data->op_attr_blocks = inode->i_blocks;
80 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
81 ll_inode_to_ext_flags(inode->i_flags);
82 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
84 op_data->op_handle = *fh;
85 op_data->op_capa1 = ll_mdscapa_get(inode);
87 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
88 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
 * ll_prepare_close(): mark which attributes are valid for the CLOSE RPC
 * and pack them into @op_data via ll_pack_inode2opdata().
 * NOTE(review): this region is a damaged extraction -- the embedded line
 * numbering jumps (101, 103-104, 107, 109-110) show that braces and the
 * control flow between the FMODE_WRITE test, the SOM test and the
 * ll_ioepoch_close() call were dropped, so the branch structure below is
 * incomplete; compare against upstream before relying on it.
 */
92 * Closes the IO epoch and packs all the attributes into @op_data for
95 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
96 struct obd_client_handle *och)
98 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
99 ATTR_MTIME | ATTR_MTIME_SET |
100 ATTR_CTIME | ATTR_CTIME_SET;
/* Read-only handles carry no size/epoch state to the MDS. */
102 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the client
 * sends size/blocks itself; otherwise the IO epoch is closed. */
105 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
106 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
108 ll_ioepoch_close(inode, op_data, &och, 0);
111 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
112 ll_prep_md_op_data(op_data, inode, NULL, NULL,
113 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * ll_close_inode_openhandle(): send an MDS close (or, when @data_version
 * is non-NULL, an HSM release) for the open handle @och, handle the
 * optional Size-on-MDS update, clear the DATA_MODIFIED flag on success,
 * destroy OST objects listed in the close reply, and tear down the
 * client-side handle state.
 * NOTE(review): damaged extraction -- declarations (rc, epoch_close,
 * inode parameter line), several if-conditions, GOTO targets, braces and
 * the RETURN are missing (see the jumps in the embedded numbering), so
 * the error paths below cannot be fully followed from this view.
 */
116 static int ll_close_inode_openhandle(struct obd_export *md_exp,
118 struct obd_client_handle *och,
119 const __u64 *data_version)
121 struct obd_export *exp = ll_i2mdexp(inode);
122 struct md_op_data *op_data;
123 struct ptlrpc_request *req = NULL;
124 struct obd_device *obd = class_exp2obd(exp);
130 * XXX: in case of LMV, is this correct to access
133 CERROR("Invalid MDC connection handle "LPX64"\n",
134 ll_i2mdexp(inode)->exp_handle.h_cookie);
138 OBD_ALLOC_PTR(op_data);
140 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
142 ll_prepare_close(inode, op_data, och);
143 if (data_version != NULL) {
144 /* Pass in data_version implies release. */
145 op_data->op_bias |= MDS_HSM_RELEASE;
146 op_data->op_data_version = *data_version;
147 op_data->op_lease_handle = och->och_lease_handle;
148 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
150 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
151 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* Branch below appears to handle an MDS request for a Size-on-MDS
 * setattr after close -- condition line missing from this view. */
153 /* This close must have the epoch closed. */
154 LASSERT(epoch_close);
155 /* MDS has instructed us to obtain Size-on-MDS attribute from
156 * OSTs and send setattr back to MDS. */
157 rc = ll_som_update(inode, op_data);
159 CERROR("inode %lu mdc Size-on-MDS update failed: "
160 "rc = %d\n", inode->i_ino, rc);
164 CERROR("inode %lu mdc close failed: rc = %d\n",
168 /* DATA_MODIFIED flag was successfully sent on close, cancel data
169 * modification flag. */
170 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
171 struct ll_inode_info *lli = ll_i2info(inode);
/* lli_lock protects lli_flags updates. */
173 spin_lock(&lli->lli_lock);
174 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
175 spin_unlock(&lli->lli_lock);
179 rc = ll_objects_destroy(req, inode);
181 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* For HSM release, verify the server actually released the file. */
184 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
185 struct mdt_body *body;
186 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 if (!(body->valid & OBD_MD_FLRELEASED))
191 ll_finish_md_op_data(op_data);
/* With SOM and an epoch still open on a written regular file, defer the
 * final DONE_WRITING to the dedicated queue. */
194 if (exp_connect_som(exp) && !epoch_close &&
195 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
196 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
198 md_clear_open_replay_data(md_exp, och);
199 /* Free @och if it is not waiting for DONE_WRITING. */
200 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
203 if (req) /* This is close request */
204 ptlrpc_req_finished(req);
/*
 * ll_md_real_close(): drop the inode's cached MDS open handle for the
 * given open mode (write/exec/read) once no local users remain, sending
 * the actual close via ll_close_inode_openhandle().
 * NOTE(review): damaged extraction -- the och_usecount/rc declarations,
 * else-branch braces, the *och_p swap under the mutex and the RETURN are
 * missing from this view.
 */
208 int ll_md_real_close(struct inode *inode, int flags)
210 struct ll_inode_info *lli = ll_i2info(inode);
211 struct obd_client_handle **och_p;
212 struct obd_client_handle *och;
/* Select the cached handle slot and use-count matching the close mode. */
216 if (flags & FMODE_WRITE) {
217 och_p = &lli->lli_mds_write_och;
218 och_usecount = &lli->lli_open_fd_write_count;
219 } else if (flags & FMODE_EXEC) {
220 och_p = &lli->lli_mds_exec_och;
221 och_usecount = &lli->lli_open_fd_exec_count;
/* Final branch: must be a read open. */
223 LASSERT(flags & FMODE_READ);
224 och_p = &lli->lli_mds_read_och;
225 och_usecount = &lli->lli_open_fd_read_count;
/* lli_och_mutex serializes open-handle setup/teardown on this inode. */
228 mutex_lock(&lli->lli_och_mutex);
229 if (*och_usecount) { /* There are still users of this handle, so
231 mutex_unlock(&lli->lli_och_mutex);
236 mutex_unlock(&lli->lli_och_mutex);
238 if (och) { /* There might be a race and somebody have freed this och
240 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * ll_md_close(): per-file-descriptor close processing -- release any
 * group lock, clean up a leftover lease, close fd-private and (when no
 * matching cached OPEN DLM lock exists) the shared MDS open handles,
 * then free the ll_file_data.
 * NOTE(review): damaged extraction -- rc/lease_broken/lockmode
 * declarations, some else-branches, closing braces and the RETURN are
 * missing.  Also note the inner "struct inode *inode" shadows the
 * parameter -- presumably harmless here, but worth confirming upstream.
 */
247 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
250 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
251 struct ll_inode_info *lli = ll_i2info(inode);
254 /* clear group lock, if present */
255 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
256 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
258 if (fd->fd_lease_och != NULL) {
261 /* Usually the lease is not released when the
262 * application crashed, we need to release here. */
263 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
264 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
265 PFID(&lli->lli_fid), rc, lease_broken);
267 fd->fd_lease_och = NULL;
270 if (fd->fd_och != NULL) {
271 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
276 /* Let's see if we have good enough OPEN lock on the file and if
277 we can skip talking to MDS */
278 if (file->f_dentry->d_inode) { /* Can this ever be false? */
280 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
281 struct lustre_handle lockh;
282 struct inode *inode = file->f_dentry->d_inode;
283 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop this fd's reference on the per-mode open count. */
285 mutex_lock(&lli->lli_och_mutex);
286 if (fd->fd_omode & FMODE_WRITE) {
288 LASSERT(lli->lli_open_fd_write_count);
289 lli->lli_open_fd_write_count--;
290 } else if (fd->fd_omode & FMODE_EXEC) {
292 LASSERT(lli->lli_open_fd_exec_count);
293 lli->lli_open_fd_exec_count--;
296 LASSERT(lli->lli_open_fd_read_count);
297 lli->lli_open_fd_read_count--;
299 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock matches: the MDS must be told about the close. */
301 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
302 LDLM_IBITS, &policy, lockmode,
304 rc = ll_md_real_close(file->f_dentry->d_inode,
308 CERROR("Releasing a file %p with negative dentry %p. Name %s",
309 file, file->f_dentry, file->f_dentry->d_name.name);
313 LUSTRE_FPRIVATE(file) = NULL;
314 ll_file_data_put(fd);
315 ll_capa_close(inode);
320 /* While this returns an error code, fput() the caller does not, so we need
321 * to make every effort to clean up all of our state here. Also, applications
322 * rarely check close errors and even if an error is returned they will not
323 * re-try the close call.
/*
 * ll_file_release(): VFS ->release() -- final cleanup when the last
 * reference on @file is dropped: remote-ACL bookkeeping, statahead stop
 * for directories, async-rc harvesting for regular files, then
 * ll_md_close().
 * NOTE(review): damaged extraction -- rc declaration, several closing
 * braces, the NULL check on fd after LUSTRE_FPRIVATE(), GOTO targets and
 * the RETURN are missing from this view.
 */
325 int ll_file_release(struct inode *inode, struct file *file)
327 struct ll_file_data *fd;
328 struct ll_sb_info *sbi = ll_i2sbi(inode);
329 struct ll_inode_info *lli = ll_i2info(inode);
332 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
333 inode->i_generation, inode);
/* Remote-client ACL state is tracked only on the root inode. */
335 #ifdef CONFIG_FS_POSIX_ACL
336 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
337 inode == inode->i_sb->s_root->d_inode) {
338 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
341 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
342 fd->fd_flags &= ~LL_FILE_RMTACL;
343 rct_del(&sbi->ll_rct, current_pid());
344 et_search_free(&sbi->ll_et, current_pid());
/* Root releases are not counted in the RELEASE stat. */
349 if (inode->i_sb->s_root != file->f_dentry)
350 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
351 fd = LUSTRE_FPRIVATE(file);
354 /* The last ref on @file, maybe not the owner pid of statahead.
355 * Different processes can open the same dir, "ll_opendir_key" means:
356 * it is me that should stop the statahead thread. */
357 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
358 lli->lli_opendir_pid != 0)
359 ll_stop_statahead(inode, lli->lli_opendir_key)
361 if (inode->i_sb->s_root == file->f_dentry) {
362 LUSTRE_FPRIVATE(file) = NULL;
363 ll_file_data_put(fd);
/* Harvest any asynchronous write errors recorded on the cl_object. */
367 if (!S_ISDIR(inode->i_mode)) {
368 lov_read_and_clear_async_rc(lli->lli_clob);
369 lli->lli_async_rc = 0;
372 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Fault-injection hook: optionally dump the debug log on release. */
374 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
375 libcfs_debug_dumplog();
/*
 * ll_intent_file_open(): perform an intent-based open against the MDS
 * for @file (used e.g. on the NFSD path), requesting an OPEN lock when
 * no stripe info is being set, then install the resulting inode/lock
 * data.
 * NOTE(review): damaged extraction -- rc declaration, IS_ERR check on
 * op_data, the opc argument line to ll_prep_md_op_data(), GOTO targets,
 * braces and the RETURN are missing from this view.
 */
380 static int ll_intent_file_open(struct file *file, void *lmm,
381 int lmmsize, struct lookup_intent *itp)
383 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
384 struct dentry *parent = file->f_dentry->d_parent;
385 const char *name = file->f_dentry->d_name.name;
386 const int len = file->f_dentry->d_name.len;
387 struct md_op_data *op_data;
388 struct ptlrpc_request *req;
389 __u32 opc = LUSTRE_OPC_ANY;
395 /* Usually we come here only for NFSD, and we want open lock.
396 But we can also get here with pre 2.6.15 patchless kernels, and in
397 that case that lock is also ok */
398 /* We can also get here if there was cached open handle in revalidate_it
399 * but it disappeared while we were getting from there to ll_file_open.
400 * But this means this file was closed and immediately opened which
401 * makes a good candidate for using OPEN lock */
402 /* If lmmsize & lmm are not 0, we are just setting stripe info
403 * parameters. No need for the open lock */
404 if (lmm == NULL && lmmsize == 0) {
405 itp->it_flags |= MDS_OPEN_LOCK;
406 if (itp->it_flags & FMODE_WRITE)
407 opc = LUSTRE_OPC_CREATE;
410 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
411 file->f_dentry->d_inode, name, len,
414 return PTR_ERR(op_data);
/* The target inode is known, so open by FID rather than by name. */
416 itp->it_flags |= MDS_OPEN_BY_FID;
417 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
418 0 /*unused */, &req, ll_md_blocking_ast, 0);
419 ll_finish_md_op_data(op_data);
421 /* reason for keep own exit path - don`t flood log
422 * with messages with -ESTALE errors.
424 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
425 it_open_error(DISP_OPEN_OPEN, itp))
427 ll_release_openhandle(file->f_dentry, itp);
431 if (it_disposition(itp, DISP_LOOKUP_NEG))
432 GOTO(out, rc = -ENOENT);
434 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
435 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
436 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Install the inode from the reply and attach the granted lock. */
440 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
441 if (!rc && itp->d.lustre.it_lock_mode)
442 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
/* Cleanup path: drop the enqueue reply reference and the intent lock. */
446 ptlrpc_req_finished(itp->d.lustre.it_data);
447 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
448 ll_intent_drop_lock(itp);
454 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
455 * not believe attributes if a few ioepoch holders exist. Attributes for
456 * previous ioepoch if new one is opened are also skipped by MDS.
458 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
460 if (ioepoch && lli->lli_ioepoch != ioepoch) {
461 lli->lli_ioepoch = ioepoch;
462 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
463 ioepoch, PFID(&lli->lli_fid));
467 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
468 struct obd_client_handle *och)
470 struct ptlrpc_request *req = it->d.lustre.it_data;
471 struct mdt_body *body;
473 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
474 och->och_fh = body->handle;
475 och->och_fid = body->fid1;
476 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
477 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
478 och->och_flags = it->it_flags;
480 return md_set_open_replay_data(md_exp, och, req);
/*
 * ll_local_open(): attach the per-fd data @fd to @file and, when an
 * obd_client_handle @och is supplied, fill it from the open reply and
 * record the new IO epoch.
 * NOTE(review): damaged extraction -- the "if (och)" scaffolding, rc
 * declaration/check, closing braces and the return are missing (embedded
 * lines 488, 490-493, 496-497, 499-501, 504-505, 509-511 dropped).
 */
483 int ll_local_open(struct file *file, struct lookup_intent *it,
484 struct ll_file_data *fd, struct obd_client_handle *och)
486 struct inode *inode = file->f_dentry->d_inode;
487 struct ll_inode_info *lli = ll_i2info(inode);
/* The fd must not already be installed on this file. */
489 LASSERT(!LUSTRE_FPRIVATE(file));
494 struct ptlrpc_request *req = it->d.lustre.it_data;
495 struct mdt_body *body;
498 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
502 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
503 ll_ioepoch_open(lli, body->ioepoch);
506 LUSTRE_FPRIVATE(file) = fd;
507 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the effective open mode for later close accounting. */
508 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
512 /* Open a file, and (for the very first open) create objects on the OSTs at
513 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
514 * creation or open until ll_lov_setstripe() ioctl is called.
516 * If we already have the stripe MD locally then we don't request it in
517 * md_open(), by passing a lmm_size = 0.
519 * It is up to the application to ensure no other processes open this file
520 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
521 * used. We might be able to avoid races of that sort by getting lli_open_sem
522 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
523 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * ll_file_open(): VFS ->open() -- allocate per-fd data, set up statahead
 * ownership for directories, reuse or establish the per-mode cached MDS
 * open handle (performing an intent open when no cached handle or intent
 * disposition exists), and finally handle delayed object creation for
 * O_LOV_DELAY_CREATE.
 * NOTE(review): damaged extraction -- many structural lines are missing
 * (NULL checks after ll_file_data_get()/OBD_ALLOC(), restart/retry
 * labels, och_usecount increments, else-branches, out_och_free/
 * out_openerr label lines, closing braces and the RETURN), so the
 * control flow below is incomplete; treat the visible ordering as
 * indicative only.
 */
525 int ll_file_open(struct inode *inode, struct file *file)
527 struct ll_inode_info *lli = ll_i2info(inode);
528 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
529 .it_flags = file->f_flags };
530 struct obd_client_handle **och_p = NULL;
531 __u64 *och_usecount = NULL;
532 struct ll_file_data *fd;
533 int rc = 0, opendir_set = 0;
535 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
536 inode->i_generation, inode, file->f_flags);
538 it = file->private_data; /* XXX: compat macro */
539 file->private_data = NULL; /* prevent ll_local_open assertion */
541 fd = ll_file_data_get();
543 GOTO(out_openerr, rc = -ENOMEM);
/* Claim statahead ownership for this directory if nobody holds it. */
546 if (S_ISDIR(inode->i_mode)) {
547 spin_lock(&lli->lli_sa_lock);
548 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
549 lli->lli_opendir_pid == 0) {
550 lli->lli_opendir_key = fd;
551 lli->lli_opendir_pid = current_pid();
554 spin_unlock(&lli->lli_sa_lock);
/* Root opens need no MDS round trip. */
557 if (inode->i_sb->s_root == file->f_dentry) {
558 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN intent flags. */
562 if (!it || !it->d.lustre.it_disposition) {
563 /* Convert f_flags into access mode. We cannot use file->f_mode,
564 * because everything but O_ACCMODE mask was stripped from
566 if ((oit.it_flags + 1) & O_ACCMODE)
568 if (file->f_flags & O_TRUNC)
569 oit.it_flags |= FMODE_WRITE;
571 /* kernel only call f_op->open in dentry_open. filp_open calls
572 * dentry_open after call to open_namei that checks permissions.
573 * Only nfsd_open call dentry_open directly without checking
574 * permissions and because of that this code below is safe. */
575 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
576 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
578 /* We do not want O_EXCL here, presumably we opened the file
579 * already? XXX - NFS implications? */
580 oit.it_flags &= ~O_EXCL;
582 /* bug20584, if "it_flags" contains O_CREAT, the file will be
583 * created if necessary, then "IT_CREAT" should be set to keep
584 * consistent with it */
585 if (oit.it_flags & O_CREAT)
586 oit.it_op |= IT_CREAT;
592 /* Let's see if we have file open on MDS already. */
593 if (it->it_flags & FMODE_WRITE) {
594 och_p = &lli->lli_mds_write_och;
595 och_usecount = &lli->lli_open_fd_write_count;
596 } else if (it->it_flags & FMODE_EXEC) {
597 och_p = &lli->lli_mds_exec_och;
598 och_usecount = &lli->lli_open_fd_exec_count;
600 och_p = &lli->lli_mds_read_och;
601 och_usecount = &lli->lli_open_fd_read_count;
604 mutex_lock(&lli->lli_och_mutex);
605 if (*och_p) { /* Open handle is present */
606 if (it_disposition(it, DISP_OPEN_OPEN)) {
607 /* Well, there's extra open request that we do not need,
608 let's close it somehow. This will decref request. */
609 rc = it_open_error(DISP_OPEN_OPEN, it);
611 mutex_unlock(&lli->lli_och_mutex);
612 GOTO(out_openerr, rc);
615 ll_release_openhandle(file->f_dentry, it);
/* Reuse the cached handle: NULL och means "do not fill a new one". */
619 rc = ll_local_open(file, it, fd, NULL);
622 mutex_unlock(&lli->lli_och_mutex);
623 GOTO(out_openerr, rc);
626 LASSERT(*och_usecount == 0);
627 if (!it->d.lustre.it_disposition) {
628 /* We cannot just request lock handle now, new ELC code
629 means that one of other OPEN locks for this file
630 could be cancelled, and since blocking ast handler
631 would attempt to grab och_mutex as well, that would
632 result in a deadlock */
633 mutex_unlock(&lli->lli_och_mutex);
634 it->it_create_mode |= M_CHECK_STALE;
635 rc = ll_intent_file_open(file, NULL, 0, it);
636 it->it_create_mode &= ~M_CHECK_STALE;
638 GOTO(out_openerr, rc);
642 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
644 GOTO(out_och_free, rc = -ENOMEM);
648 /* md_intent_lock() didn't get a request ref if there was an
649 * open error, so don't do cleanup on the request here
651 /* XXX (green): Should not we bail out on any error here, not
652 * just open error? */
653 rc = it_open_error(DISP_OPEN_OPEN, it);
655 GOTO(out_och_free, rc);
657 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
659 rc = ll_local_open(file, it, fd, *och_p);
661 GOTO(out_och_free, rc);
663 mutex_unlock(&lli->lli_och_mutex);
666 /* Must do this outside lli_och_mutex lock to prevent deadlock where
667 different kind of OPEN lock for this same inode gets cancelled
668 by ldlm_cancel_lru */
669 if (!S_ISREG(inode->i_mode))
670 GOTO(out_och_free, rc);
/* No stripe metadata yet: object creation may have been delayed. */
674 if (!lli->lli_has_smd) {
675 if (file->f_flags & O_LOV_DELAY_CREATE ||
676 !(file->f_mode & FMODE_WRITE)) {
677 CDEBUG(D_INODE, "object creation was delayed\n");
678 GOTO(out_och_free, rc);
681 file->f_flags &= ~O_LOV_DELAY_CREATE;
682 GOTO(out_och_free, rc);
/* Error-path cleanup (labels missing from this view). */
686 if (och_p && *och_p) {
687 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
688 *och_p = NULL; /* OBD_FREE writes some magic there */
691 mutex_unlock(&lli->lli_och_mutex);
694 if (opendir_set != 0)
695 ll_stop_statahead(inode, lli->lli_opendir_key);
697 ll_file_data_put(fd);
699 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the open-reply reference kept for replay, if still held. */
702 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
703 ptlrpc_req_finished(it->d.lustre.it_data);
704 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * ll_md_blocking_lease_ast(): DLM blocking callback for lease locks --
 * on LDLM_CB_BLOCKING, asynchronously cancel the lease lock; the
 * LDLM_CB_CANCELING arm's body is not visible here.
 * NOTE(review): damaged extraction -- the rc declaration, the
 * switch(flag) statement, break/default arms, braces and the RETURN are
 * missing from this view.
 */
710 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
711 struct ldlm_lock_desc *desc, void *data, int flag)
714 struct lustre_handle lockh;
717 case LDLM_CB_BLOCKING:
718 ldlm_lock2handle(lock, &lockh);
/* LCF_ASYNC: do not block the AST thread on the cancel RPC. */
719 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
721 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
725 case LDLM_CB_CANCELING:
/*
 * ll_lease_open(): acquire an MDS open lease (read or write) on @inode,
 * optionally reusing @file's existing open handle as proof of ownership,
 * and return the filled obd_client_handle (or ERR_PTR on failure).
 * NOTE(review): damaged extraction -- rc/rc2 declarations, the och
 * allocation, IS_ERR checks, several braces/else-branches, the
 * out/out_close/out_release_it label lines and the returns are missing,
 * so the cleanup ordering below is incomplete.
 */
733 * Acquire a lease and open the file.
735 struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
736 fmode_t fmode, __u64 open_flags)
738 struct lookup_intent it = { .it_op = IT_OPEN };
739 struct ll_sb_info *sbi = ll_i2sbi(inode);
740 struct md_op_data *op_data;
741 struct ptlrpc_request *req;
742 struct lustre_handle old_handle = { 0 };
743 struct obd_client_handle *och = NULL;
/* Leases are strictly read or strictly write. */
747 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
748 return ERR_PTR(-EINVAL);
751 struct ll_inode_info *lli = ll_i2info(inode);
752 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
753 struct obd_client_handle **och_p;
/* The fd's mode must cover the lease mode; exec opens are excluded. */
756 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
757 return ERR_PTR(-EPERM);
759 /* Get the openhandle of the file */
761 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per fd. */
762 if (fd->fd_lease_och != NULL) {
763 mutex_unlock(&lli->lli_och_mutex);
767 if (fd->fd_och == NULL) {
768 if (file->f_mode & FMODE_WRITE) {
769 LASSERT(lli->lli_mds_write_och != NULL);
770 och_p = &lli->lli_mds_write_och;
771 och_usecount = &lli->lli_open_fd_write_count;
773 LASSERT(lli->lli_mds_read_och != NULL);
774 och_p = &lli->lli_mds_read_och;
775 och_usecount = &lli->lli_open_fd_read_count;
/* The cached handle can be claimed only if this fd is its sole user. */
777 if (*och_usecount == 1) {
784 mutex_unlock(&lli->lli_och_mutex);
785 if (rc < 0) /* more than 1 opener */
788 LASSERT(fd->fd_och != NULL);
789 old_handle = fd->fd_och->och_fh;
794 return ERR_PTR(-ENOMEM);
796 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
797 LUSTRE_OPC_ANY, NULL);
799 GOTO(out, rc = PTR_ERR(op_data));
801 /* To tell the MDT this openhandle is from the same owner */
802 op_data->op_handle = old_handle;
804 it.it_flags = fmode | open_flags;
805 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
806 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
807 ll_md_blocking_lease_ast,
808 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
809 * it can be cancelled which may mislead applications that the lease is
811 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
812 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
813 * doesn't deal with openhandle, so normal openhandle will be leaked. */
814 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
815 ll_finish_md_op_data(op_data);
817 ptlrpc_req_finished(req);
818 it_clear_disposition(&it, DISP_ENQ_COMPLETE);
821 GOTO(out_release_it, rc);
823 if (it_disposition(&it, DISP_LOOKUP_NEG))
824 GOTO(out_release_it, rc = -ENOENT);
826 rc = it_open_error(DISP_OPEN_OPEN, &it);
828 GOTO(out_release_it, rc);
830 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
831 ll_och_fill(sbi->ll_md_exp, &it, och);
833 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
834 GOTO(out_close, rc = -EOPNOTSUPP);
836 /* already get lease, handle lease lock */
837 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
838 if (it.d.lustre.it_lock_mode == 0 ||
839 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
840 /* open lock must return for lease */
841 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
842 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
843 it.d.lustre.it_lock_bits);
844 GOTO(out_close, rc = -EPROTO);
847 ll_intent_release(&it);
/* Error cleanup: close the openhandle and cancel the lease lock. */
851 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
853 CERROR("Close openhandle returned %d\n", rc2);
855 /* cancel open lock */
856 if (it.d.lustre.it_lock_mode != 0) {
857 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
858 it.d.lustre.it_lock_mode);
859 it.d.lustre.it_lock_mode = 0;
862 ll_intent_release(&it);
867 EXPORT_SYMBOL(ll_lease_open);
/*
 * ll_lease_close(): release a lease obtained by ll_lease_open() --
 * determine whether the lease lock was already cancelled (i.e. the lease
 * was broken), cancel it if not, report breakage via @lease_broken, and
 * close the underlying open handle.
 * NOTE(review): damaged extraction -- the lease_broken parameter line,
 * rc declaration, the "if (lock != NULL)" scaffolding around the
 * res-lock section, LDLM_LOCK_PUT, the data_version argument line and
 * the RETURN are missing from this view.
 */
870 * Release lease and close the file.
871 * It will check if the lease has ever broken.
873 int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
876 struct ldlm_lock *lock;
877 bool cancelled = true;
880 lock = ldlm_handle2lock(&och->och_lease_handle);
/* res lock protects the cancel flag while we sample it. */
882 lock_res_and_lock(lock);
883 cancelled = ldlm_is_cancel(lock);
884 unlock_res_and_lock(lock);
888 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
889 PFID(&ll_i2info(inode)->lli_fid), cancelled);
892 ldlm_cli_cancel(&och->och_lease_handle, 0);
893 if (lease_broken != NULL)
894 *lease_broken = cancelled;
896 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
900 EXPORT_SYMBOL(ll_lease_close);
/*
 * ll_lsm_getattr(): issue an asynchronous OST getattr for the stripes in
 * @lsm and collect the merged attributes into @obdo.  @sync requests the
 * getattr under a server-side lock (OBD_FL_SRVLOCK).
 * NOTE(review): damaged extraction -- rc declaration, the oinfo.oi_md /
 * oinfo.oi_oa assignments, the "if (sync)" guard around the SRVLOCK
 * lines, error checks after ptlrpc_prep_set()/obd_getattr_async(), the
 * "if (rc == 0)" around the final o_valid mask and the RETURN are
 * missing from this view.
 */
902 /* Fills the obdo with the attributes for the lsm */
903 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
904 struct obd_capa *capa, struct obdo *obdo,
905 __u64 ioepoch, int sync)
907 struct ptlrpc_request_set *set;
908 struct obd_info oinfo = { { { 0 } } };
911 LASSERT(lsm != NULL);
915 oinfo.oi_oa->o_oi = lsm->lsm_oi;
916 oinfo.oi_oa->o_mode = S_IFREG;
917 oinfo.oi_oa->o_ioepoch = ioepoch;
/* Request the full attribute set needed by the llite cache. */
918 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
919 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
920 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
921 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
922 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
923 OBD_MD_FLDATAVERSION;
924 oinfo.oi_capa = capa;
/* Presumably guarded by "if (sync)" in the full source -- the guard
 * line is missing here. */
926 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
927 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
930 set = ptlrpc_prep_set();
932 CERROR("can't allocate ptlrpc set\n");
935 rc = obd_getattr_async(exp, &oinfo, set);
937 rc = ptlrpc_set_wait(set);
938 ptlrpc_set_destroy(set);
/* Keep only the attributes the caller consumes. */
941 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
942 OBD_MD_FLATIME | OBD_MD_FLMTIME |
943 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
944 OBD_MD_FLDATAVERSION);
/*
 * ll_inode_getattr(): fetch OST attributes for @inode via
 * ll_lsm_getattr() and refresh the inode fields from the result.
 * NOTE(review): damaged extraction -- the rc declaration, the capa_put()
 * call, the "if (rc == 0)" guard around the refresh section, closing
 * braces and the RETURN are missing from this view.
 */
949 * Performs the getattr on the inode and updates its fields.
950 * If @sync != 0, perform the getattr under the server-side lock.
952 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
953 __u64 ioepoch, int sync)
955 struct obd_capa *capa = ll_mdscapa_get(inode);
956 struct lov_stripe_md *lsm;
959 lsm = ccc_inode_lsm_get(inode);
960 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
961 capa, obdo, ioepoch, sync);
964 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* Apply the freshly-fetched OST attributes to the VFS inode. */
966 obdo_refresh_inode(inode, obdo, obdo->o_valid);
967 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
968 " blksize %lu\n", POSTID(oi), i_size_read(inode),
969 (unsigned long long)inode->i_blocks,
970 (unsigned long)ll_inode_blksize(inode));
972 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_merge_lvb(): under the inode size lock, merge MDS-provided
 * timestamps (lli_lvb) with OST-side attributes from the cl_object and
 * write the newest values (and size/blocks) back into the VFS inode.
 * NOTE(review): damaged extraction -- the rc and lvb declarations, the
 * "if (rc == 0)" guard after cl_object_attr_get(), closing braces and
 * the RETURN are missing from this view.
 */
976 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
978 struct ll_inode_info *lli = ll_i2info(inode);
979 struct cl_object *obj = lli->lli_clob;
980 struct cl_attr *attr = ccc_env_thread_attr(env);
984 ll_inode_size_lock(inode);
985 /* merge timestamps the most recently obtained from mds with
986 timestamps obtained from osts */
987 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
988 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
989 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
990 inode_init_lvb(inode, &lvb);
992 cl_object_attr_lock(obj);
993 rc = cl_object_attr_get(env, obj, attr);
994 cl_object_attr_unlock(obj);
/* Keep whichever timestamp is newer, MDS or OST. */
997 if (lvb.lvb_atime < attr->cat_atime)
998 lvb.lvb_atime = attr->cat_atime;
999 if (lvb.lvb_ctime < attr->cat_ctime)
1000 lvb.lvb_ctime = attr->cat_ctime;
1001 if (lvb.lvb_mtime < attr->cat_mtime)
1002 lvb.lvb_mtime = attr->cat_mtime;
1004 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1005 PFID(&lli->lli_fid), attr->cat_size);
1006 cl_isize_write_nolock(inode, attr->cat_size);
1008 inode->i_blocks = attr->cat_blocks;
1010 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1011 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1012 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1014 ll_inode_size_unlock(inode);
/*
 * ll_glimpse_ioctl(): fetch current OST attributes for @lsm and copy
 * size/blocks/times into the caller-supplied stat structure.
 * NOTE(review): damaged extraction -- the st parameter line, the rc
 * declaration, the "if (rc == 0)" guard, closing braces and the RETURN
 * are missing from this view.
 */
1019 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1022 struct obdo obdo = { 0 };
1025 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1027 st->st_size = obdo.o_size;
1028 st->st_blocks = obdo.o_blocks;
1029 st->st_mtime = obdo.o_mtime;
1030 st->st_atime = obdo.o_atime;
1031 st->st_ctime = obdo.o_ctime;
/*
 * ll_io_init(): initialize a cl_io for @file -- nonblock/append/sync
 * flags from f_flags and the lock-request policy (never for nolock
 * files, mandatory for O_APPEND, maybe otherwise).
 * NOTE(review): damaged extraction -- the memset/initialization of @io,
 * the "if (write)" guard before the wr_append/wr_sync lines, the tail of
 * the wr_sync condition, and closing braces are missing from this view.
 */
1036 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1038 struct inode *inode = file->f_dentry->d_inode;
1040 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
/* Write-only fields; presumably guarded by "if (write)" upstream. */
1042 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1043 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1044 file->f_flags & O_DIRECT ||
1047 io->ci_obj = ll_i2info(inode)->lli_clob;
1048 io->ci_lockreq = CILR_MAYBE;
1049 if (ll_file_nolock(file)) {
1050 io->ci_lockreq = CILR_NEVER;
1051 io->ci_no_srvlock = 1;
1052 } else if (file->f_flags & O_APPEND) {
1053 io->ci_lockreq = CILR_MANDATORY;
/*
 * ll_file_io_generic(): common engine for read/write/sendfile/splice --
 * set up a cl_io for the requested io type, populate the vvp/ccc io
 * state for the chosen subtype, take the write mutex or trunc semaphore
 * as appropriate, run cl_io_loop(), and account the result in the
 * per-sb stats.
 * NOTE(review): damaged extraction -- the return type line, io/result
 * declarations, case labels (IO_NORMAL/IO_SENDFILE/IO_SPLICE), break
 * statements, the restart goto, "out:" label, result>0 guards around the
 * stats calls, closing braces and the RETURN are missing from this view.
 */
1058 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1059 struct file *file, enum cl_io_type iot,
1060 loff_t *ppos, size_t count)
1062 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1063 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1068 io = ccc_env_thread_io(env);
1069 ll_io_init(io, file, iot == CIT_WRITE);
1071 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1072 struct vvp_io *vio = vvp_env_io(env);
1073 struct ccc_io *cio = ccc_env_io(env);
1074 int write_mutex_locked = 0;
1076 cio->cui_fd = LUSTRE_FPRIVATE(file);
1077 vio->cui_io_subtype = args->via_io_subtype;
1079 switch (vio->cui_io_subtype) {
/* Normal (iovec-based) read/write path. */
1081 cio->cui_iov = args->u.normal.via_iov;
1082 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1083 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1084 cio->cui_iocb = args->u.normal.via_iocb;
/* Non-group-locked writes serialize on lli_write_mutex; reads only
 * take the truncate semaphore shared. */
1085 if ((iot == CIT_WRITE) &&
1086 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1087 if (mutex_lock_interruptible(&lli->
1089 GOTO(out, result = -ERESTARTSYS);
1090 write_mutex_locked = 1;
1091 } else if (iot == CIT_READ) {
1092 down_read(&lli->lli_trunc_sem);
/* Sendfile path. */
1096 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
1097 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
/* Splice path. */
1100 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1101 vio->u.splice.cui_flags = args->u.splice.via_flags;
1104 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1107 result = cl_io_loop(env, io);
1108 if (write_mutex_locked)
1109 mutex_unlock(&lli->lli_write_mutex);
1110 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1111 up_read(&lli->lli_trunc_sem);
1113 /* cl_io_rw_init() handled IO */
1114 result = io->ci_result;
/* Bytes were transferred: report them and advance the position. */
1117 if (io->ci_nob > 0) {
1118 result = io->ci_nob;
1119 *ppos = io->u.ci_wr.wr.crw_pos;
1123 cl_io_fini(env, io);
1124 /* If any bit been read/written (result != 0), we just return
1125 * short read/write instead of restart io. */
1126 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1127 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1128 iot == CIT_READ ? "read" : "write",
1129 file->f_dentry->d_name.name, *ppos, count);
1130 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1134 if (iot == CIT_READ) {
1136 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1137 LPROC_LL_READ_BYTES, result);
1138 } else if (iot == CIT_WRITE) {
1140 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1141 LPROC_LL_WRITE_BYTES, result);
1142 fd->fd_write_failed = false;
1143 } else if (result != -ERESTARTSYS) {
1144 fd->fd_write_failed = true;
/*
 * ll_file_aio_read(): aio entry point for reads -- validate the iovec,
 * pick up a cl environment, fill the IO_NORMAL args and delegate to
 * ll_file_io_generic() with CIT_READ.
 * NOTE(review): damaged extraction -- env/count/refcheck/result
 * declarations, the error check after generic_segment_checks(), the
 * IS_ERR(env) test, closing brace and the RETURN are missing.
 */
1151 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1152 unsigned long nr_segs, loff_t pos)
1155 struct vvp_io_args *args;
1160 result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1164 env = cl_env_get(&refcheck);
1166 return PTR_ERR(env);
1168 args = vvp_env_args(env, IO_NORMAL);
/* Cast drops const: the iovec is treated as mutable io state below. */
1169 args->u.normal.via_iov = (struct iovec *)iov;
1170 args->u.normal.via_nrsegs = nr_segs;
1171 args->u.normal.via_iocb = iocb;
1173 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1174 &iocb->ki_pos, count);
1175 cl_env_put(env, &refcheck);
/*
 * ll_file_read(): synchronous read -- wrap the user buffer in a single
 * iovec and a sync kiocb from the per-thread vvp env, then call
 * ll_file_aio_read() and propagate the updated position to *ppos.
 * NOTE(review): damaged extraction -- the ppos parameter line,
 * env/result/refcheck declarations, IS_ERR(env) test, closing brace and
 * the RETURN are missing from this view.
 */
1179 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1183 struct iovec *local_iov;
1184 struct kiocb *kiocb;
1188 env = cl_env_get(&refcheck);
1190 return PTR_ERR(env);
/* Reuse per-env scratch iovec/kiocb instead of stack allocation. */
1192 local_iov = &vvp_env_info(env)->vti_local_iov;
1193 kiocb = &vvp_env_info(env)->vti_kiocb;
1194 local_iov->iov_base = (void __user *)buf;
1195 local_iov->iov_len = count;
1196 init_sync_kiocb(kiocb, file);
1197 kiocb->ki_pos = *ppos;
1198 kiocb->ki_nbytes = count;
1200 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1201 *ppos = kiocb->ki_pos;
1203 cl_env_put(env, &refcheck);
1208 * Write to a file (through the page cache).
/*
 * Vectored/async write entry point. Mirrors ll_file_aio_read():
 * validates the iovec (VERIFY_READ: a write fetches from the user
 * buffers), fills vvp_io_args (IO_NORMAL) and delegates to
 * ll_file_io_generic() as a CIT_WRITE at iocb->ki_pos.
 */
1210 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1211 unsigned long nr_segs, loff_t pos)
1214 struct vvp_io_args *args;
1219 result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
1223 env = cl_env_get(&refcheck);
1225 return PTR_ERR(env);
1227 args = vvp_env_args(env, IO_NORMAL);
1228 args->u.normal.via_iov = (struct iovec *)iov;
1229 args->u.normal.via_nrsegs = nr_segs;
1230 args->u.normal.via_iocb = iocb;
1232 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1233 &iocb->ki_pos, count);
1234 cl_env_put(env, &refcheck);
/*
 * Plain write(2) path. Same shape as ll_file_read(): builds a one-entry
 * env-local iovec and a synchronous kiocb, calls ll_file_aio_write() and
 * propagates the advanced file position back through *ppos.
 */
1238 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1242 struct iovec *local_iov;
1243 struct kiocb *kiocb;
1247 env = cl_env_get(&refcheck);
1249 return PTR_ERR(env);
1251 local_iov = &vvp_env_info(env)->vti_local_iov;
1252 kiocb = &vvp_env_info(env)->vti_kiocb;
1253 local_iov->iov_base = (void __user *)buf;
1254 local_iov->iov_len = count;
1255 init_sync_kiocb(kiocb, file);
1256 kiocb->ki_pos = *ppos;
1257 kiocb->ki_nbytes = count;
1259 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1260 *ppos = kiocb->ki_pos;
1262 cl_env_put(env, &refcheck);
1269 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read() entry point: route pagecache data into a pipe. Uses the
 * IO_SPLICE variant of vvp_io_args (pipe + splice flags) and runs it as a
 * CIT_READ through ll_file_io_generic().
 */
1271 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1272 struct pipe_inode_info *pipe, size_t count,
1276 struct vvp_io_args *args;
1280 env = cl_env_get(&refcheck);
1282 return PTR_ERR(env);
1284 args = vvp_env_args(env, IO_SPLICE);
1285 args->u.splice.via_pipe = pipe;
1286 args->u.splice.via_flags = flags;
1288 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1289 cl_env_put(env, &refcheck);
/*
 * Re-create the OST objects backing this inode via obd_create() with
 * OBD_FL_RECREATE_OBJS set, using a private copy (lsm2) of the current
 * stripe metadata so the in-use LSM is not handed to the create path.
 * Runs under the inode size lock; fails with -ENOENT if the file has no
 * objects.  NOTE(review): oa->o_nlink carrying ost_idx looks like an
 * overloaded field used by the recreate protocol — confirm against the
 * LOV/OSC recreate handling before relying on it.
 */
1293 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1296 struct obd_export *exp = ll_i2dtexp(inode);
1297 struct obd_trans_info oti = { 0 };
1298 struct obdo *oa = NULL;
1301 struct lov_stripe_md *lsm = NULL, *lsm2;
1307 lsm = ccc_inode_lsm_get(inode);
1308 if (!lsm_has_objects(lsm))
1309 GOTO(out, rc = -ENOENT);
/* lsm is a header plus one lov_oinfo per stripe. */
1311 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1312 (lsm->lsm_stripe_count));
1314 OBD_ALLOC_LARGE(lsm2, lsm_size);
1316 GOTO(out, rc = -ENOMEM);
1319 oa->o_nlink = ost_idx;
1320 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1321 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1322 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1323 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1324 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1325 memcpy(lsm2, lsm, lsm_size);
/* Size lock keeps the object set stable while objects are recreated. */
1326 ll_inode_size_lock(inode);
1327 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1328 ll_inode_size_unlock(inode);
1330 OBD_FREE_LARGE(lsm2, lsm_size);
1333 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler (admin-only). Copies a ll_recreate_obj
 * request from user space, builds an ost_id in the MDT0 sequence from
 * lrc_id, and recreates that object on lrc_ost_idx via ll_lov_recreate().
 */
1338 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1340 struct ll_recreate_obj ucreat;
1343 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1346 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1350 ostid_set_seq_mdt0(&oi);
1351 ostid_set_id(&oi, ucreat.lrc_id);
1352 return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
/*
 * LL_IOC_RECREATE_FID handler (admin-only). Copies a lu_fid from user
 * space, converts it to an ost_id, and derives the target OST index from
 * bits 16..31 of the FID sequence before recreating the object.
 */
1355 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1361 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1364 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1367 fid_to_ostid(&fid, &oi);
1368 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1369 return ll_lov_recreate(inode, &oi, ost_idx);
/*
 * Apply a user-supplied striping EA (lov_user_md) to @inode by issuing an
 * IT_OPEN intent that carries the layout. If a stripe already exists the
 * request is rejected (layout is write-once here). The transient MDS open
 * handle created by the intent is closed again via ll_release_openhandle().
 * Runs under the inode size lock; the intent request is released on all
 * paths (ll_intent_release() / ptlrpc_req_finished() on the error path).
 */
1372 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1373 int flags, struct lov_user_md *lum, int lum_size)
1375 struct lov_stripe_md *lsm = NULL;
1376 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1379 lsm = ccc_inode_lsm_get(inode);
1381 ccc_inode_lsm_put(inode, lsm);
1382 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1387 ll_inode_size_lock(inode);
1388 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1391 rc = oit.d.lustre.it_status;
1393 GOTO(out_req_free, rc);
/* Close the open handle the intent created — we only wanted the EA set. */
1395 ll_release_openhandle(file->f_dentry, &oit);
1398 ll_inode_size_unlock(inode);
1399 ll_intent_release(&oit);
1400 ccc_inode_lsm_put(inode, lsm);
1403 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping descriptor) of @filename under @inode from
 * the MDS via md_getattr_name(). On success *lmmp points into the reply
 * buffer (caller keeps *request alive and must finish it), *lmm_size is
 * the EA size. Returns -ENODATA when no striping EA is present and
 * -EPROTO on an unrecognized LOV magic. On big-endian hosts the wire
 * (little-endian) EA is byte-swapped in place before being exposed to
 * user space.
 */
1407 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1408 struct lov_mds_md **lmmp, int *lmm_size,
1409 struct ptlrpc_request **request)
1411 struct ll_sb_info *sbi = ll_i2sbi(inode);
1412 struct mdt_body *body;
1413 struct lov_mds_md *lmm = NULL;
1414 struct ptlrpc_request *req = NULL;
1415 struct md_op_data *op_data;
/* Ask for the largest EA the MDS might return so the reply buffer fits. */
1418 rc = ll_get_max_mdsize(sbi, &lmmsize);
1422 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1423 strlen(filename), lmmsize,
1424 LUSTRE_OPC_ANY, NULL);
1425 if (IS_ERR(op_data))
1426 return PTR_ERR(op_data);
1428 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1429 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1430 ll_finish_md_op_data(op_data);
1432 CDEBUG(D_INFO, "md_getattr_name failed "
1433 "on %s: rc %d\n", filename, rc);
1437 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1438 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1440 lmmsize = body->eadatasize;
1442 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1444 GOTO(out, rc = -ENODATA);
1447 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1448 LASSERT(lmm != NULL);
/* Only LOV v1/v3 layouts are understood here. */
1450 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1451 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1452 GOTO(out, rc = -EPROTO);
1456 * This is coming from the MDS, so is probably in
1457 * little endian. We convert it to host endian before
1458 * passing it to userspace.
/* True only on big-endian hosts: host order differs from wire order. */
1460 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1463 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1464 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1467 /* if function called for directory - we should
1468 * avoid swab not existent lsm objects */
1469 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1470 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1471 if (S_ISREG(body->mode))
1472 lustre_swab_lov_user_md_objects(
1473 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1475 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1476 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1477 if (S_ISREG(body->mode))
1478 lustre_swab_lov_user_md_objects(
1479 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1486 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler (admin-only). Copies a lov_user_md (with one
 * trailing lov_user_ost_data) from user space and applies it through
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS | FMODE_WRITE.
 * The temporary buffer is freed on both the copy-failure and normal paths.
 */
1491 static int ll_lov_setea(struct inode *inode, struct file *file,
1494 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1495 struct lov_user_md *lump;
1496 int lum_size = sizeof(struct lov_user_md) +
1497 sizeof(struct lov_user_ost_data);
1500 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1503 OBD_ALLOC_LARGE(lump, lum_size);
1507 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1508 OBD_FREE_LARGE(lump, lum_size);
1512 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1514 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler. Copies the user lov_user_md into a
 * stack v3 buffer — first as the smaller v1 layout, then re-copied as v3
 * if the magic says so — and applies it via ll_lov_setstripe_ea_info().
 * Afterwards the layout is refreshed and the resulting stripe info is
 * echoed back to the caller's buffer through LL_IOC_LOV_GETSTRIPE.
 * NOTE(review): put_user(0, &lumv1p->lmm_stripe_count) pre-clears the
 * user's stripe count before the echo-back — presumably so a failed
 * GETSTRIPE leaves a sane value; confirm against the elided branch.
 */
1518 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1521 struct lov_user_md_v3 lumv3;
1522 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1523 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1524 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1526 int flags = FMODE_WRITE;
1528 /* first try with v1 which is smaller than v3 */
1529 lum_size = sizeof(struct lov_user_md_v1);
1530 if (copy_from_user(lumv1, lumv1p, lum_size))
1533 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1534 lum_size = sizeof(struct lov_user_md_v3);
1535 if (copy_from_user(&lumv3, lumv3p, lum_size))
1539 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1541 struct lov_stripe_md *lsm;
1544 put_user(0, &lumv1p->lmm_stripe_count);
1546 ll_layout_refresh(inode, &gen);
1547 lsm = ccc_inode_lsm_get(inode);
1548 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1549 0, lsm, (void *)arg);
1550 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: take a reference on the inode's stripe
 * metadata and let the LOV layer serialize it into the user buffer.
 */
1555 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1557 struct lov_stripe_md *lsm;
1560 lsm = ccc_inode_lsm_get(inode);
1562 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1564 ccc_inode_lsm_put(inode, lsm);
/*
 * Take a Lustre group lock (gid = @arg) on behalf of this open file.
 * State lives in the per-open ll_file_data under lli->lli_lock: at most
 * one group lock per fd. The cl-layer lock is acquired with lli_lock
 * dropped (it can block on O_NONBLOCK semantics), so the flag is
 * re-checked afterwards and the lock released if another thread raced in.
 */
1568 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1570 struct ll_inode_info *lli = ll_i2info(inode);
1571 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1572 struct ccc_grouplock grouplock;
1575 if (ll_file_nolock(file))
1578 spin_lock(&lli->lli_lock);
1579 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1580 CWARN("group lock already existed with gid %lu\n",
1581 fd->fd_grouplock.cg_gid);
1582 spin_unlock(&lli->lli_lock);
1585 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1586 spin_unlock(&lli->lli_lock);
/* May block unless O_NONBLOCK; must not hold lli_lock across this. */
1588 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1589 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1593 spin_lock(&lli->lli_lock);
1594 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1595 spin_unlock(&lli->lli_lock);
1596 CERROR("another thread just won the race\n");
1597 cl_put_grouplock(&grouplock);
1601 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1602 fd->fd_grouplock = grouplock;
1603 spin_unlock(&lli->lli_lock);
1605 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock held by this open file. Validates under
 * lli_lock that a lock is held and that its gid matches @arg, detaches
 * it from the fd state, then drops the cl-layer lock outside the
 * spinlock.
 */
1609 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1611 struct ll_inode_info *lli = ll_i2info(inode);
1612 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1613 struct ccc_grouplock grouplock;
1615 spin_lock(&lli->lli_lock);
1616 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1617 spin_unlock(&lli->lli_lock);
1618 CWARN("no group lock held\n");
1621 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1623 if (fd->fd_grouplock.cg_gid != arg) {
1624 CWARN("group lock %lu doesn't match current id %lu\n",
1625 arg, fd->fd_grouplock.cg_gid);
1626 spin_unlock(&lli->lli_lock);
/* Detach under the spinlock, release outside it. */
1630 grouplock = fd->fd_grouplock;
1631 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1632 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1633 spin_unlock(&lli->lli_lock);
1635 cl_put_grouplock(&grouplock);
1636 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1641 * Close inode open handle
1643 * \param dentry [in] dentry which contains the inode
1644 * \param it [in,out] intent which contains open info and result
1647 * \retval <0 failure
/*
 * Close the MDS open handle carried by a lookup intent (used when an
 * intent-open was issued only for its side effect, e.g. setting a layout).
 * No-ops for the filesystem root and for intents with no DISP_OPEN_OPEN
 * disposition. Also drops the extra request reference held by
 * DISP_ENQ_OPEN_REF if present.
 */
1649 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1651 struct inode *inode = dentry->d_inode;
1652 struct obd_client_handle *och;
1657 /* Root ? Do nothing. */
1658 if (dentry->d_inode->i_sb->s_root == dentry)
1661 /* No open handle to close? Move away */
1662 if (!it_disposition(it, DISP_OPEN_OPEN))
1665 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1667 OBD_ALLOC(och, sizeof(*och));
1669 GOTO(out, rc = -ENOMEM);
1671 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1673 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1676 /* this one is in place of ll_file_open */
1677 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1678 ptlrpc_req_finished(it->d.lustre.it_data);
1679 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1685 * Get size for inode for which FIEMAP mapping is requested.
1686 * Make the FIEMAP get_info call and returns the result.
/*
 * Core FIEMAP implementation: build a ll_fiemap_info_key (KEY_FIEMAP)
 * from the inode's stripe metadata and the caller's fiemap request, and
 * ask the data export for the extent mapping via obd_get_info().
 * Unsupported flags are stripped into fm_flags and reported back;
 * FIEMAP_FLAG_SYNC triggers a writeback first; multi-stripe files
 * require the caller to accept FIEMAP_FLAG_DEVICE_ORDER.
 */
1688 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1691 struct obd_export *exp = ll_i2dtexp(inode);
1692 struct lov_stripe_md *lsm = NULL;
1693 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1694 int vallen = num_bytes;
1697 /* Checks for fiemap flags */
1698 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1699 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1703 /* Check for FIEMAP_FLAG_SYNC */
1704 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1705 rc = filemap_fdatawrite(inode->i_mapping);
1710 lsm = ccc_inode_lsm_get(inode);
1714 /* If the stripe_count > 1 and the application does not understand
1715 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1717 if (lsm->lsm_stripe_count > 1 &&
1718 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1719 GOTO(out, rc = -EOPNOTSUPP);
1721 fm_key.oa.o_oi = lsm->lsm_oi;
1722 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1724 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1725 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1726 /* If filesize is 0, then there would be no objects for mapping */
1727 if (fm_key.oa.o_size == 0) {
1728 fiemap->fm_mapped_extents = 0;
1732 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1734 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1737 CERROR("obd_get_info failed: rc = %d\n", rc);
1740 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH: resolve a FID to a path via the MD export. Access
 * requires CAP_DAC_READ_SEARCH unless the mount allows user fid2path
 * (LL_SBI_USER_FID2PATH). The small fixed header is copied in first to
 * learn gf_pathlen, then a header+path sized buffer is allocated, filled
 * by obd_iocontrol(OBD_IOC_FID2PATH), and copied back to the user.
 */
1744 int ll_fid2path(struct inode *inode, void *arg)
1746 struct obd_export *exp = ll_i2mdexp(inode);
1747 struct getinfo_fid2path *gfout, *gfin;
1750 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1751 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1754 /* Need to get the buflen */
1755 OBD_ALLOC_PTR(gfin);
1758 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* Output buffer = fixed header + user-requested path length. */
1763 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1764 OBD_ALLOC(gfout, outsize);
1765 if (gfout == NULL) {
1769 memcpy(gfout, gfin, sizeof(*gfout));
1772 /* Call mdc_iocontrol */
1773 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1777 if (copy_to_user(arg, gfout, outsize))
1781 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP ioctl front-end: size a kernel fiemap buffer from
 * the user's fm_extent_count, copy the request (and, when extents were
 * requested, the first extent, which seeds continuation state), run
 * ll_do_fiemap(), then copy the header plus mapped extents back.
 * NOTE(review): num_bytes = header + extent_count * extent_size is
 * computed from a user-controlled count with no visible overflow check
 * in this chunk — verify a bound exists in the elided lines.
 */
1785 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1787 struct ll_user_fiemap *fiemap_s;
1788 size_t num_bytes, ret_bytes;
1789 unsigned int extent_count;
1792 /* Get the extent count so we can calculate the size of
1793 * required fiemap buffer */
1794 if (get_user(extent_count,
1795 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1797 num_bytes = sizeof(*fiemap_s) + (extent_count *
1798 sizeof(struct ll_fiemap_extent));
1800 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1801 if (fiemap_s == NULL)
1804 /* get the fiemap value */
1805 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1807 GOTO(error, rc = -EFAULT);
1809 /* If fm_extent_count is non-zero, read the first extent since
1810 * it is used to calculate end_offset and device from previous
1813 if (copy_from_user(&fiemap_s->fm_extents[0],
1814 (char __user *)arg + sizeof(*fiemap_s),
1815 sizeof(struct ll_fiemap_extent)))
1816 GOTO(error, rc = -EFAULT);
1819 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy back only what was actually mapped. */
1823 ret_bytes = sizeof(struct ll_user_fiemap);
1825 if (extent_count != 0)
1826 ret_bytes += (fiemap_s->fm_mapped_extents *
1827 sizeof(struct ll_fiemap_extent));
1829 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1833 OBD_FREE_LARGE(fiemap_s, num_bytes);
1838 * Read the data_version for inode.
1840 * This value is computed using stripe object version on OST.
1841 * Version is computed using server side locking.
1843 * @param extent_lock Take extent lock. Not needed if a process is already
1844 * holding the OST object group locks.
/*
 * Compute the data version of @inode from its OST objects via
 * ll_lsm_getattr(). A stripe-less file is treated as version 0.
 * @extent_lock selects server-side extent locking for the getattr; pass
 * 0 when the caller already holds the object group locks.
 */
1846 int ll_data_version(struct inode *inode, __u64 *data_version,
1849 struct lov_stripe_md *lsm = NULL;
1850 struct ll_sb_info *sbi = ll_i2sbi(inode);
1851 struct obdo *obdo = NULL;
1854 /* If no stripe, we consider version is 0. */
1855 lsm = ccc_inode_lsm_get(inode);
1856 if (!lsm_has_objects(lsm)) {
1858 CDEBUG(D_INODE, "No object for inode\n");
1862 OBD_ALLOC_PTR(obdo);
1864 GOTO(out, rc = -ENOMEM);
1866 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
/* The OST must report a version; otherwise the result is unusable. */
1868 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1871 *data_version = obdo->o_data_version;
1876 ccc_inode_lsm_put(inode, lsm);
1881 * Trigger a HSM release request for the provided inode.
/*
 * HSM release: free the OST objects of an archived file. Takes an
 * exclusive write lease (MDS_OPEN_RELEASE), snapshots the current data
 * version and merged size/time attributes, then closes the handle —
 * the close RPC carries the release; the lease lock handle itself is
 * consumed by mdc_hsm_release_pack() (see comment below). The lease is
 * closed on the way out if it is still held.
 */
1883 int ll_hsm_release(struct inode *inode)
1885 struct cl_env_nest nest;
1887 struct obd_client_handle *och = NULL;
1888 __u64 data_version = 0;
1892 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1893 ll_get_fsname(inode->i_sb, NULL, 0),
1894 PFID(&ll_i2info(inode)->lli_fid));
1896 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1898 GOTO(out, rc = PTR_ERR(och));
1900 /* Grab latest data_version and [am]time values */
1901 rc = ll_data_version(inode, &data_version, 1);
1905 env = cl_env_nested_get(&nest);
1907 GOTO(out, rc = PTR_ERR(env));
1909 ll_merge_lvb(env, inode);
1910 cl_env_nested_put(&nest, env);
1912 /* Release the file.
1913 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1914 * we still need it to pack l_remote_handle to MDT. */
1915 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1921 if (och != NULL && !IS_ERR(och)) /* close the file */
1922 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped,
 * saved [am]time attributes to restore afterwards (ia1/ia2), and
 * per-inode data-version check requests.
 */
1927 struct ll_swap_stack {
1928 struct iattr ia1, ia2;
1930 struct inode *inode1, *inode2;
1931 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically exchange the layouts of the two
 * regular files behind @file1/@file2. Both must be writable and on the
 * same filesystem. Inputs are snapshotted into a ll_swap_stack; the pair
 * is ordered by FID (swapping the per-file values along with the inodes)
 * to get a stable locking/RPC order. Optional steps, driven by
 * lsl->sl_flags: flush dirty cache under a group lock (gid != 0),
 * verify data versions have not changed (-EAGAIN if they have), and
 * preserve mtime/atime across the swap. The actual swap is sent to the
 * MDT as an LL_IOC_LOV_SWAP_LAYOUTS obd_iocontrol carrying a
 * mdc_swap_layouts in op_data.
 */
1934 static int ll_swap_layouts(struct file *file1, struct file *file2,
1935 struct lustre_swap_layouts *lsl)
1937 struct mdc_swap_layouts msl;
1938 struct md_op_data *op_data;
1941 struct ll_swap_stack *llss = NULL;
1944 OBD_ALLOC_PTR(llss);
1948 llss->inode1 = file1->f_dentry->d_inode;
1949 llss->inode2 = file2->f_dentry->d_inode;
1951 if (!S_ISREG(llss->inode2->i_mode))
1952 GOTO(free, rc = -EINVAL);
1954 if (inode_permission(llss->inode1, MAY_WRITE) ||
1955 inode_permission(llss->inode2, MAY_WRITE))
1956 GOTO(free, rc = -EPERM);
1958 if (llss->inode2->i_sb != llss->inode1->i_sb)
1959 GOTO(free, rc = -EXDEV);
1961 /* we use 2 bool because it is easier to swap than 2 bits */
1962 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1963 llss->check_dv1 = true;
1965 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1966 llss->check_dv2 = true;
1968 /* we cannot use lsl->sl_dvX directly because we may swap them */
1969 llss->dv1 = lsl->sl_dv1;
1970 llss->dv2 = lsl->sl_dv2;
1972 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1973 if (rc == 0) /* same file, done! */
1976 if (rc < 0) { /* sequentialize it */
1977 swap(llss->inode1, llss->inode2);
1979 swap(llss->dv1, llss->dv2);
1980 swap(llss->check_dv1, llss->check_dv2);
1984 if (gid != 0) { /* application asks to flush dirty cache */
1985 rc = ll_get_grouplock(llss->inode1, file1, gid);
1989 rc = ll_get_grouplock(llss->inode2, file2, gid);
1991 ll_put_grouplock(llss->inode1, file1, gid);
1996 /* to be able to restore mtime and atime after swap
1997 * we need to first save them */
1999 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2000 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2001 llss->ia1.ia_atime = llss->inode1->i_atime;
2002 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2003 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2004 llss->ia2.ia_atime = llss->inode2->i_atime;
2005 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2008 /* ultimate check, before swapping the layouts we check if
2009 * dataversion has changed (if requested) */
2010 if (llss->check_dv1) {
2011 rc = ll_data_version(llss->inode1, &dv, 0);
2014 if (dv != llss->dv1)
2015 GOTO(putgl, rc = -EAGAIN);
2018 if (llss->check_dv2) {
2019 rc = ll_data_version(llss->inode2, &dv, 0);
2022 if (dv != llss->dv2)
2023 GOTO(putgl, rc = -EAGAIN);
2026 /* struct md_op_data is used to send the swap args to the mdt
2027 * only flags is missing, so we use struct mdc_swap_layouts
2028 * through the md_op_data->op_data */
2029 /* flags from user space have to be converted before they are send to
2030 * server, no flag is sent today, they are only used on the client */
2033 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2034 0, LUSTRE_OPC_ANY, &msl);
2035 if (IS_ERR(op_data))
2036 GOTO(free, rc = PTR_ERR(op_data));
2038 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2039 sizeof(*op_data), op_data, NULL);
2040 ll_finish_md_op_data(op_data);
2044 ll_put_grouplock(llss->inode2, file2, gid);
2045 ll_put_grouplock(llss->inode1, file1, gid);
2048 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2052 /* clear useless flags */
2053 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2054 llss->ia1.ia_valid &= ~ATTR_MTIME;
2055 llss->ia2.ia_valid &= ~ATTR_MTIME;
2058 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2059 llss->ia1.ia_valid &= ~ATTR_ATIME;
2060 llss->ia2.ia_valid &= ~ATTR_ATIME;
2063 /* update time if requested */
/* ia2 (saved from inode2) now belongs to inode1 after the swap, and
 * vice versa. */
2065 if (llss->ia2.ia_valid != 0) {
2066 mutex_lock(&llss->inode1->i_mutex);
2067 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2068 mutex_unlock(&llss->inode1->i_mutex);
2071 if (llss->ia1.ia_valid != 0) {
2074 mutex_lock(&llss->inode2->i_mutex);
2075 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2076 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Set/clear HSM state flags on @inode via an LL_IOC_HSM_STATE_SET
 * obd_iocontrol to the MDT. Flags outside HSM_USER_MASK require
 * CAP_SYS_ADMIN; the hsm_state_set request travels in op_data.
 */
2088 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2090 struct md_op_data *op_data;
2093 /* Non-root users are forbidden to set or clear flags which are
2094 * NOT defined in HSM_USER_MASK. */
2095 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2096 !cfs_capable(CFS_CAP_SYS_ADMIN))
2099 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2100 LUSTRE_OPC_ANY, hss);
2101 if (IS_ERR(op_data))
2102 return PTR_ERR(op_data);
2104 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2105 sizeof(*op_data), op_data, NULL);
2107 ll_finish_md_op_data(op_data);
/*
 * HSM import: register an existing archived copy as the content of a
 * regular file. First marks the file ARCHIVED|EXISTS|RELEASED with the
 * given archive id via ll_hsm_state_set(), then forces the file's mode
 * (as a regular file), ownership, size and [am]times to the values the
 * archive recorded, via ll_setattr_raw().
 */
2112 static int ll_hsm_import(struct inode *inode, struct file *file,
2113 struct hsm_user_import *hui)
2115 struct hsm_state_set *hss = NULL;
2116 struct iattr *attr = NULL;
2120 if (!S_ISREG(inode->i_mode))
2126 GOTO(out, rc = -ENOMEM);
2128 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2129 hss->hss_archive_id = hui->hui_archive_id;
2130 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2131 rc = ll_hsm_state_set(inode, hss);
2135 OBD_ALLOC_PTR(attr);
2137 GOTO(out, rc = -ENOMEM);
/* Only permission bits from the archive; force the regular-file type. */
2139 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2140 attr->ia_mode |= S_IFREG;
2141 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2142 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2143 attr->ia_size = hui->hui_size;
2144 attr->ia_mtime.tv_sec = hui->hui_mtime;
2145 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2146 attr->ia_atime.tv_sec = hui->hui_atime;
2147 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2149 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2150 ATTR_UID | ATTR_GID |
2151 ATTR_MTIME | ATTR_MTIME_SET |
2152 ATTR_ATIME | ATTR_ATIME_SET;
2154 rc = ll_setattr_raw(file->f_dentry, attr, true);
/*
 * Main ioctl dispatcher for regular files. Decodes @cmd and either
 * handles it inline (flag get/set, lease handling) or forwards to the
 * dedicated helper (striping, fiemap, group locks, fid2path, HSM,
 * layout swap). Unknown commands fall through to the registered
 * ll_iocontrol_call() hooks and finally to the data export's
 * obd_iocontrol().
 */
2168 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2170 struct inode *inode = file->f_dentry->d_inode;
2171 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2174 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2175 inode->i_generation, inode, cmd);
2176 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2178 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2179 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
/* --- per-fd flag manipulation --- */
2183 case LL_IOC_GETFLAGS:
2184 /* Get the current value of the file flags */
2185 return put_user(fd->fd_flags, (int *)arg);
2186 case LL_IOC_SETFLAGS:
2187 case LL_IOC_CLRFLAGS:
2188 /* Set or clear specific file flags */
2189 /* XXX This probably needs checks to ensure the flags are
2190 * not abused, and to handle any flag side effects.
2192 if (get_user(flags, (int *) arg))
2195 if (cmd == LL_IOC_SETFLAGS) {
/* Dropping DLM locking is only sane for O_DIRECT IO. */
2196 if ((flags & LL_FILE_IGNORE_LOCK) &&
2197 !(file->f_flags & O_DIRECT)) {
2198 CERROR("%s: unable to disable locking on "
2199 "non-O_DIRECT file\n", current->comm);
2203 fd->fd_flags |= flags;
2205 fd->fd_flags &= ~flags;
/* --- striping / layout --- */
2208 case LL_IOC_LOV_SETSTRIPE:
2209 return ll_lov_setstripe(inode, file, arg);
2210 case LL_IOC_LOV_SETEA:
2211 return ll_lov_setea(inode, file, arg);
2212 case LL_IOC_LOV_SWAP_LAYOUTS: {
2214 struct lustre_swap_layouts lsl;
2216 if (copy_from_user(&lsl, (char *)arg,
2217 sizeof(struct lustre_swap_layouts)))
2220 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2223 file2 = fget(lsl.sl_fd);
2228 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2229 rc = ll_swap_layouts(file, file2, &lsl);
2233 case LL_IOC_LOV_GETSTRIPE:
2234 return ll_lov_getstripe(inode, arg);
2235 case LL_IOC_RECREATE_OBJ:
2236 return ll_lov_recreate_obj(inode, arg);
2237 case LL_IOC_RECREATE_FID:
2238 return ll_lov_recreate_fid(inode, arg);
2239 case FSFILT_IOC_FIEMAP:
2240 return ll_ioctl_fiemap(inode, arg);
2241 case FSFILT_IOC_GETFLAGS:
2242 case FSFILT_IOC_SETFLAGS:
2243 return ll_iocontrol(inode, file, cmd, arg);
2244 case FSFILT_IOC_GETVERSION_OLD:
2245 case FSFILT_IOC_GETVERSION:
2246 return put_user(inode->i_generation, (int *)arg);
2247 case LL_IOC_GROUP_LOCK:
2248 return ll_get_grouplock(inode, file, arg);
2249 case LL_IOC_GROUP_UNLOCK:
2250 return ll_put_grouplock(inode, file, arg);
2251 case IOC_OBD_STATFS:
2252 return ll_obd_statfs(inode, (void *)arg);
2254 /* We need to special case any other ioctls we want to handle,
2255 * to send them to the MDS/OST as appropriate and to properly
2256 * network encode the arg field.
2257 case FSFILT_IOC_SETVERSION_OLD:
2258 case FSFILT_IOC_SETVERSION:
2260 case LL_IOC_FLUSHCTX:
2261 return ll_flush_ctx(inode);
/* --- FID / data-version queries --- */
2262 case LL_IOC_PATH2FID: {
2263 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2264 sizeof(struct lu_fid)))
2269 case OBD_IOC_FID2PATH:
2270 return ll_fid2path(inode, (void *)arg);
2271 case LL_IOC_DATA_VERSION: {
2272 struct ioc_data_version idv;
2275 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2278 rc = ll_data_version(inode, &idv.idv_version,
2279 !(idv.idv_flags & LL_DV_NOFLUSH));
2281 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2287 case LL_IOC_GET_MDTIDX: {
2290 mdtidx = ll_get_mdt_idx(inode);
2294 if (put_user((int)mdtidx, (int*)arg))
2299 case OBD_IOC_GETDTNAME:
2300 case OBD_IOC_GETMDNAME:
2301 return ll_get_obd_name(inode, cmd, arg);
/* --- HSM state queries / updates (forwarded to the MDT) --- */
2302 case LL_IOC_HSM_STATE_GET: {
2303 struct md_op_data *op_data;
2304 struct hsm_user_state *hus;
2311 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2312 LUSTRE_OPC_ANY, hus);
2313 if (IS_ERR(op_data)) {
2315 return PTR_ERR(op_data);
2318 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2321 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2324 ll_finish_md_op_data(op_data);
2328 case LL_IOC_HSM_STATE_SET: {
2329 struct hsm_state_set *hss;
2336 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2341 rc = ll_hsm_state_set(inode, hss);
2346 case LL_IOC_HSM_ACTION: {
2347 struct md_op_data *op_data;
2348 struct hsm_current_action *hca;
2355 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2356 LUSTRE_OPC_ANY, hca);
2357 if (IS_ERR(op_data)) {
2359 return PTR_ERR(op_data);
2362 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2365 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2368 ll_finish_md_op_data(op_data);
/* --- lease handling (state guarded by lli_och_mutex) --- */
2372 case LL_IOC_SET_LEASE: {
2373 struct ll_inode_info *lli = ll_i2info(inode);
2374 struct obd_client_handle *och = NULL;
2380 if (!(file->f_mode & FMODE_WRITE))
2385 if (!(file->f_mode & FMODE_READ))
2390 mutex_lock(&lli->lli_och_mutex);
2391 if (fd->fd_lease_och != NULL) {
2392 och = fd->fd_lease_och;
2393 fd->fd_lease_och = NULL;
2395 mutex_unlock(&lli->lli_och_mutex);
2398 mode = och->och_flags &
2399 (FMODE_READ|FMODE_WRITE);
2400 rc = ll_lease_close(och, inode, &lease_broken);
2401 if (rc == 0 && lease_broken)
2407 /* return the type of lease or error */
2408 return rc < 0 ? rc : (int)mode;
2413 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2415 /* apply for lease */
2416 och = ll_lease_open(inode, file, mode, 0);
2418 return PTR_ERR(och);
2421 mutex_lock(&lli->lli_och_mutex);
2422 if (fd->fd_lease_och == NULL) {
2423 fd->fd_lease_och = och;
2426 mutex_unlock(&lli->lli_och_mutex);
2428 /* impossible now that only excl is supported for now */
2429 ll_lease_close(och, inode, &lease_broken);
2434 case LL_IOC_GET_LEASE: {
2435 struct ll_inode_info *lli = ll_i2info(inode);
2436 struct ldlm_lock *lock = NULL;
2439 mutex_lock(&lli->lli_och_mutex);
2440 if (fd->fd_lease_och != NULL) {
2441 struct obd_client_handle *och = fd->fd_lease_och;
/* A cancelled lease lock means the lease is no longer valid. */
2443 lock = ldlm_handle2lock(&och->och_lease_handle);
2445 lock_res_and_lock(lock);
2446 if (!ldlm_is_cancel(lock))
2447 rc = och->och_flags &
2448 (FMODE_READ | FMODE_WRITE);
2449 unlock_res_and_lock(lock);
2450 ldlm_lock_put(lock);
2453 mutex_unlock(&lli->lli_och_mutex);
2456 case LL_IOC_HSM_IMPORT: {
2457 struct hsm_user_import *hui;
2463 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2468 rc = ll_hsm_import(inode, file, hui);
/* Unknown command: try registered hooks, then the data export. */
2477 ll_iocontrol_call(inode, file, cmd, arg, &err))
2480 return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek() entry point. For SEEK_END/SEEK_HOLE/SEEK_DATA the file size
 * must be current, so glimpse it from the OSTs first and pass the
 * refreshed EOF to generic_file_llseek_size(), bounded by the
 * filesystem's maximum file size.
 */
2487 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2489 struct inode *inode = file->f_dentry->d_inode;
2490 loff_t retval, eof = 0;
/* retval here is only the debug-traced target offset. */
2492 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2493 (origin == SEEK_CUR) ? file->f_pos : 0);
2494 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2495 inode->i_ino, inode->i_generation, inode, retval, retval,
2497 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2499 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2500 retval = ll_glimpse_size(inode);
2503 eof = i_size_read(inode);
2506 retval = generic_file_llseek_size(file, offset, origin,
2507 ll_file_maxbytes(inode), eof);
/*
 * flush() entry point (called on close()). Collects async writeback
 * errors recorded on the inode and its cl object and reports them as
 * -EIO — unless this fd already saw the write failure, in which case the
 * error is not reported a second time.
 */
2511 int ll_flush(struct file *file, fl_owner_t id)
2513 struct inode *inode = file->f_dentry->d_inode;
2514 struct ll_inode_info *lli = ll_i2info(inode);
2515 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2518 LASSERT(!S_ISDIR(inode->i_mode));
2520 /* catch async errors that were recorded back when async writeback
2521 * failed for pages in this mapping. */
2522 rc = lli->lli_async_rc;
2523 lli->lli_async_rc = 0;
2524 err = lov_read_and_clear_async_rc(lli->lli_clob);
2528 /* The application has been told write failure already.
2529 * Do not report failure again. */
2530 if (fd->fd_write_failed)
2532 return rc ? -EIO : 0;
2536 * Called to make sure a portion of file has been written out.
2537 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2539 * Return how many pages have been written.
/*
 * Write out (and per @mode, sync to the OSTs or discard) the [start,end]
 * byte range of @inode by running a CIT_FSYNC cl_io. @mode must be one
 * of the CL_FSYNC_* values; @ignore_layout lets the IO proceed across a
 * layout change. On success the result is the number of pages written
 * (fio->fi_nr_written).
 */
2541 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2542 enum cl_fsync_mode mode, int ignore_layout)
2544 struct cl_env_nest nest;
2547 struct obd_capa *capa = NULL;
2548 struct cl_fsync_io *fio;
2551 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2552 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2555 env = cl_env_nested_get(&nest);
2557 return PTR_ERR(env);
2559 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2561 io = ccc_env_thread_io(env);
2562 io->ci_obj = cl_i2info(inode)->lli_clob;
2563 io->ci_ignore_layout = ignore_layout;
2565 /* initialize parameters for sync */
2566 fio = &io->u.ci_fsync;
2567 fio->fi_capa = capa;
2568 fio->fi_start = start;
2570 fio->fi_fid = ll_inode2fid(inode);
2571 fio->fi_mode = mode;
2572 fio->fi_nr_written = 0;
2574 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2575 result = cl_io_loop(env, io);
2577 result = io->ci_result;
2579 result = fio->fi_nr_written;
2580 cl_io_fini(env, io);
2581 cl_env_nested_put(&nest, env);
2589 * When dentry is provided (the 'else' case), *file->f_dentry may be
2590 * null and dentry must be used directly rather than pulled from
2591 * *file->f_dentry as is done otherwise.
/*
 * fsync() entry point. Flushes the page cache range, gathers recorded
 * async writeback errors, syncs the metadata via md_sync() to the MDS,
 * and for regular-file datasync also forces the data to the OSTs with
 * cl_sync_file_range(CL_FSYNC_ALL...). fd_write_failed tracks whether
 * this fd has reported a data write failure (cleared once the sync
 * succeeds). Runs under i_mutex.
 */
2594 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2596 struct dentry *dentry = file->f_dentry;
2597 struct inode *inode = dentry->d_inode;
2598 struct ll_inode_info *lli = ll_i2info(inode);
2599 struct ptlrpc_request *req;
2600 struct obd_capa *oc;
2603 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2604 inode->i_generation, inode);
2605 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2607 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2608 mutex_lock(&inode->i_mutex);
2610 /* catch async errors that were recorded back when async writeback
2611 * failed for pages in this mapping. */
2612 if (!S_ISDIR(inode->i_mode)) {
2613 err = lli->lli_async_rc;
2614 lli->lli_async_rc = 0;
2617 err = lov_read_and_clear_async_rc(lli->lli_clob);
2622 oc = ll_mdscapa_get(inode);
2623 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2629 ptlrpc_req_finished(req);
2631 if (datasync && S_ISREG(inode->i_mode)) {
2632 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2634 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2636 if (rc == 0 && err < 0)
2639 fd->fd_write_failed = true;
2641 fd->fd_write_failed = false;
2644 mutex_unlock(&inode->i_mutex);
/*
 * VFS ->lock / ->flock handler: translate a kernel file_lock (POSIX fcntl
 * or BSD flock) into an LDLM_FLOCK enqueue on the MDS, then mirror the
 * result into the local lock lists.
 * NOTE(review): lossy extract -- flags/rc/rc2 declarations, several case
 * labels, default returns and closing braces are missing.
 */
2648 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2650 struct inode *inode = file->f_dentry->d_inode;
2651 struct ll_sb_info *sbi = ll_i2sbi(inode);
2652 struct ldlm_enqueue_info einfo = {
2653 .ei_type = LDLM_FLOCK,
2654 .ei_cb_cp = ldlm_flock_completion_ast,
2655 .ei_cbdata = file_lock,
2657 struct md_op_data *op_data;
2658 struct lustre_handle lockh = {0};
2659 ldlm_policy_data_t flock = {{0}};
2664 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2665 inode->i_ino, file_lock);
2667 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* fill the inodebits flock policy from the incoming file_lock */
2669 if (file_lock->fl_flags & FL_FLOCK) {
2670 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2671 /* flocks are whole-file locks */
2672 flock.l_flock.end = OFFSET_MAX;
2673 /* For flocks owner is determined by the local file descriptor */
2674 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2675 } else if (file_lock->fl_flags & FL_POSIX) {
2676 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2677 flock.l_flock.start = file_lock->fl_start;
2678 flock.l_flock.end = file_lock->fl_end;
2682 flock.l_flock.pid = file_lock->fl_pid;
2684 /* Somewhat ugly workaround for svc lockd.
2685 * lockd installs custom fl_lmops->lm_compare_owner that checks
2686 * for the fl_owner to be the same (which it always is on local node
2687 * I guess between lockd processes) and then compares pid.
2688 * As such we assign pid to the owner field to make it all work,
2689 * conflict with normal locks is unlikely since pid space and
2690 * pointer space for current->files are not intersecting */
2691 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2692 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock types to LDLM modes (PR=read, PW=write, NL=unlock) */
2694 switch (file_lock->fl_type) {
2696 einfo.ei_mode = LCK_PR;
2699 /* An unlock request may or may not have any relation to
2700 * existing locks so we may not be able to pass a lock handle
2701 * via a normal ldlm_lock_cancel() request. The request may even
2702 * unlock a byte range in the middle of an existing lock. In
2703 * order to process an unlock request we need all of the same
2704 * information that is given with a normal read or write record
2705 * lock request. To avoid creating another ldlm unlock (cancel)
2706 * message we'll treat a LCK_NL flock request as an unlock. */
2707 einfo.ei_mode = LCK_NL;
2710 einfo.ei_mode = LCK_PW;
2713 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2714 file_lock->fl_type);
/* map the fcntl command to LDLM enqueue flags */
2729 flags = LDLM_FL_BLOCK_NOWAIT;
2735 flags = LDLM_FL_TEST_LOCK;
2736 /* Save the old mode so that if the mode in the lock changes we
2737 * can decrement the appropriate reader or writer refcount. */
2738 file_lock->fl_type = einfo.ei_mode;
2741 CERROR("unknown fcntl lock command: %d\n", cmd);
2745 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2746 LUSTRE_OPC_ANY, NULL);
2747 if (IS_ERR(op_data))
2748 return PTR_ERR(op_data);
2750 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2751 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2752 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
/* enqueue the flock on the MDS */
2754 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2755 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* mirror a granted (or unlock) result into the local VFS lock lists */
2757 if ((file_lock->fl_flags & FL_FLOCK) &&
2758 (rc == 0 || file_lock->fl_type == F_UNLCK))
2759 rc2 = flock_lock_file_wait(file, file_lock);
2760 if ((file_lock->fl_flags & FL_POSIX) &&
2761 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2762 !(flags & LDLM_FL_TEST_LOCK))
2763 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed: release the server-side lock again */
2765 if (rc2 && file_lock->fl_type != F_UNLCK) {
2766 einfo.ei_mode = LCK_NL;
2767 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2768 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2772 ll_finish_md_op_data(op_data);
/* -o noflock variant of the lock handler. NOTE(review): body elided in
 * this extract; presumably reports flock as unsupported (the fops table
 * below says ENOSYS) -- confirm against the full source. */
2777 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * NOTE(review): lossy extract -- declarations of lockh-related locals
 * (fid, flags, i) and some braces are missing.
 */
2783 * test if some locks matching bits and l_req_mode are acquired
2784 * - bits can be in different locks
2785 * - if found clear the common lock bits in *bits
2786 * - the bits not found, are kept in *bits
2788 * \param bits [IN] searched lock bits [IN]
2789 * \param l_req_mode [IN] searched lock mode
2790 * \retval boolean, true iff all bits are found
2792 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2794 struct lustre_handle lockh;
2795 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four normal modes */
2796 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2797 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2805 fid = &ll_i2info(inode)->lli_fid;
2806 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2807 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, do not take a reference on the matched lock */
2809 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested inodebit individually */
2810 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2811 policy.l_inodebits.bits = *bits & (1 << i);
2812 if (policy.l_inodebits.bits == 0)
2815 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2816 &policy, mode, &lockh)) {
2817 struct ldlm_lock *lock;
2819 lock = ldlm_handle2lock(&lockh);
/* clear every bit covered by the matched lock, not just bit i */
2822 ~(lock->l_policy_data.l_inodebits.bits);
2823 LDLM_LOCK_PUT(lock);
2825 *bits &= ~policy.l_inodebits.bits;
/* Match an existing MDC ibits lock on @inode covering @bits; fills @lockh
 * and returns the matched mode (0 if none). Unlike ll_have_md_lock() this
 * does not pass LDLM_FL_TEST_LOCK, so the match keeps a reference.
 * NOTE(review): declarations of rc/fid and the RETURN are elided. */
2832 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2833 struct lustre_handle *lockh, __u64 flags,
2836 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2840 fid = &ll_i2info(inode)->lli_fid;
2841 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2843 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2844 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process a revalidate RPC result: -ENOENT on a non-regular,
 * non-directory inode is tolerated (obscure unlink races); any other
 * error is logged. NOTE(review): return statements elided in extract. */
2849 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2851 /* Already unlinked. Just update nlink and return success */
2852 if (rc == -ENOENT) {
2854 /* This path cannot be hit for regular files unless in
2855 * case of obscure races, so no need to validate size.
2857 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2859 } else if (rc != 0) {
2860 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2861 ll_get_fsname(inode->i_sb, NULL, 0),
2862 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate dentry/inode attributes against the MDS. Two paths: an
 * intent-based getattr-by-FID when the server supports OBD_CONNECT_ATTRFID,
 * otherwise a plain md_getattr when no matching ibits lock is cached.
 * NOTE(review): lossy extract -- rc/ealen declarations, several error
 * branches, GOTO/RETURN statements and closing braces are missing.
 */
2868 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2871 struct inode *inode = dentry->d_inode;
2872 struct ptlrpc_request *req = NULL;
2873 struct obd_export *exp;
2876 LASSERT(inode != NULL);
2878 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2879 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2881 exp = ll_i2mdexp(inode);
2883 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2884 * But under CMD case, it caused some lock issues, should be fixed
2885 * with new CMD ibits lock. See bug 12718 */
2886 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2887 struct lookup_intent oit = { .it_op = IT_GETATTR };
2888 struct md_op_data *op_data;
/* a pure LOOKUP revalidate only needs an IT_LOOKUP intent */
2890 if (ibits == MDS_INODELOCK_LOOKUP)
2891 oit.it_op = IT_LOOKUP;
2893 /* Call getattr by fid, so do not provide name at all. */
2894 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2895 dentry->d_inode, NULL, 0, 0,
2896 LUSTRE_OPC_ANY, NULL);
2897 if (IS_ERR(op_data))
2898 return PTR_ERR(op_data);
2900 oit.it_create_mode |= M_CHECK_STALE;
2901 rc = md_intent_lock(exp, op_data, NULL, 0,
2902 /* we are not interested in name
2905 ll_md_blocking_ast, 0);
2906 ll_finish_md_op_data(op_data);
2907 oit.it_create_mode &= ~M_CHECK_STALE;
2909 rc = ll_inode_revalidate_fini(inode, rc);
2913 rc = ll_revalidate_it_finish(req, &oit, dentry);
2915 ll_intent_release(&oit);
2919 /* Unlinked? Unhash dentry, so it is not picked up later by
2920 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2921 here to preserve get_cwd functionality on 2.6.
2923 if (!dentry->d_inode->i_nlink)
2924 d_lustre_invalidate(dentry, 0);
2926 ll_lookup_finish_locks(&oit, dentry);
2927 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2928 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2929 obd_valid valid = OBD_MD_FLGETATTR;
2930 struct md_op_data *op_data;
/* regular files also need striping EA, sized to the MDS maximum */
2933 if (S_ISREG(inode->i_mode)) {
2934 rc = ll_get_max_mdsize(sbi, &ealen);
2937 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2940 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2941 0, ealen, LUSTRE_OPC_ANY,
2943 if (IS_ERR(op_data))
2944 return PTR_ERR(op_data);
2946 op_data->op_valid = valid;
2947 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2948 * capa for this inode. Because we only keep capas of dirs
2950 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2951 ll_finish_md_op_data(op_data);
2953 rc = ll_inode_revalidate_fini(inode, rc);
/* refresh in-core inode from the getattr reply */
2957 rc = ll_prep_inode(&inode, req, NULL, NULL);
2960 ptlrpc_req_finished(req);
/* Revalidate and, for regular files, refresh size via glimpse; for
 * non-regular inodes copy timestamps from the cached lvb instead.
 * NOTE(review): rc declaration, error return and closing brace elided. */
2964 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2967 struct inode *inode = dentry->d_inode;
2970 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2974 /* if object isn't regular file, don't validate size */
2975 if (!S_ISREG(inode->i_mode)) {
2976 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2977 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2978 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2980 /* In case of restore, the MDT has the right size and has
2981 * already send it back without granting the layout lock,
2982 * inode is up-to-date so glimpse is useless.
2983 * Also to glimpse we need the layout, in case of a running
2984 * restore the MDT holds the layout lock so the glimpse will
2985 * block up to the end of restore (getattr will block)
2987 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
2988 rc = ll_glimpse_size(inode);
/* Fill @stat from the (revalidated) inode for the VFS getattr path.
 * NOTE(review): res declaration, its error return and the final return
 * are elided in this extract. */
2993 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2994 struct lookup_intent *it, struct kstat *stat)
2996 struct inode *inode = de->d_inode;
2997 struct ll_sb_info *sbi = ll_i2sbi(inode);
2998 struct ll_inode_info *lli = ll_i2info(inode);
/* refresh both UPDATE and LOOKUP ibits before reading attributes */
3001 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
3002 MDS_INODELOCK_LOOKUP);
3003 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3008 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland gets an ino squashed from the FID */
3009 if (ll_need_32bit_api(sbi))
3010 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3012 stat->ino = inode->i_ino;
3013 stat->mode = inode->i_mode;
3014 stat->nlink = inode->i_nlink;
3015 stat->uid = inode->i_uid;
3016 stat->gid = inode->i_gid;
3017 stat->rdev = inode->i_rdev;
3018 stat->atime = inode->i_atime;
3019 stat->mtime = inode->i_mtime;
3020 stat->ctime = inode->i_ctime;
3021 stat->blksize = 1 << inode->i_blkbits;
3023 stat->size = i_size_read(inode);
3024 stat->blocks = inode->i_blocks;
/* VFS ->getattr entry point: wraps ll_getattr_it() with a fresh
 * IT_GETATTR intent. */
3028 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3030 struct lookup_intent it = { .it_op = IT_GETATTR };
3032 return ll_getattr_it(mnt, de, &it, stat);
/* VFS ->fiemap: marshal fieinfo into a ll_user_fiemap request, run
 * ll_do_fiemap(), and copy mapped extents back to userspace memory.
 * NOTE(review): rc/num_bytes declarations, the allocation-failure check
 * and error returns are elided in this extract. */
3035 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3036 __u64 start, __u64 len)
3040 struct ll_user_fiemap *fiemap;
3041 unsigned int extent_count = fieinfo->fi_extents_max;
3043 num_bytes = sizeof(*fiemap) + (extent_count *
3044 sizeof(struct ll_fiemap_extent));
3045 OBD_ALLOC_LARGE(fiemap, num_bytes);
3050 fiemap->fm_flags = fieinfo->fi_flags;
3051 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3052 fiemap->fm_start = start;
3053 fiemap->fm_length = len;
/* seed the request with the first user-provided extent only */
3054 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3055 sizeof(struct ll_fiemap_extent));
3057 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3059 fieinfo->fi_flags = fiemap->fm_flags;
3060 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3061 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3062 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3064 OBD_FREE_LARGE(fiemap, num_bytes);
/* VFS ->get_acl: return a duplicated reference to the cached POSIX ACL
 * under lli_lock. NOTE(review): the final return is elided in extract. */
3068 struct posix_acl * ll_get_acl(struct inode *inode, int type)
3070 struct ll_inode_info *lli = ll_i2info(inode);
3071 struct posix_acl *acl = NULL;
3073 spin_lock(&lli->lli_lock);
3074 /* VFS' acl_permission_check->check_acl will release the refcount */
3075 acl = posix_acl_dup(lli->lli_posix_acl);
3076 spin_unlock(&lli->lli_lock);
/* VFS ->permission: revalidate the root inode on first touch, defer to
 * the remote-permission path for RMT_CLIENT mounts, otherwise use
 * generic_permission(). NOTE(review): rc declaration, the MAY_NOT_BLOCK
 * return and some error returns are elided in this extract. */
3082 int ll_inode_permission(struct inode *inode, int mask)
/* RCU walk cannot block on RPCs; bail out so VFS retries in ref-walk */
3086 #ifdef MAY_NOT_BLOCK
3087 if (mask & MAY_NOT_BLOCK)
3091 /* as root inode are NOT getting validated in lookup operation,
3092 * need to do it before permission check. */
3094 if (inode == inode->i_sb->s_root->d_inode) {
3095 struct lookup_intent it = { .it_op = IT_LOOKUP };
3097 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3098 MDS_INODELOCK_LOOKUP);
3103 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3104 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3106 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3107 return lustre_check_remote_perm(inode, mask);
3109 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3110 rc = generic_permission(inode, mask);
/* Default file_operations: no ->flock/->lock entries, so flock falls back
 * to local-only semantics. NOTE(review): trailing members (fsync, etc.)
 * and the closing brace are elided in this extract. */
3115 /* -o localflock - only provides locally consistent flock locks */
3116 struct file_operations ll_file_operations = {
3117 .read = ll_file_read,
3118 .aio_read = ll_file_aio_read,
3119 .write = ll_file_write,
3120 .aio_write = ll_file_aio_write,
3121 .unlocked_ioctl = ll_file_ioctl,
3122 .open = ll_file_open,
3123 .release = ll_file_release,
3124 .mmap = ll_file_mmap,
3125 .llseek = ll_file_seek,
3126 .splice_read = ll_file_splice_read,
/* file_operations for -o flock mounts: identical to the default table but
 * routes ->flock and ->lock through the cluster-wide ll_file_flock(). */
3131 struct file_operations ll_file_operations_flock = {
3132 .read = ll_file_read,
3133 .aio_read = ll_file_aio_read,
3134 .write = ll_file_write,
3135 .aio_write = ll_file_aio_write,
3136 .unlocked_ioctl = ll_file_ioctl,
3137 .open = ll_file_open,
3138 .release = ll_file_release,
3139 .mmap = ll_file_mmap,
3140 .llseek = ll_file_seek,
3141 .splice_read = ll_file_splice_read,
3144 .flock = ll_file_flock,
3145 .lock = ll_file_flock
/* file_operations for -o noflock mounts: ->flock/->lock report ENOSYS. */
3148 /* These are for -o noflock - to return ENOSYS on flock calls */
3149 struct file_operations ll_file_operations_noflock = {
3150 .read = ll_file_read,
3151 .aio_read = ll_file_aio_read,
3152 .write = ll_file_write,
3153 .aio_write = ll_file_aio_write,
3154 .unlocked_ioctl = ll_file_ioctl,
3155 .open = ll_file_open,
3156 .release = ll_file_release,
3157 .mmap = ll_file_mmap,
3158 .llseek = ll_file_seek,
3159 .splice_read = ll_file_splice_read,
3162 .flock = ll_file_noflock,
3163 .lock = ll_file_noflock
/* inode_operations for regular Lustre files. */
3166 struct inode_operations ll_file_inode_operations = {
3167 .setattr = ll_setattr,
3168 .getattr = ll_getattr,
3169 .permission = ll_inode_permission,
3170 .setxattr = ll_setxattr,
3171 .getxattr = ll_getxattr,
3172 .listxattr = ll_listxattr,
3173 .removexattr = ll_removexattr,
3174 .fiemap = ll_fiemap,
3175 .get_acl = ll_get_acl,
/* Registry of dynamically registered ioctl handlers: a rw_semaphore
 * protecting a list of llioc_data entries. NOTE(review): the variable
 * name line ("} llioc = {") is elided between these lines. */
3178 /* dynamic ioctl number support routines */
3179 static struct llioc_ctl_data {
3180 struct rw_semaphore ioc_sem;
3181 struct list_head ioc_head;
3183 __RWSEM_INITIALIZER(llioc.ioc_sem),
3184 LIST_HEAD_INIT(llioc.ioc_head)
/* one registered handler: callback plus the ioctl numbers it serves,
 * stored inline in a flexible-style trailing array */
3189 struct list_head iocd_list;
3190 unsigned int iocd_size;
3191 llioc_callback_t iocd_cb;
3192 unsigned int iocd_count;
3193 unsigned int iocd_cmd[0];
/* Register @cb for @count ioctl numbers in @cmd; returns an opaque magic
 * (the allocated llioc_data) used later for unregistration.
 * NOTE(review): size declaration, NULL returns and the final return of
 * in_data are elided in this extract. */
3196 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3199 struct llioc_data *in_data = NULL;
3201 if (cb == NULL || cmd == NULL ||
3202 count > LLIOC_MAX_CMD || count < 0)
3205 size = sizeof(*in_data) + count * sizeof(unsigned int);
3206 OBD_ALLOC(in_data, size);
3207 if (in_data == NULL)
3210 memset(in_data, 0, sizeof(*in_data));
3211 in_data->iocd_size = size;
3212 in_data->iocd_cb = cb;
3213 in_data->iocd_count = count;
3214 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the write side of the registry semaphore */
3216 down_write(&llioc.ioc_sem);
3217 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3218 up_write(&llioc.ioc_sem);
/* Remove and free the registration identified by @magic (the pointer
 * returned by ll_iocontrol_register). NOTE(review): the NULL-magic guard
 * and the "tmp == magic" comparison line are elided in this extract. */
3223 void ll_iocontrol_unregister(void *magic)
3225 struct llioc_data *tmp;
3230 down_write(&llioc.ioc_sem);
3231 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3233 unsigned int size = tmp->iocd_size;
3235 list_del(&tmp->iocd_list);
/* drop the semaphore before freeing; tmp is already unlinked */
3236 up_write(&llioc.ioc_sem);
3238 OBD_FREE(tmp, size);
3242 up_write(&llioc.ioc_sem);
3244 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3247 EXPORT_SYMBOL(ll_iocontrol_register);
3248 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch an ioctl to registered dynamic handlers; the first handler
 * returning LLIOC_STOP wins and its rc is stored in *rcp.
 * NOTE(review): the *rcp store and final return are elided in extract. */
3250 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3251 unsigned int cmd, unsigned long arg, int *rcp)
3253 enum llioc_iter ret = LLIOC_CONT;
3254 struct llioc_data *data;
3255 int rc = -EINVAL, i;
3257 down_read(&llioc.ioc_sem);
3258 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3259 for (i = 0; i < data->iocd_count; i++) {
3260 if (cmd != data->iocd_cmd[i])
3263 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3267 if (ret == LLIOC_STOP)
3270 up_read(&llioc.ioc_sem);
/* Apply a layout configuration to the inode's cl_object via cl_conf_set;
 * on OBJECT_CONF_SET, allow the layout lock to be matched only after the
 * layout is in place. NOTE(review): env/result declarations, the no-clob
 * early return and final RETURN are elided in this extract. */
3277 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3279 struct ll_inode_info *lli = ll_i2info(inode);
3280 struct cl_env_nest nest;
3284 if (lli->lli_clob == NULL)
3287 env = cl_env_nested_get(&nest);
3289 return PTR_ERR(env);
3291 result = cl_conf_set(env, lli->lli_clob, conf);
3292 cl_env_nested_put(&nest, env);
3294 if (conf->coc_opc == OBJECT_CONF_SET) {
3295 struct ldlm_lock *lock = conf->coc_lock;
3297 LASSERT(lock != NULL);
3298 LASSERT(ldlm_has_layout(lock));
3300 /* it can only be allowed to match after layout is
3301 * applied to inode otherwise false layout would be
3302 * seen. Applying layout should happen before dropping
3303 * the intent lock. */
3304 ldlm_lock_allow_match(lock);
/* NOTE(review): lossy extract -- rc/lmmsize/lmm/lvbdata declarations,
 * several error branches and the out: label are elided. */
3310 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3311 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3314 struct ll_sb_info *sbi = ll_i2sbi(inode);
3315 struct obd_capa *oc;
3316 struct ptlrpc_request *req;
3317 struct mdt_body *body;
3323 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3324 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3325 lock->l_lvb_data, lock->l_lvb_len);
/* fast path: LVB already populated and marked ready */
3327 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3330 /* if layout lock was granted right away, the layout is returned
3331 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3332 * blocked and then granted via completion ast, we have to fetch
3333 * layout here. Please note that we can't use the LVB buffer in
3334 * completion AST because it doesn't have a large enough buffer */
3335 oc = ll_mdscapa_get(inode);
3336 rc = ll_get_max_mdsize(sbi, &lmmsize);
3338 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3339 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3345 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3346 if (body == NULL || body->eadatasize > lmmsize)
3347 GOTO(out, rc = -EPROTO);
3349 lmmsize = body->eadatasize;
3350 if (lmmsize == 0) /* empty layout */
3353 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3355 GOTO(out, rc = -EFAULT);
3357 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3358 if (lvbdata == NULL)
3359 GOTO(out, rc = -ENOMEM);
/* install the fetched LOV EA as the lock's LVB, replacing any old one */
3361 memcpy(lvbdata, lmm, lmmsize);
3362 lock_res_and_lock(lock);
3363 if (lock->l_lvb_data != NULL)
3364 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3366 lock->l_lvb_data = lvbdata;
3367 lock->l_lvb_len = lmmsize;
3368 unlock_res_and_lock(lock);
3371 ptlrpc_req_finished(req);
/*
 * NOTE(review): lossy extract -- rc/lvb_ready declarations, several GOTO
 * labels (out:) and error branches are elided; comments describe only
 * the visible lines.
 */
3376 * Apply the layout to the inode. Layout lock is held and will be released
3379 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3380 struct inode *inode, __u32 *gen, bool reconf)
3382 struct ll_inode_info *lli = ll_i2info(inode);
3383 struct ll_sb_info *sbi = ll_i2sbi(inode);
3384 struct ldlm_lock *lock;
3385 struct lustre_md md = { NULL };
3386 struct cl_object_conf conf;
3389 bool wait_layout = false;
3391 LASSERT(lustre_handle_is_used(lockh));
3393 lock = ldlm_handle2lock(lockh);
3394 LASSERT(lock != NULL);
3395 LASSERT(ldlm_has_layout(lock));
3397 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3398 inode, PFID(&lli->lli_fid), reconf);
3400 /* in case this is a caching lock and reinstate with new inode */
3401 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3403 lock_res_and_lock(lock);
3404 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3405 unlock_res_and_lock(lock);
3406 /* checking lvb_ready is racy but this is okay. The worst case is
3407 * that multi processes may configure the file on the same time. */
3408 if (lvb_ready || !reconf) {
3411 /* layout_gen must be valid if layout lock is not
3412 * cancelled and stripe has already set */
3413 *gen = lli->lli_layout_gen;
/* ensure the LVB actually holds the layout before unpacking it */
3419 rc = ll_layout_fetch(inode, lock);
3423 /* for layout lock, lmm is returned in lock's lvb.
3424 * lvb_data is immutable if the lock is held so it's safe to access it
3425 * without res lock. See the description in ldlm_lock_decref_internal()
3426 * for the condition to free lvb_data of layout lock */
3427 if (lock->l_lvb_data != NULL) {
3428 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3429 lock->l_lvb_data, lock->l_lvb_len);
3431 *gen = LL_LAYOUT_GEN_EMPTY;
3433 *gen = md.lsm->lsm_layout_gen;
3436 CERROR("%s: file "DFID" unpackmd error: %d\n",
3437 ll_get_fsname(inode->i_sb, NULL, 0),
3438 PFID(&lli->lli_fid), rc);
3444 /* set layout to file. Unlikely this will fail as old layout was
3445 * surely eliminated */
3446 memset(&conf, 0, sizeof(conf));
3447 conf.coc_opc = OBJECT_CONF_SET;
3448 conf.coc_inode = inode;
3449 conf.coc_lock = lock;
3450 conf.u.coc_md = &md;
3451 rc = ll_layout_conf(inode, &conf);
/* the unpacked stripe md is no longer needed once applied */
3454 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3456 /* refresh layout failed, need to wait */
3457 wait_layout = rc == -EBUSY;
3460 LDLM_LOCK_PUT(lock);
3461 ldlm_lock_decref(lockh, mode);
3463 /* wait for IO to complete if it's still being used. */
3465 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3466 ll_get_fsname(inode->i_sb, NULL, 0),
3467 inode, PFID(&lli->lli_fid));
/* OBJECT_CONF_WAIT blocks until in-flight IO under the old layout ends */
3469 memset(&conf, 0, sizeof(conf));
3470 conf.coc_opc = OBJECT_CONF_WAIT;
3471 conf.coc_inode = inode;
3472 rc = ll_layout_conf(inode, &conf);
3476 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3477 PFID(&lli->lli_fid), rc);
/*
 * NOTE(review): lossy extract -- mode/rc declarations, einfo.ei_mode,
 * the "again:" label, early returns and closing braces are elided;
 * comments describe only the visible lines.
 */
3483 * This function checks if there exists a LAYOUT lock on the client side,
3484 * or enqueues it if it doesn't have one in cache.
3486 * This function will not hold layout lock so it may be revoked any time after
3487 * this function returns. Any operations depend on layout should be redone
3490 * This function should be called before lov_io_init() to get an uptodate
3491 * layout version, the caller should save the version number and after IO
3492 * is finished, this function should be called again to verify that layout
3493 * is not changed during IO time.
3495 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3497 struct ll_inode_info *lli = ll_i2info(inode);
3498 struct ll_sb_info *sbi = ll_i2sbi(inode);
3499 struct md_op_data *op_data;
3500 struct lookup_intent it;
3501 struct lustre_handle lockh;
3503 struct ldlm_enqueue_info einfo = {
3504 .ei_type = LDLM_IBITS,
3506 .ei_cb_bl = ll_md_blocking_ast,
3507 .ei_cb_cp = ldlm_completion_ast,
/* layout locking disabled on this mount: return cached generation */
3511 *gen = lli->lli_layout_gen;
3512 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3516 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3517 LASSERT(S_ISREG(inode->i_mode));
3519 /* mostly layout lock is caching on the local side, so try to match
3520 * it before grabbing layout lock mutex. */
3521 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3522 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3523 if (mode != 0) { /* hit cached lock */
3524 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3528 /* better hold lli_layout_mutex to try again otherwise
3529 * it will have starvation problem. */
3532 /* take layout lock mutex to enqueue layout lock exclusively. */
3533 mutex_lock(&lli->lli_layout_mutex);
3536 /* try again. Maybe somebody else has done this. */
3537 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3538 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3539 if (mode != 0) { /* hit cached lock */
3540 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3544 mutex_unlock(&lli->lli_layout_mutex);
3548 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3549 0, 0, LUSTRE_OPC_ANY, NULL);
3550 if (IS_ERR(op_data)) {
3551 mutex_unlock(&lli->lli_layout_mutex);
3552 return PTR_ERR(op_data);
3555 /* have to enqueue one */
3556 memset(&it, 0, sizeof(it));
3557 it.it_op = IT_LAYOUT;
3558 lockh.cookie = 0ULL;
3560 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3561 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3562 PFID(&lli->lli_fid));
3564 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* the intent's reply request is not needed once enqueue returns */
3566 if (it.d.lustre.it_data != NULL)
3567 ptlrpc_req_finished(it.d.lustre.it_data);
3568 it.d.lustre.it_data = NULL;
3570 ll_finish_md_op_data(op_data);
/* take over the lock reference from the intent before dropping it */
3572 mode = it.d.lustre.it_lock_mode;
3573 it.d.lustre.it_lock_mode = 0;
3574 ll_intent_drop_lock(&it);
3577 /* set lock data in case this is a new lock */
3578 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3579 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3583 mutex_unlock(&lli->lli_layout_mutex);
3589 * This function send a restore request to the MDT
3591 int ll_layout_restore(struct inode *inode)
3593 struct hsm_user_request *hur;
3596 len = sizeof(struct hsm_user_request) +
3597 sizeof(struct hsm_user_item);
3598 OBD_ALLOC(hur, len);
3602 hur->hur_request.hr_action = HUA_RESTORE;
3603 hur->hur_request.hr_archive_id = 0;
3604 hur->hur_request.hr_flags = 0;
3605 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3606 sizeof(hur->hur_user_item[0].hui_fid));
3607 hur->hur_user_item[0].hui_extent.length = -1;
3608 hur->hur_request.hr_itemcount = 1;
3609 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,