namei.c 119 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
18
#include <linux/export.h>
19
#include <linux/kernel.h>
Linus Torvalds's avatar
Linus Torvalds committed
20
21
22
23
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
Robert Love's avatar
Robert Love committed
24
#include <linux/fsnotify.h>
Linus Torvalds's avatar
Linus Torvalds committed
25
26
#include <linux/personality.h>
#include <linux/security.h>
Mimi Zohar's avatar
Mimi Zohar committed
27
#include <linux/ima.h>
Linus Torvalds's avatar
Linus Torvalds committed
28
29
30
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
31
#include <linux/capability.h>
32
#include <linux/file.h>
33
#include <linux/fcntl.h>
34
#include <linux/device_cgroup.h>
35
#include <linux/fs_struct.h>
36
#include <linux/posix_acl.h>
37
#include <linux/hash.h>
38
#include <linux/bitops.h>
Linus Torvalds's avatar
Linus Torvalds committed
39
40
#include <asm/uaccess.h>

41
#include "internal.h"
42
#include "mount.h"
43

Linus Torvalds's avatar
Linus Torvalds committed
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOs, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them..
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
122

Al Viro's avatar
Al Viro committed
123
#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
124

125
struct filename *
126
127
getname_flags(const char __user *filename, int flags, int *empty)
{
Al Viro's avatar
Al Viro committed
128
	struct filename *result;
129
	char *kname;
Al Viro's avatar
Al Viro committed
130
	int len;
131

132
133
134
135
	result = audit_reusename(filename);
	if (result)
		return result;

136
	result = __getname();
137
	if (unlikely(!result))
138
139
		return ERR_PTR(-ENOMEM);

140
141
142
143
	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
Al Viro's avatar
Al Viro committed
144
	kname = (char *)result->iname;
145
	result->name = kname;
146

Al Viro's avatar
Al Viro committed
147
	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
148
	if (unlikely(len < 0)) {
Al Viro's avatar
Al Viro committed
149
150
		__putname(result);
		return ERR_PTR(len);
151
	}
152

153
154
155
156
157
158
	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
Al Viro's avatar
Al Viro committed
159
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
Al Viro's avatar
Al Viro committed
160
		const size_t size = offsetof(struct filename, iname[1]);
161
162
		kname = (char *)result;

Al Viro's avatar
Al Viro committed
163
164
165
166
167
168
		/*
		 * size is chosen that way we to guarantee that
		 * result->iname[0] is within the same object and that
		 * kname can't be equal to result->iname, no matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
Al Viro's avatar
Al Viro committed
169
170
171
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
172
173
		}
		result->name = kname;
Al Viro's avatar
Al Viro committed
174
175
176
177
178
179
180
181
182
183
184
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
185
186
	}

Al Viro's avatar
Al Viro committed
187
	result->refcnt = 1;
188
189
190
	/* The empty path is special. */
	if (unlikely(!len)) {
		if (empty)
191
			*empty = 1;
Al Viro's avatar
Al Viro committed
192
193
194
195
		if (!(flags & LOOKUP_EMPTY)) {
			putname(result);
			return ERR_PTR(-ENOENT);
		}
Linus Torvalds's avatar
Linus Torvalds committed
196
	}
197

198
	result->uptr = filename;
199
	result->aname = NULL;
200
201
	audit_getname(result);
	return result;
Linus Torvalds's avatar
Linus Torvalds committed
202
203
}

204
205
/*
 * getname - getname_flags() with no flags and no interest in whether the
 * pathname was empty.
 */
struct filename *
getname(const char __user *filename)
{
	const int no_flags = 0;

	return getname_flags(filename, no_flags, NULL);
}

210
211
212
213
struct filename *
getname_kernel(const char * filename)
{
	struct filename *result;
214
	int len = strlen(filename) + 1;
215
216
217
218
219

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

220
	if (len <= EMBEDDED_NAME_MAX) {
Al Viro's avatar
Al Viro committed
221
		result->name = (char *)result->iname;
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
	} else if (len <= PATH_MAX) {
		struct filename *tmp;

		tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
		if (unlikely(!tmp)) {
			__putname(result);
			return ERR_PTR(-ENOMEM);
		}
		tmp->name = (char *)result;
		result = tmp;
	} else {
		__putname(result);
		return ERR_PTR(-ENAMETOOLONG);
	}
	memcpy((char *)result->name, filename, len);
237
238
	result->uptr = NULL;
	result->aname = NULL;
239
	result->refcnt = 1;
240
	audit_getname(result);
241
242
243
244

	return result;
}

245
void putname(struct filename *name)
Linus Torvalds's avatar
Linus Torvalds committed
246
{
247
248
249
250
251
	BUG_ON(name->refcnt <= 0);

	if (--name->refcnt > 0)
		return;

Al Viro's avatar
Al Viro committed
252
	if (name->name != name->iname) {
253
254
255
256
		__putname(name->name);
		kfree(name);
	} else
		__putname(name);
Linus Torvalds's avatar
Linus Torvalds committed
257
258
}

259
260
static int check_acl(struct inode *inode, int mask)
{
261
#ifdef CONFIG_FS_POSIX_ACL
262
263
264
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
265
266
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	        if (!acl)
267
	                return -EAGAIN;
268
		/* no ->get_acl() calls in RCU mode... */
269
		if (is_uncached_acl(acl))
270
			return -ECHILD;
271
	        return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
272
273
	}

Christoph Hellwig's avatar
Christoph Hellwig committed
274
275
276
	acl = get_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
277
278
279
280
281
	if (acl) {
	        int error = posix_acl_permission(inode, acl, mask);
	        posix_acl_release(acl);
	        return error;
	}
282
#endif
283
284
285
286

	return -EAGAIN;
}

287
/*
288
 * This does the basic permission checking
Linus Torvalds's avatar
Linus Torvalds committed
289
 */
290
static int acl_permission_check(struct inode *inode, int mask)
Linus Torvalds's avatar
Linus Torvalds committed
291
{
292
	unsigned int mode = inode->i_mode;
Linus Torvalds's avatar
Linus Torvalds committed
293

294
	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
Linus Torvalds's avatar
Linus Torvalds committed
295
296
		mode >>= 6;
	else {
297
		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
298
			int error = check_acl(inode, mask);
299
300
			if (error != -EAGAIN)
				return error;
Linus Torvalds's avatar
Linus Torvalds committed
301
302
303
304
305
306
307
308
309
		}

		if (in_group_p(inode->i_gid))
			mode >>= 3;
	}

	/*
	 * If the DACs are ok we don't need any capability check.
	 */
310
	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
Linus Torvalds's avatar
Linus Torvalds committed
311
		return 0;
312
313
314
315
	return -EACCES;
}

/**
316
 * generic_permission -  check for access rights on a Posix-like filesystem
317
 * @inode:	inode to check access rights for
318
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
319
320
321
322
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
323
324
325
326
327
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
328
 */
329
int generic_permission(struct inode *inode, int mask)
330
331
332
333
{
	int ret;

	/*
334
	 * Do the basic permission checks.
335
	 */
336
	ret = acl_permission_check(inode, mask);
337
338
	if (ret != -EACCES)
		return ret;
Linus Torvalds's avatar
Linus Torvalds committed
339

340
341
	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
342
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
343
344
			return 0;
		if (!(mask & MAY_WRITE))
345
346
			if (capable_wrt_inode_uidgid(inode,
						     CAP_DAC_READ_SEARCH))
347
348
349
				return 0;
		return -EACCES;
	}
Linus Torvalds's avatar
Linus Torvalds committed
350
351
	/*
	 * Read/write DACs are always overridable.
352
353
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
Linus Torvalds's avatar
Linus Torvalds committed
354
	 */
355
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
356
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
Linus Torvalds's avatar
Linus Torvalds committed
357
358
359
360
361
			return 0;

	/*
	 * Searching includes executable on directories, else just read.
	 */
362
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
363
	if (mask == MAY_READ)
364
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
Linus Torvalds's avatar
Linus Torvalds committed
365
366
367
368
			return 0;

	return -EACCES;
}
369
EXPORT_SYMBOL(generic_permission);
Linus Torvalds's avatar
Linus Torvalds committed
370

371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
/*
 * We _really_ want to go straight to generic_permission() without even
 * looking at inode->i_op.  IOP_FASTPERM in inode->i_opflags caches the
 * fact that this inode has no ->permission method of its own.
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
	if (likely(inode->i_opflags & IOP_FASTPERM))
		return generic_permission(inode, mask);

	if (likely(inode->i_op->permission))
		return inode->i_op->permission(inode, mask);

	/* This gets set once for the inode lifetime */
	spin_lock(&inode->i_lock);
	inode->i_opflags |= IOP_FASTPERM;
	spin_unlock(&inode->i_lock);

	return generic_permission(inode, mask);
}

Christoph Hellwig's avatar
Christoph Hellwig committed
391
/**
David Howells's avatar
David Howells committed
392
393
394
 * __inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
Christoph Hellwig's avatar
Christoph Hellwig committed
395
 *
David Howells's avatar
David Howells committed
396
 * Check for read/write/execute permissions on an inode.
397
398
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
David Howells's avatar
David Howells committed
399
400
401
 *
 * This does not check for a read-only file system.  You probably want
 * inode_permission().
Christoph Hellwig's avatar
Christoph Hellwig committed
402
 */
David Howells's avatar
David Howells committed
403
int __inode_permission(struct inode *inode, int mask)
Linus Torvalds's avatar
Linus Torvalds committed
404
{
405
	int retval;
Linus Torvalds's avatar
Linus Torvalds committed
406

407
	if (unlikely(mask & MAY_WRITE)) {
Linus Torvalds's avatar
Linus Torvalds committed
408
409
410
411
412
413
414
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EACCES;
	}

415
	retval = do_inode_permission(inode, mask);
Linus Torvalds's avatar
Linus Torvalds committed
416
417
418
	if (retval)
		return retval;

419
420
421
422
	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

423
	return security_inode_permission(inode, mask);
Linus Torvalds's avatar
Linus Torvalds committed
424
}
425
EXPORT_SYMBOL(__inode_permission);
Linus Torvalds's avatar
Linus Torvalds committed
426

David Howells's avatar
David Howells committed
427
428
429
/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 *
 * Returns -EROFS for a write request on a regular file, directory or
 * symlink of a read-only superblock, 0 otherwise.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
	umode_t mode;

	if (likely(!(mask & MAY_WRITE)))
		return 0;

	if (!(sb->s_flags & MS_RDONLY))
		return 0;

	/* Nobody gets write access to a read-only fs. */
	mode = inode->i_mode;
	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
		return -EROFS;

	return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct inode *inode, int mask)
{
	int retval;

	retval = sb_permission(inode->i_sb, inode, mask);
	if (retval)
		return retval;
	return __inode_permission(inode, mask);
}
468
EXPORT_SYMBOL(inode_permission);
David Howells's avatar
David Howells committed
469

Jan Blunck's avatar
Jan Blunck committed
470
471
472
473
474
475
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
476
void path_get(const struct path *path)
Jan Blunck's avatar
Jan Blunck committed
477
478
479
480
481
482
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

Jan Blunck's avatar
Jan Blunck committed
483
484
485
486
487
488
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
489
void path_put(const struct path *path)
Linus Torvalds's avatar
Linus Torvalds committed
490
{
Jan Blunck's avatar
Jan Blunck committed
491
492
	dput(path->dentry);
	mntput(path->mnt);
Linus Torvalds's avatar
Linus Torvalds committed
493
}
Jan Blunck's avatar
Jan Blunck committed
494
EXPORT_SYMBOL(path_put);
Linus Torvalds's avatar
Linus Torvalds committed
495

496
#define EMBEDDED_LEVELS 2
497
498
struct nameidata {
	struct path	path;
Al Viro's avatar
Al Viro committed
499
	struct qstr	last;
500
501
502
	struct path	root;
	struct inode	*inode; /* path.dentry.d_inode */
	unsigned int	flags;
503
	unsigned	seq, m_seq;
504
505
	int		last_type;
	unsigned	depth;
506
	int		total_link_count;
507
508
	struct saved {
		struct path link;
509
		struct delayed_call done;
510
		const char *name;
511
		unsigned seq;
512
	} *stack, internal[EMBEDDED_LEVELS];
513
514
	struct filename	*name;
	struct nameidata *saved;
515
	struct inode	*link_inode;
516
517
	unsigned	root_seq;
	int		dfd;
518
519
};

520
static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
521
{
522
523
	struct nameidata *old = current->nameidata;
	p->stack = p->internal;
524
525
	p->dfd = dfd;
	p->name = name;
526
	p->total_link_count = old ? old->total_link_count : 0;
527
	p->saved = old;
528
	current->nameidata = p;
529
530
}

531
static void restore_nameidata(void)
532
{
533
	struct nameidata *now = current->nameidata, *old = now->saved;
534
535
536
537

	current->nameidata = old;
	if (old)
		old->total_link_count = now->total_link_count;
538
	if (now->stack != now->internal)
539
		kfree(now->stack);
540
541
542
543
}

static int __nd_alloc_stack(struct nameidata *nd)
{
Al Viro's avatar
Al Viro committed
544
545
546
547
548
549
550
551
552
	struct saved *p;

	if (nd->flags & LOOKUP_RCU) {
		p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
				  GFP_ATOMIC);
		if (unlikely(!p))
			return -ECHILD;
	} else {
		p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
553
				  GFP_KERNEL);
Al Viro's avatar
Al Viro committed
554
555
556
		if (unlikely(!p))
			return -ENOMEM;
	}
557
558
559
560
561
	memcpy(p, nd->internal, sizeof(nd->internal));
	nd->stack = p;
	return 0;
}

562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
/**
 * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
 * @path: path to verify
 *
 * Rename can sometimes move a file or directory outside of a bind
 * mount, path_connected allows those cases to be detected.
 */
static bool path_connected(const struct path *path)
{
	struct vfsmount *mnt = path->mnt;

	/*
	 * Only bind mounts can have disconnected paths: when the mount
	 * root is the superblock root there is nothing to check.
	 */
	return mnt->mnt_root == mnt->mnt_sb->s_root ||
	       is_subdir(path->dentry, mnt->mnt_root);
}

580
581
static inline int nd_alloc_stack(struct nameidata *nd)
{
582
	if (likely(nd->depth != EMBEDDED_LEVELS))
583
584
585
586
587
588
		return 0;
	if (likely(nd->stack != nd->internal))
		return 0;
	return __nd_alloc_stack(nd);
}

589
590
591
592
593
/*
 * Run and then clear the delayed call of every link-stack entry in use,
 * from the most recently pushed one down to the first.
 */
static void drop_links(struct nameidata *nd)
{
	int i;

	for (i = nd->depth; i > 0; i--) {
		struct saved *entry = &nd->stack[i - 1];

		do_delayed_call(&entry->done);
		clear_delayed_call(&entry->done);
	}
}

/*
 * End a path walk: release the pending links and, depending on the mode,
 * either leave rcu-walk or drop the references ref-walk was holding.
 * nd->root is left alone when LOOKUP_ROOT is set.
 */
static void terminate_walk(struct nameidata *nd)
{
	drop_links(nd);

	if (nd->flags & LOOKUP_RCU) {
		/* rcu-walk holds no references, just leave the mode */
		nd->flags &= ~LOOKUP_RCU;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
		rcu_read_unlock();
	} else {
		unsigned int i;

		path_put(&nd->path);
		for (i = 0; i < nd->depth; i++)
			path_put(&nd->stack[i].link);
		if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
			path_put(&nd->root);
			nd->root.mnt = NULL;
		}
	}

	nd->depth = 0;
}

/*
 * Try to take real references on @path and check that its dentry has not
 * changed since @seq was sampled.  A part for which no reference was
 * obtained is set to NULL, so path_put() is needed afterwards regardless
 * of success or failure.
 */
static bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
	int mnt_status = __legitimize_mnt(path->mnt, nd->m_seq);

	/* a positive status means there is no mount reference to drop */
	if (unlikely(mnt_status > 0))
		path->mnt = NULL;

	if (unlikely(mnt_status) ||
	    unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
		path->dentry = NULL;
		return false;
	}

	return !read_seqcount_retry(&path->dentry->d_seq, seq);
}

/*
 * Legitimize every link on the stack.  On the first failure the delayed
 * calls are dropped and nd->depth is trimmed to cover the entries that
 * were attempted, including the failing one.
 */
static bool legitimize_links(struct nameidata *nd)
{
	int i;

	for (i = 0; i < nd->depth; i++) {
		struct saved *entry = &nd->stack[i];

		if (likely(legitimize_path(nd, &entry->link, entry->seq)))
			continue;

		drop_links(nd);
		nd->depth = i + 1;
		return false;
	}

	return true;
}

Al Viro's avatar
Al Viro committed
652
/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
Al Viro's avatar
Al Viro committed
664
665
666
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry or NULL
667
 * @seq: seq number to check dentry against
668
 * Returns: 0 on success, -ECHILD on failure
Nick Piggin's avatar
Nick Piggin committed
669
 *
Al Viro's avatar
Al Viro committed
670
671
672
 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd or NULL.  Must be called from rcu-walk context.
673
674
 * Nothing should touch nameidata between unlazy_walk() failure and
 * terminate_walk().
Nick Piggin's avatar
Nick Piggin committed
675
 */
676
static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq)
Nick Piggin's avatar
Nick Piggin committed
677
678
679
680
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
681
682

	nd->flags &= ~LOOKUP_RCU;
683
684
685
686
687
688
	if (unlikely(!legitimize_links(nd)))
		goto out2;
	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
		goto out2;
	if (unlikely(!lockref_get_not_dead(&parent->d_lockref)))
		goto out1;
Al Viro's avatar
Al Viro committed
689

690
691
692
693
694
695
696
697
698
699
700
	/*
	 * For a negative lookup, the lookup sequence point is the parents
	 * sequence point, and it only needs to revalidate the parent dentry.
	 *
	 * For a positive lookup, we need to move both the parent and the
	 * dentry from the RCU domain to be properly refcounted. And the
	 * sequence number in the dentry validates *both* dentry counters,
	 * since we checked the sequence number of the parent after we got
	 * the child sequence number. So we know the parent must still
	 * be valid if the child sequence number is still valid.
	 */
Al Viro's avatar
Al Viro committed
701
	if (!dentry) {
702
703
		if (read_seqcount_retry(&parent->d_seq, nd->seq))
			goto out;
Al Viro's avatar
Al Viro committed
704
705
		BUG_ON(nd->inode != parent->d_inode);
	} else {
706
707
		if (!lockref_get_not_dead(&dentry->d_lockref))
			goto out;
708
		if (read_seqcount_retry(&dentry->d_seq, seq))
709
			goto drop_dentry;
Al Viro's avatar
Al Viro committed
710
	}
711
712
713
714
715
716

	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
717
718
719
720
		if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
			rcu_read_unlock();
			dput(dentry);
			return -ECHILD;
721
		}
Nick Piggin's avatar
Nick Piggin committed
722
723
	}

Al Viro's avatar
Al Viro committed
724
	rcu_read_unlock();
Nick Piggin's avatar
Nick Piggin committed
725
	return 0;
Al Viro's avatar
Al Viro committed
726

727
drop_dentry:
Al Viro's avatar
Al Viro committed
728
	rcu_read_unlock();
729
	dput(dentry);
730
	goto drop_root_mnt;
731
732
733
734
out2:
	nd->path.mnt = NULL;
out1:
	nd->path.dentry = NULL;
735
out:
Al Viro's avatar
Al Viro committed
736
	rcu_read_unlock();
737
738
739
drop_root_mnt:
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
Nick Piggin's avatar
Nick Piggin committed
740
741
742
	return -ECHILD;
}

743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
/*
 * unlazy_link - leave rcu-walk while holding on to a symlink
 * @nd:   nameidata pathwalk data
 * @link: the symlink's path, to be legitimized first
 * @seq:  seq number to check link->dentry against
 *
 * Returns 0 once both @link and the walk itself have been switched to
 * ref-walk mode.  On any failure whatever was obtained on @link is
 * released with path_put() and -ECHILD is returned.
 */
static int unlazy_link(struct nameidata *nd, struct path *link, unsigned seq)
{
	if (unlikely(!legitimize_path(nd, link, seq))) {
		/*
		 * Nothing in nd has been legitimized: forget the unreferenced
		 * pointers and leave rcu-walk by hand.
		 */
		drop_links(nd);
		nd->depth = 0;
		nd->flags &= ~LOOKUP_RCU;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
		rcu_read_unlock();
	} else if (likely(unlazy_walk(nd, NULL, 0) == 0)) {
		/* the hint covers the comparison: success is the likely case */
		return 0;
	}
	path_put(link);
	return -ECHILD;
}

761
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
762
{
763
	return dentry->d_op->d_revalidate(dentry, flags);
764
765
}

766
767
768
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
769
 *
770
771
772
773
774
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
775
 */
776
static int complete_walk(struct nameidata *nd)
777
{
Al Viro's avatar
Al Viro committed
778
	struct dentry *dentry = nd->path.dentry;
779
780
	int status;

781
782
783
	if (nd->flags & LOOKUP_RCU) {
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
784
		if (unlikely(unlazy_walk(nd, NULL, 0)))
785
786
787
			return -ECHILD;
	}

Al Viro's avatar
Al Viro committed
788
789
790
	if (likely(!(nd->flags & LOOKUP_JUMPED)))
		return 0;

791
	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
792
793
		return 0;

794
	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
795
796
797
	if (status > 0)
		return 0;

Al Viro's avatar
Al Viro committed
798
	if (!status)
799
		status = -ESTALE;
Al Viro's avatar
Al Viro committed
800

801
802
803
	return status;
}

Al Viro's avatar
Al Viro committed
804
static void set_root(struct nameidata *nd)
Nick Piggin's avatar
Nick Piggin committed
805
{
806
	struct fs_struct *fs = current->fs;
Nick Piggin's avatar
Nick Piggin committed
807

808
809
810
811
812
813
814
815
816
817
818
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;

		do {
			seq = read_seqcount_begin(&fs->seq);
			nd->root = fs->root;
			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	} else {
		get_fs_root(fs, &nd->root);
	}
Nick Piggin's avatar
Nick Piggin committed
819
820
}

Jan Blunck's avatar
Jan Blunck committed
821
static void path_put_conditional(struct path *path, struct nameidata *nd)
822
823
{
	dput(path->dentry);
824
	if (path->mnt != nd->path.mnt)
825
826
827
		mntput(path->mnt);
}

828
829
static inline void path_to_nameidata(const struct path *path,
					struct nameidata *nd)
830
{
Nick Piggin's avatar
Nick Piggin committed
831
832
833
834
	if (!(nd->flags & LOOKUP_RCU)) {
		dput(nd->path.dentry);
		if (nd->path.mnt != path->mnt)
			mntput(nd->path.mnt);
835
	}
Nick Piggin's avatar
Nick Piggin committed
836
	nd->path.mnt = path->mnt;
837
	nd->path.dentry = path->dentry;
838
839
}

840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
/*
 * Move the walk to nd->root and mark it LOOKUP_JUMPED.
 *
 * Returns 0 on success, or -ECHILD in rcu-walk mode when the root
 * dentry's d_seq no longer matches nd->root_seq.
 */
static int nd_jump_root(struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU)) {
		/* ref-walk: swap the reference held on the position */
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->path);
		nd->inode = nd->path.dentry->d_inode;
	} else {
		struct dentry *root;

		nd->path = nd->root;
		root = nd->path.dentry;
		nd->inode = root->d_inode;
		nd->seq = nd->root_seq;
		if (unlikely(read_seqcount_retry(&root->d_seq, nd->seq)))
			return -ECHILD;
	}

	nd->flags |= LOOKUP_JUMPED;
	return 0;
}

Christoph Hellwig's avatar
Christoph Hellwig committed
860
/*
861
 * Helper to directly jump to a known parsed path from ->get_link,
Christoph Hellwig's avatar
Christoph Hellwig committed
862
863
 * caller must have taken a reference to path beforehand.
 */
864
void nd_jump_link(struct path *path)
Christoph Hellwig's avatar
Christoph Hellwig committed
865
{
866
	struct nameidata *nd = current->nameidata;
Christoph Hellwig's avatar
Christoph Hellwig committed
867
868
869
870
871
872
873
	path_put(&nd->path);

	nd->path = *path;
	nd->inode = nd->path.dentry->d_inode;
	nd->flags |= LOOKUP_JUMPED;
}

874
static inline void put_link(struct nameidata *nd)
875
{
Al Viro's avatar
Al Viro committed
876
	struct saved *last = nd->stack + --nd->depth;
877
	do_delayed_call(&last->done);
Al Viro's avatar
Al Viro committed
878
879
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
880
881
}

882
883
/* Link protection switches; both are off (zero) by default. */
int sysctl_protected_symlinks __read_mostly;
int sysctl_protected_hardlinks __read_mostly;
Kees Cook's avatar
Kees Cook committed
884
885
886

/**
 * may_follow_link - Check symlink following for unsafe situations
887
 * @nd: nameidata pathwalk data
Kees Cook's avatar
Kees Cook committed
888
889
890
891
892
893
894
895
896
897
898
899
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
Al Viro's avatar
Al Viro committed
900
static inline int may_follow_link(struct nameidata *nd)
Kees Cook's avatar
Kees Cook committed
901
902
903
904
905
906
907
908
{
	const struct inode *inode;
	const struct inode *parent;

	if (!sysctl_protected_symlinks)
		return 0;

	/* Allowed if owner and follower match. */
909
	inode = nd->link_inode;
910
	if (uid_eq(current_cred()->fsuid, inode->i_uid))
Kees Cook's avatar
Kees Cook committed
911
912
913
		return 0;

	/* Allowed if parent directory not sticky and world-writable. */
914
	parent = nd->inode;
Kees Cook's avatar
Kees Cook committed
915
916
917
918
	if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
		return 0;

	/* Allowed if parent directory and link owner match. */
919
	if (uid_eq(parent->i_uid, inode->i_uid))
Kees Cook's avatar
Kees Cook committed
920
921
		return 0;

922
923
924
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;

Al Viro's avatar
Al Viro committed
925
	audit_log_link_denied("follow_link", &nd->stack[0].link);
Kees Cook's avatar
Kees Cook committed
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
	return -EACCES;
}

/**
 * safe_hardlink_source - decide whether an inode may safely be hardlinked
 * @inode: the source inode to hardlink from
 *
 * The inode is rejected (false is returned) as soon as one of the
 * following is found to hold, checked in this order:
 *    - it is not a regular file
 *    - it has the setuid bit set
 *    - it has both the setgid and the group-execute bits set
 *    - inode_permission() refuses combined read and write access
 *
 * Returns true only when none of the above applies.
 */
static bool safe_hardlink_source(struct inode *inode)
{
	const umode_t sgid_exec = S_ISGID | S_IXGRP;
	umode_t mode = inode->i_mode;

	/* Neither special files nor setuid files may get pinned to the filesystem. */
	if (!S_ISREG(mode) || (mode & S_ISUID))
		return false;

	/* The same goes for setgid files that are group-executable. */
	if ((mode & sgid_exec) == sgid_exec)
		return false;

	/* Hardlinking to unreadable or unwritable sources is dangerous. */
	return inode_permission(inode, MAY_READ | MAY_WRITE) == 0;
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @link: the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
972
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
Kees Cook's avatar
Kees Cook committed
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
 *
 * Returns 0 if successful, -ve on error.
 */
static int may_linkat(struct path *link)
{
	struct inode *inode;

	if (!sysctl_protected_hardlinks)
		return 0;

	inode = link->dentry->d_inode;

	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
	 * otherwise, it must be a safe source.
	 */
988
	if (inode_owner_or_capable(inode) || safe_hardlink_source(inode))
Kees Cook's avatar
Kees Cook committed
989
990
		return 0;

991
	audit_log_link_denied("linkat", link);
Kees Cook's avatar
Kees Cook committed
992
993
994
	return -EPERM;
}

995
996
static __always_inline
const char *get_link(struct nameidata *nd)
Linus Torvalds's avatar
Linus Torvalds committed
997
{
998
	struct saved *last = nd->stack + nd->depth - 1;
Al Viro's avatar
Al Viro committed
999
	struct dentry *dentry = last->link.dentry;
1000
	struct inode *inode = nd->link_inode;
For faster browsing, not all history is shown. View entire blame