	 * the stale data before DIO completes the data IO.
	 *
	 * As to previously fallocated extents, ext4 get_block will
	 * simply mark the buffer mapped but still keep the
	 * extents uninitialized.
	 *
	 * For the non-AIO case, we convert those unwritten extents
	 * to written after returning from blockdev_direct_IO.
	 *
	 * For async DIO, the conversion needs to be deferred until
	 * the IO is completed. The ext4 end_io callback function
	 * will be called to take care of the conversion work. Here,
	 * for the async case, we allocate an io_end structure and
	 * hook it to the iocb.
	 */
	iocb->private = NULL;
	ext4_inode_aio_set(inode, NULL);
	if (!is_sync_kiocb(iocb)) {
		ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
		if (!io_end) {
			ret = -ENOMEM;
			goto retake_lock;
		}
		io_end->flag |= EXT4_IO_END_DIRECT;
		iocb->private = io_end;
		/*
		 * We save the io_end structure for the current async
		 * direct IO, so that ext4_map_blocks() can later flag
		 * it if there are unwritten extents that need to be
		 * converted when the IO completes.
		 */
		ext4_inode_aio_set(inode, io_end);
	}

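	/*
	 * In the overwrite case the blocks being written are already
	 * allocated and initialized; the caller has dropped i_mutex and
	 * holds i_data_sem for reading (see retake_lock below), so the
	 * lockless get_block variant can be used and DIO_LOCKING skipped.
	 */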
	if (overwrite) {
		get_block_func = ext4_get_block_write_nolock;
	} else {
		get_block_func = ext4_get_block_write;
		dio_flags = DIO_LOCKING;
	}
	ret = __blockdev_direct_IO(rw, iocb, inode,
				   inode->i_sb->s_bdev, iov,
				   offset, nr_segs,
				   get_block_func,
				   ext4_end_io_dio,
				   NULL,
				   dio_flags);

	if (iocb->private)
		ext4_inode_aio_set(inode, NULL);
	/*
	 * The io_end structure takes a reference to the inode; that
	 * structure needs to be destroyed and the reference to the
	 * inode dropped when the IO is complete, even for a zero-byte
	 * write or a failed one.
	 *
	 * In the successful AIO DIO case, the io_end structure will
	 * be destroyed and the reference to the inode will be dropped
	 * after the end_io callback function is called.
	 *
	 * In the zero-byte write or error case, the VFS direct IO
	 * code won't invoke the end_io callback, so we need to free
	 * the io_end structure here.
	 */
	if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
		ext4_free_io_end(iocb->private);
		iocb->private = NULL;
	} else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
						EXT4_STATE_DIO_UNWRITTEN)) {
		int err;
		/*
		 * For the non-AIO case the IO is already complete,
		 * so we can do the conversion right here.
		 */
		err = ext4_convert_unwritten_extents(inode,
						     offset, ret);
		if (err < 0)
			ret = err;
		ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
	}

retake_lock:
	/* take i_mutex again if we did an overwrite dio */
	if (overwrite) {
		inode_dio_done(inode);
		up_read(&EXT4_I(inode)->i_data_sem);
		mutex_lock(&inode->i_mutex);
	}

	return ret;
}

static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
			      const struct iovec *iov, loff_t offset,
			      unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	/*
	 * If we are doing data journalling we don't support O_DIRECT
	 */
	if (ext4_should_journal_data(inode))
		return 0;

	/* Let buffer I/O handle the inline data case. */
	if (ext4_has_inline_data(inode))
		return 0;

	trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
	else
		ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
	trace_ext4_direct_IO_exit(inode, offset,
				iov_length(iov, nr_segs), rw, ret);
	return ret;
}

/*
 * Pages can be marked dirty completely asynchronously from ext4's journalling
 * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
 * much here because ->set_page_dirty is called under VFS locks.  The page is
 * not necessarily locked.
 *
 * We cannot just dirty the page and leave attached buffers clean, because the
 * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
 * or jbddirty because all the journalling code will explode.
 *
 * So what we do is to mark the page "pending dirty" and next time writepage
 * is called, propagate that into the buffers appropriately.
 */
static int ext4_journalled_set_page_dirty(struct page *page)
{
	SetPageChecked(page);
	return __set_page_dirty_nobuffers(page);
}

static const struct address_space_operations ext4_ordered_aops = {
	.readpage		= ext4_readpage,
	.readpages		= ext4_readpages,
	.writepage		= ext4_writepage,
	.write_begin		= ext4_write_begin,
	.write_end		= ext4_ordered_write_end,
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
	.direct_IO		= ext4_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};

static const struct address_space_operations ext4_writeback_aops = {
	.readpage		= ext4_readpage,
	.readpages		= ext4_readpages,
	.writepage		= ext4_writepage,
	.write_begin		= ext4_write_begin,
	.write_end		= ext4_writeback_write_end,
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
	.direct_IO		= ext4_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};

static const struct address_space_operations ext4_journalled_aops = {
	.readpage		= ext4_readpage,
	.readpages		= ext4_readpages,
	.writepage		= ext4_writepage,
	.write_begin		= ext4_write_begin,
	.write_end		= ext4_journalled_write_end,
	.set_page_dirty		= ext4_journalled_set_page_dirty,
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
	.direct_IO		= ext4_direct_IO,
	.is_partially_uptodate  = block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};

static const struct address_space_operations ext4_da_aops = {
	.readpage		= ext4_readpage,
	.readpages		= ext4_readpages,
	.writepage		= ext4_writepage,
	.writepages		= ext4_da_writepages,
	.write_begin		= ext4_da_write_begin,
	.write_end		= ext4_da_write_end,
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_da_invalidatepage,
	.releasepage		= ext4_releasepage,
	.direct_IO		= ext4_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};

void ext4_set_aops(struct inode *inode)
{
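	/*
	 * Pick the address_space operations that match the inode's data
	 * journalling mode.  When delayed allocation is enabled, the
	 * delalloc aops are used instead, except in data=journal mode,
	 * which does not support delalloc.
	 */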
	switch (ext4_inode_journal_mode(inode)) {
	case EXT4_INODE_ORDERED_DATA_MODE:
		if (test_opt(inode->i_sb, DELALLOC))
			inode->i_mapping->a_ops = &ext4_da_aops;
		else
			inode->i_mapping->a_ops = &ext4_ordered_aops;
		break;
	case EXT4_INODE_WRITEBACK_DATA_MODE:
		if (test_opt(inode->i_sb, DELALLOC))
			inode->i_mapping->a_ops = &ext4_da_aops;
		else
			inode->i_mapping->a_ops = &ext4_writeback_aops;
		break;
	case EXT4_INODE_JOURNAL_DATA_MODE:
		inode->i_mapping->a_ops = &ext4_journalled_aops;
		break;
	default:
		BUG();
	}
}


/*
 * ext4_discard_partial_page_buffers()
 * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
 * This function finds and locks the page containing the offset
 * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
 * Calling functions that already have the page locked should call
 * ext4_discard_partial_page_buffers_no_lock directly.
 */
int ext4_discard_partial_page_buffers(handle_t *handle,
		struct address_space *mapping, loff_t from,
		loff_t length, int flags)
{
	struct inode *inode = mapping->host;
	struct page *page;
	int err = 0;

	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
				   mapping_gfp_mask(mapping) & ~__GFP_FS);
	if (!page)
		return -ENOMEM;

	err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
		from, length, flags);

	unlock_page(page);
	page_cache_release(page);
	return err;
}

/*
 * ext4_discard_partial_page_buffers_no_lock()
 * Zeros a page range of length 'length' starting from offset 'from'.
 * Buffer heads that correspond to the block aligned regions of the
 * zeroed range will be unmapped.  Non-block-aligned regions
 * will have the corresponding buffer head mapped if needed so that
 * that region of the page can be updated with the partial zero out.
 *
 * This function assumes that the page has already been locked.  The
 * range to be discarded must be contained within the given page.
 * If the specified range exceeds the end of the page it will be shortened
 * to the end of the page that corresponds to 'from'.  This function is
 * appropriate for updating a page and its buffer heads to be unmapped and
 * zeroed for blocks that have been either released, or are going to be
 * released.
 *
 * handle: The journal handle
 * inode:  The file's inode
 * page:   A locked page that contains the offset "from"
 * from:   The starting byte offset (from the beginning of the file)
 *         to begin discarding
 * len:    The length of bytes to discard
 * flags:  Optional flags that may be used:
 *
 *         EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
 *         Only zero the regions of the page whose buffer heads
 *         have already been unmapped.  This flag is appropriate
 *         for updating the contents of a page whose blocks may
 *         have already been released, and we only want to zero
 *         out the regions that correspond to those released blocks.
 *
 * Returns zero on success or negative on failure.
 */
static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
		struct inode *inode, struct page *page, loff_t from,
		loff_t length, int flags)
{
	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
	unsigned int offset = from & (PAGE_CACHE_SIZE-1);
	unsigned int blocksize, max, pos;
	ext4_lblk_t iblock;
	struct buffer_head *bh;
	int err = 0;

	blocksize = inode->i_sb->s_blocksize;
	max = PAGE_CACHE_SIZE - offset;

	if (index != page->index)
		return -EINVAL;

	/*
	 * correct length if it does not fall between
	 * 'from' and the end of the page
	 */
	if (length > max || length < 0)
		length = max;

	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);

	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);

	/* Find the buffer that contains "offset" */
	bh = page_buffers(page);
	pos = blocksize;
	while (offset >= pos) {
		bh = bh->b_this_page;
		iblock++;
		pos += blocksize;
	}

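	/*
	 * Walk the buffer heads covering [offset, offset + length),
	 * zeroing or unmapping one block-sized chunk per iteration.
	 */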
	pos = offset;
	while (pos < offset + length) {
		unsigned int end_of_block, range_to_discard;

		err = 0;

		/* The length of space left to zero and unmap */
		range_to_discard = offset + length - pos;

		/* The length of space until the end of the block */
		end_of_block = blocksize - (pos & (blocksize-1));

		/*
		 * Do not unmap or zero past end of block
		 * for this buffer head
		 */
		if (range_to_discard > end_of_block)
			range_to_discard = end_of_block;


		/*
		 * Skip this buffer head if we are only zeroing unmapped
		 * regions of the page
		 */
		if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
			buffer_mapped(bh))
				goto next;

		/* If the range is block aligned, unmap */
		if (range_to_discard == blocksize) {
			clear_buffer_dirty(bh);
			bh->b_bdev = NULL;
			clear_buffer_mapped(bh);
			clear_buffer_req(bh);
			clear_buffer_new(bh);
			clear_buffer_delay(bh);
			clear_buffer_unwritten(bh);
			clear_buffer_uptodate(bh);
			zero_user(page, pos, range_to_discard);
			BUFFER_TRACE(bh, "Buffer discarded");
			goto next;
		}

		/*
		 * If this block is not completely contained in the range
		 * to be discarded, then it is not going to be released. Because
		 * we need to keep this block, we need to make sure this part
		 * of the page is uptodate before we modify it by writing
		 * partial zeros on it.
		 */
		if (!buffer_mapped(bh)) {
			/*
			 * Buffer head must be mapped before we can read
			 * from the block
			 */
			BUFFER_TRACE(bh, "unmapped");
			ext4_get_block(inode, iblock, bh, 0);
			/* unmapped? It's a hole - nothing to do */
			if (!buffer_mapped(bh)) {
				BUFFER_TRACE(bh, "still unmapped");
				goto next;
			}
		}

		/* Ok, it's mapped. Make sure it's up-to-date */
		if (PageUptodate(page))
			set_buffer_uptodate(bh);

		if (!buffer_uptodate(bh)) {
			err = -EIO;
			ll_rw_block(READ, 1, &bh);
			wait_on_buffer(bh);
			/* Uhhuh. Read error. Complain and punt. */
			if (!buffer_uptodate(bh))
				goto next;
		}

		if (ext4_should_journal_data(inode)) {
			BUFFER_TRACE(bh, "get write access");
			err = ext4_journal_get_write_access(handle, bh);
			if (err)
				goto next;
		}

		zero_user(page, pos, range_to_discard);

		err = 0;
		if (ext4_should_journal_data(inode)) {
			err = ext4_handle_dirty_metadata(handle, inode, bh);
		} else
			mark_buffer_dirty(bh);

		BUFFER_TRACE(bh, "Partial buffer zeroed");
next:
		bh = bh->b_this_page;
		iblock++;
		pos += range_to_discard;
	}

	return err;
}

int ext4_can_truncate(struct inode *inode)
{
	if (S_ISREG(inode->i_mode))
		return 1;
	if (S_ISDIR(inode->i_mode))
		return 1;
	if (S_ISLNK(inode->i_mode))
		return !ext4_inode_is_fast_symlink(inode);
	return 0;
}

/*
 * ext4_punch_hole: punches a hole in a file by releasing the blocks
 * associated with the given offset and length
 *
 * @inode:  File inode
 * @offset: The offset where the hole will begin
 * @len:    The length of the hole
 *
 * Returns: 0 on success or negative on failure
 */

int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	if (!S_ISREG(inode->i_mode))
		return -EOPNOTSUPP;

	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		/* TODO: Add support for non extent hole punching */
		return -EOPNOTSUPP;
	}

	if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
		/* TODO: Add support for bigalloc file systems */
		return -EOPNOTSUPP;
	}

	return ext4_ext_punch_hole(file, offset, length);
}

/*
 * ext4_truncate()
 *
 * We block out ext4_get_block() block instantiations across the entire
 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
 * simultaneously on behalf of the same inode.
 *
 * As we work through the truncate and commit bits of it to the journal there
 * is one core, guiding principle: the file's tree must always be consistent on
 * disk.  We must be able to restart the truncate after a crash.
 *
 * The file's tree may be transiently inconsistent in memory (although it
 * probably isn't), but whenever we close off and commit a journal transaction,
 * the contents of (the filesystem + the journal) must be consistent and
 * restartable.  It's pretty simple, really: bottom up, right to left (although
 * left-to-right works OK too).
 *
 * Note that at recovery time, journal replay occurs *before* the restart of
 * truncate against the orphan inode list.
 *
 * The committed inode has the new, desired i_size (which is the same as
 * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
 * that this inode's truncate did not complete and it will again call
 * ext4_truncate() to have another go.  So there will be instantiated blocks
 * to the right of the truncation point in a crashed ext4 filesystem.  But
 * that's fine - as long as they are linked from the inode, the post-crash
 * ext4_truncate() run will find them and release them.
 */
3495
void ext4_truncate(struct inode *inode)
{
	trace_ext4_truncate_enter(inode);

	if (!ext4_can_truncate(inode))
		return;

	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);

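	/*
	 * Truncating a file to zero flags it for the replace-via-truncate
	 * heuristic: any delalloc blocks will be flushed out when the
	 * file is next closed, reducing the window for zero-length files
	 * after a crash.
	 */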
	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);

	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		ext4_ext_truncate(inode);
	else
		ext4_ind_truncate(inode);

	trace_ext4_truncate_exit(inode);
}

/*
 * ext4_get_inode_loc returns with an extra refcount against the inode's
 * underlying buffer_head on success. If 'in_mem' is true, we have all
 * data in memory that is needed to recreate the on-disk version of this
 * inode.
 */
static int __ext4_get_inode_loc(struct inode *inode,
				struct ext4_iloc *iloc, int in_mem)
{
	struct ext4_group_desc	*gdp;
	struct buffer_head	*bh;
	struct super_block	*sb = inode->i_sb;
	ext4_fsblk_t		block;
	int			inodes_per_block, inode_offset;

	iloc->bh = NULL;
	if (!ext4_valid_inum(sb, inode->i_ino))
		return -EIO;

	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
	if (!gdp)
		return -EIO;

	/*
	 * Figure out the offset within the block group inode table
	 */
	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
	inode_offset = ((inode->i_ino - 1) %
			EXT4_INODES_PER_GROUP(sb));
	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);

	bh = sb_getblk(sb, block);
	if (!bh) {
		EXT4_ERROR_INODE_BLOCK(inode, block,
				       "unable to read itable block");
		return -EIO;
	}
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);

		/*
		 * If the buffer has the write error flag, we have failed
		 * to write out another inode in the same block.  In this
		 * case, we don't have to read the block because we may
		 * read the old inode data successfully.
		 */
		if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
			set_buffer_uptodate(bh);

		if (buffer_uptodate(bh)) {
			/* someone brought it uptodate while we waited */
			unlock_buffer(bh);
			goto has_buffer;
		}

		/*
		 * If we have all information of the inode in memory and this
		 * is the only valid inode in the block, we need not read the
		 * block.
		 */
		if (in_mem) {
			struct buffer_head *bitmap_bh;
			int i, start;

			start = inode_offset & ~(inodes_per_block - 1);

			/* Is the inode bitmap in cache? */
			bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
			if (!bitmap_bh)
				goto make_io;

			/*
			 * If the inode bitmap isn't in cache then the
			 * optimisation may end up performing two reads instead
			 * of one, so skip it.
			 */
			if (!buffer_uptodate(bitmap_bh)) {
				brelse(bitmap_bh);
				goto make_io;
			}
			for (i = start; i < start + inodes_per_block; i++) {
				if (i == inode_offset)
					continue;
				if (ext4_test_bit(i, bitmap_bh->b_data))
					break;
			}
			brelse(bitmap_bh);
			if (i == start + inodes_per_block) {
				/* all other inodes are free, so skip I/O */
				memset(bh->b_data, 0, bh->b_size);
				set_buffer_uptodate(bh);
				unlock_buffer(bh);
				goto has_buffer;
			}
		}

make_io:
		/*
		 * If we need to do any I/O, try to pre-readahead extra
		 * blocks from the inode table.
		 */
		if (EXT4_SB(sb)->s_inode_readahead_blks) {
			ext4_fsblk_t b, end, table;
			unsigned num;

			table = ext4_inode_table(sb, gdp);
			/* s_inode_readahead_blks is always a power of 2 */
			b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
			if (table > b)
				b = table;
			end = b + EXT4_SB(sb)->s_inode_readahead_blks;
			num = EXT4_INODES_PER_GROUP(sb);
			if (ext4_has_group_desc_csum(sb))
				num -= ext4_itable_unused_count(sb, gdp);
			table += num / inodes_per_block;
			if (end > table)
				end = table;
			while (b <= end)
				sb_breadahead(sb, b++);
		}

		/*
		 * There are other valid inodes in the buffer, this inode
		 * has in-inode xattrs, or we don't have this inode in memory.
		 * Read the block from disk.
		 */
		trace_ext4_load_inode(inode);
		get_bh(bh);
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(READ | REQ_META | REQ_PRIO, bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			EXT4_ERROR_INODE_BLOCK(inode, block,
					       "unable to read itable block");
			brelse(bh);
			return -EIO;
		}
	}
has_buffer:
	iloc->bh = bh;
	return 0;
}

int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
	/* We have all inode data except xattrs in memory here. */
	return __ext4_get_inode_loc(inode, iloc,
		!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
}

void ext4_set_inode_flags(struct inode *inode)
{
	unsigned int flags = EXT4_I(inode)->i_flags;

	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
	if (flags & EXT4_SYNC_FL)
		inode->i_flags |= S_SYNC;
	if (flags & EXT4_APPEND_FL)
		inode->i_flags |= S_APPEND;
	if (flags & EXT4_IMMUTABLE_FL)
		inode->i_flags |= S_IMMUTABLE;
	if (flags & EXT4_NOATIME_FL)
		inode->i_flags |= S_NOATIME;
	if (flags & EXT4_DIRSYNC_FL)
		inode->i_flags |= S_DIRSYNC;
}

/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
void ext4_get_inode_flags(struct ext4_inode_info *ei)
{
	unsigned int vfs_fl;
	unsigned long old_fl, new_fl;

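	/*
	 * Transfer the VFS-visible flag bits into i_flags, retrying via
	 * cmpxchg() if another task modified i_flags concurrently.
	 */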
	do {
		vfs_fl = ei->vfs_inode.i_flags;
		old_fl = ei->i_flags;
		new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
				EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
				EXT4_DIRSYNC_FL);
		if (vfs_fl & S_SYNC)
			new_fl |= EXT4_SYNC_FL;
		if (vfs_fl & S_APPEND)
			new_fl |= EXT4_APPEND_FL;
		if (vfs_fl & S_IMMUTABLE)
			new_fl |= EXT4_IMMUTABLE_FL;
		if (vfs_fl & S_NOATIME)
			new_fl |= EXT4_NOATIME_FL;
		if (vfs_fl & S_DIRSYNC)
			new_fl |= EXT4_DIRSYNC_FL;
	} while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
}

static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
				  struct ext4_inode_info *ei)
{
	blkcnt_t i_blocks;
	struct inode *inode = &(ei->vfs_inode);
	struct super_block *sb = inode->i_sb;

	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
				EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
		/* we are using combined 48 bit field */
		i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
					le32_to_cpu(raw_inode->i_blocks_lo);
		if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
			/* i_blocks represent file system block size */
			return i_blocks  << (inode->i_blkbits - 9);
		} else {
			return i_blocks;
		}
	} else {
		return le32_to_cpu(raw_inode->i_blocks_lo);
	}
}

static inline void ext4_iget_extra_inode(struct inode *inode,
					 struct ext4_inode *raw_inode,
					 struct ext4_inode_info *ei)
{
	__le32 *magic = (void *)raw_inode +
			EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
	if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
		ext4_set_inode_state(inode, EXT4_STATE_XATTR);
		ext4_find_inline_data_nolock(inode);
	}
}

struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
{
	struct ext4_iloc iloc;
	struct ext4_inode *raw_inode;
	struct ext4_inode_info *ei;
	struct inode *inode;
	journal_t *journal = EXT4_SB(sb)->s_journal;
	long ret;
	int block;
	uid_t i_uid;
	gid_t i_gid;

	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;

	ei = EXT4_I(inode);
	iloc.bh = NULL;

	ret = __ext4_get_inode_loc(inode, &iloc, 0);
	if (ret < 0)
		goto bad_inode;
	raw_inode = ext4_raw_inode(&iloc);

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
		    EXT4_INODE_SIZE(inode->i_sb)) {
			EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
				EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
				EXT4_INODE_SIZE(inode->i_sb));
			ret = -EIO;
			goto bad_inode;
		}
	} else
		ei->i_extra_isize = 0;

	/* Precompute checksum seed for inode metadata */
	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
		__u32 csum;
		__le32 inum = cpu_to_le32(inode->i_ino);
		__le32 gen = raw_inode->i_generation;
		csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
				   sizeof(inum));
		ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
					      sizeof(gen));
	}

	if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
		EXT4_ERROR_INODE(inode, "checksum invalid");
		ret = -EIO;
		goto bad_inode;
	}

	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
	i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
	i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
	if (!(test_opt(inode->i_sb, NO_UID32))) {
		i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
		i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
	}
	i_uid_write(inode, i_uid);
	i_gid_write(inode, i_gid);
	set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));

	ext4_clear_state_flags(ei);	/* Only relevant on 32-bit archs */
	ei->i_inline_off = 0;
	ei->i_dir_start_lookup = 0;
	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
	/* We now have enough fields to check if the inode was active or not.
	 * This is needed because nfsd might try to access dead inodes;
	 * the test is the same one that e2fsck uses.
	 * NeilBrown 1999oct15
	 */
	if (inode->i_nlink == 0) {
		if (inode->i_mode == 0 ||
		    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
			/* this inode is deleted */
			ret = -ESTALE;
			goto bad_inode;
		}
		/* The only unlinked inodes we let through here have
		 * valid i_mode and are being read by the orphan
		 * recovery code: that's fine, we're about to complete
		 * the process of deleting those. */
	}
	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
		ei->i_file_acl |=
			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
	inode->i_size = ext4_isize(raw_inode);
	ei->i_disksize = inode->i_size;
#ifdef CONFIG_QUOTA
	ei->i_reserved_quota = 0;
#endif
	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
	ei->i_block_group = iloc.block_group;
	ei->i_last_alloc_group = ~0;
	/*
	 * NOTE! The in-memory inode i_data array is in little-endian order
	 * even on big-endian machines: we do NOT byteswap the block numbers!
	 */
	for (block = 0; block < EXT4_N_BLOCKS; block++)
		ei->i_data[block] = raw_inode->i_block[block];
	INIT_LIST_HEAD(&ei->i_orphan);

	/*
	 * Set transaction id's of transactions that have to be committed
	 * to finish f[data]sync. We set them to currently running transaction
	 * as we cannot be sure that the inode or some of its metadata isn't
	 * part of the transaction - the inode could have been reclaimed and
	 * now it is reread from disk.
	 */
	if (journal) {
		transaction_t *transaction;
		tid_t tid;

		read_lock(&journal->j_state_lock);
		if (journal->j_running_transaction)
			transaction = journal->j_running_transaction;
		else
			transaction = journal->j_committing_transaction;
		if (transaction)
			tid = transaction->t_tid;
		else
			tid = journal->j_commit_sequence;
		read_unlock(&journal->j_state_lock);
		ei->i_sync_tid = tid;
		ei->i_datasync_tid = tid;
	}

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
		if (ei->i_extra_isize == 0) {
			/* The extra space is currently unused. Use it. */
			ei->i_extra_isize = sizeof(struct ext4_inode) -
					    EXT4_GOOD_OLD_INODE_SIZE;
		} else {
			ext4_iget_extra_inode(inode, raw_inode, ei);
		}
	}

	EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
	EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);

	inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
			inode->i_version |=
			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
	}

	ret = 0;
	if (ei->i_file_acl &&
	    !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
		EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
				 ei->i_file_acl);
		ret = -EIO;
		goto bad_inode;
	} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
		    (S_ISLNK(inode->i_mode) &&
		     !ext4_inode_is_fast_symlink(inode)))
			/* Validate extent which is part of inode */
			ret = ext4_ext_check_inode(inode);
	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
		   (S_ISLNK(inode->i_mode) &&
		    !ext4_inode_is_fast_symlink(inode))) {
		/* Validate block references which are part of inode */
		ret = ext4_ind_check_inode(inode);
	}
	if (ret)
		goto bad_inode;

	if (S_ISREG(inode->i_mode)) {
		inode->i_op = &ext4_file_inode_operations;
		inode->i_fop = &ext4_file_operations;
		ext4_set_aops(inode);
	} else if (S_ISDIR(inode->i_mode)) {
		inode->i_op = &ext4_dir_inode_operations;
		inode->i_fop = &ext4_dir_operations;
	} else if (S_ISLNK(inode->i_mode)) {
		if (ext4_inode_is_fast_symlink(inode)) {
			inode->i_op = &ext4_fast_symlink_inode_operations;
			nd_terminate_link(ei->i_data, inode->i_size,
				sizeof(ei->i_data) - 1);
		} else {
			inode->i_op = &ext4_symlink_inode_operations;
			ext4_set_aops(inode);
		}
	} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
	      S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
		inode->i_op = &ext4_special_inode_operations;
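		/*
		 * Old kernels encode the device number in i_block[0];
		 * newer ones leave i_block[0] zero and use the larger
		 * encoding in i_block[1].
		 */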
		if (raw_inode->i_block[0])
			init_special_inode(inode, inode->i_mode,
			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
		else
			init_special_inode(inode, inode->i_mode,
			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
	} else {
		ret = -EIO;
		EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
		goto bad_inode;
	}
	brelse(iloc.bh);
	ext4_set_inode_flags(inode);
	unlock_new_inode(inode);
	return inode;

bad_inode:
	brelse(iloc.bh);
	iget_failed(inode);
	return ERR_PTR(ret);
}

static int ext4_inode_blocks_set(handle_t *handle,
				struct ext4_inode *raw_inode,
				struct ext4_inode_info *ei)
{
	struct inode *inode = &(ei->vfs_inode);
	u64 i_blocks = inode->i_blocks;
	struct super_block *sb = inode->i_sb;

	if (i_blocks <= ~0U) {
		/*
		 * i_blocks can be represented in a 32 bit variable
		 * as multiple of 512 bytes
		 */
		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
		raw_inode->i_blocks_high = 0;
		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
		return 0;
	}
	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
		return -EFBIG;

	if (i_blocks <= 0xffffffffffffULL) {
		/*
		 * i_blocks can be represented in a 48 bit variable
		 * as multiple of 512 bytes
		 */
		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
	} else {
		ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
		/* i_block is stored in file system block size */
		i_blocks = i_blocks >> (inode->i_blkbits - 9);
		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
	}
	return 0;
}