/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

#include <trace/events/ext4.h>

#define MPAGE_DA_EXTENT_TAIL 0x01

static inline int ext4_begin_ordered_truncate(struct inode *inode,
					      loff_t new_size)
{
	trace_ext4_begin_ordered_truncate(inode, new_size);
	/*
	 * If jinode is zero, then we never opened the file for
	 * writing, so there's no need to call
	 * jbd2_journal_begin_ordered_truncate() since there are no
	 * outstanding writes we need to flush.
	 */
	if (!EXT4_I(inode)->jinode)
		return 0;
	return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
						   EXT4_I(inode)->jinode,
						   new_size);
}

static void ext4_invalidatepage(struct page *page, unsigned long offset);
static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create);
static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
		struct inode *inode, struct page *page, loff_t from,
		loff_t length, int flags);

/*
 * Test whether an inode is a fast symlink.
 */
static int ext4_inode_is_fast_symlink(struct inode *inode)
{
	int ea_blocks = EXT4_I(inode)->i_file_acl ?
		(inode->i_sb->s_blocksize >> 9) : 0;

	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}

/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
				 int nblocks)
{
	int ret;

	/*
	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
	 * moment, get_block can be called only for blocks inside i_size since
	 * page cache has been already dropped and writes are blocked by
	 * i_mutex. So we can safely drop the i_data_sem here.
	 */
	BUG_ON(EXT4_JOURNAL(inode) == NULL);
	jbd_debug(2, "restarting handle %p\n", handle);
	up_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_journal_restart(handle, nblocks);
	down_write(&EXT4_I(inode)->i_data_sem);
	ext4_discard_preallocations(inode);

	return ret;
}

/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext4_evict_inode(struct inode *inode)
{
	handle_t *handle;
	int err;

	trace_ext4_evict_inode(inode);

	ext4_ioend_wait(inode);

	if (inode->i_nlink) {
		/*
		 * When journalling data dirty buffers are tracked only in the
		 * journal. So although mm thinks everything is clean and
		 * ready for reaping the inode might still have some pages to
		 * write in the running transaction or waiting to be
		 * checkpointed. Thus calling jbd2_journal_invalidatepage()
		 * (via truncate_inode_pages()) to discard these buffers can
		 * cause data loss. Also, even if we did not discard these
		 * buffers, we would have no way to find them after the inode
		 * is reaped, and the user could then see stale data when
		 * reading them before the transaction is checkpointed. So be
		 * careful and force everything to disk here... We use
		 * ei->i_datasync_tid to store the newest transaction
		 * containing inode's data.
		 *
		 * Note that directories do not have this problem because they
		 * don't use page cache.
		 */
		if (ext4_should_journal_data(inode) &&
		    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
			journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
			tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;

			jbd2_log_start_commit(journal, commit_tid);
			jbd2_log_wait_commit(journal, commit_tid);
			filemap_write_and_wait(&inode->i_data);
		}
		truncate_inode_pages(&inode->i_data, 0);
		goto no_delete;
	}

	if (!is_bad_inode(inode))
		dquot_initialize(inode);

	if (ext4_should_order_data(inode))
		ext4_begin_ordered_truncate(inode, 0);
	truncate_inode_pages(&inode->i_data, 0);

	if (is_bad_inode(inode))
		goto no_delete;

	handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
	if (IS_ERR(handle)) {
		ext4_std_error(inode->i_sb, PTR_ERR(handle));
		/*
		 * If we're going to skip the normal cleanup, we still need to
		 * make sure that the in-core orphan linked list is properly
		 * cleaned up.
		 */
		ext4_orphan_del(NULL, inode);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		ext4_handle_sync(handle);
	inode->i_size = 0;
	err = ext4_mark_inode_dirty(handle, inode);
	if (err) {
		ext4_warning(inode->i_sb,
			     "couldn't mark inode dirty (err %d)", err);
		goto stop_handle;
	}
	if (inode->i_blocks)
		ext4_truncate(inode);

	/*
	 * ext4_ext_truncate() doesn't reserve any slop when it
	 * restarts journal transactions; therefore there may not be
	 * enough credits left in the handle to remove the inode from
	 * the orphan list and set the dtime field.
	 */
	if (!ext4_handle_has_enough_credits(handle, 3)) {
		err = ext4_journal_extend(handle, 3);
		if (err > 0)
			err = ext4_journal_restart(handle, 3);
		if (err != 0) {
			ext4_warning(inode->i_sb,
				     "couldn't extend journal (err %d)", err);
		stop_handle:
			ext4_journal_stop(handle);
			ext4_orphan_del(NULL, inode);
			goto no_delete;
		}
	}

	/*
	 * Kill off the orphan record which ext4_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext4_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext4_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext4_orphan_del(handle, inode);
	EXT4_I(inode)->i_dtime	= get_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext4_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		ext4_clear_inode(inode);
	else
		ext4_free_inode(handle, inode);
	ext4_journal_stop(handle);
	return;
no_delete:
	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
}

#ifdef CONFIG_QUOTA
qsize_t *ext4_get_reserved_space(struct inode *inode)
{
	return &EXT4_I(inode)->i_reserved_quota;
}
#endif

/*
 * Calculate the number of metadata blocks that need to be reserved
 * to allocate a block located at @lblock
 */
static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		return ext4_ext_calc_metadata_amount(inode, lblock);

	return ext4_ind_calc_metadata_amount(inode, lblock);
}

/*
 * Called with i_data_sem down, which is important since we can call
 * ext4_discard_preallocations() from here.
 */
void ext4_da_update_reserve_space(struct inode *inode,
					int used, int quota_claim)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);

	spin_lock(&ei->i_block_reservation_lock);
	trace_ext4_da_update_reserve_space(inode, used, quota_claim);
	if (unlikely(used > ei->i_reserved_data_blocks)) {
		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
			 "with only %d reserved data blocks",
			 __func__, inode->i_ino, used,
			 ei->i_reserved_data_blocks);
		WARN_ON(1);
		used = ei->i_reserved_data_blocks;
	}

	/* Update per-inode reservations */
	ei->i_reserved_data_blocks -= used;
	ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
	percpu_counter_sub(&sbi->s_dirtyclusters_counter,
			   used + ei->i_allocated_meta_blocks);
	ei->i_allocated_meta_blocks = 0;

	if (ei->i_reserved_data_blocks == 0) {
		/*
		 * We can release all of the reserved metadata blocks
		 * only when we have written all of the delayed
		 * allocation blocks.
		 */
		percpu_counter_sub(&sbi->s_dirtyclusters_counter,
				   ei->i_reserved_meta_blocks);
		ei->i_reserved_meta_blocks = 0;
		ei->i_da_metadata_calc_len = 0;
	}
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

	/* Update quota subsystem for data blocks */
	if (quota_claim)
		dquot_claim_block(inode, EXT4_C2B(sbi, used));
	else {
		/*
		 * We did fallocate with an offset that is already delayed
		 * allocated. So on delayed allocated writeback we should
		 * not re-claim the quota for fallocated blocks.
		 */
		dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
	}

	/*
	 * If we have done all the pending block allocations and if
	 * there aren't any writers on the inode, we can discard the
	 * inode's preallocations.
	 */
	if ((ei->i_reserved_data_blocks == 0) &&
	    (atomic_read(&inode->i_writecount) == 0))
		ext4_discard_preallocations(inode);
}

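/*
 * Check that a mapped extent does not overlap forbidden on-disk space;
 * for a mapping to an illegal pblock this reports an inode error and
 * returns -EIO.  Callers use the check_block_validity() macro below so
 * that their __func__ and __LINE__ show up in the error message.
 */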
static int __check_block_validity(struct inode *inode, const char *func,
				unsigned int line,
				struct ext4_map_blocks *map)
{
	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
				   map->m_len)) {
		ext4_error_inode(inode, func, line, map->m_pblk,
				 "lblock %lu mapped to illegal pblock "
				 "(length %d)", (unsigned long) map->m_lblk,
				 map->m_len);
		return -EIO;
	}
	return 0;
}

#define check_block_validity(inode, map)	\
	__check_block_validity((inode), __func__, __LINE__, (map))

/*
 * Return the number of contiguous dirty pages in a given inode
 * starting at page frame idx.
 */
static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
				    unsigned int max_pages)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t	index;
	struct pagevec pvec;
	pgoff_t num = 0;
	int i, nr_pages, done = 0;

	if (max_pages == 0)
		return 0;
	pagevec_init(&pvec, 0);
	while (!done) {
		index = idx;
		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
					      PAGECACHE_TAG_DIRTY,
					      (pgoff_t)PAGEVEC_SIZE);
		if (nr_pages == 0)
			break;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];
			struct buffer_head *bh, *head;

			lock_page(page);
			if (unlikely(page->mapping != mapping) ||
			    !PageDirty(page) ||
			    PageWriteback(page) ||
			    page->index != idx) {
				done = 1;
				unlock_page(page);
				break;
			}
			if (page_has_buffers(page)) {
				bh = head = page_buffers(page);
				do {
					if (!buffer_delay(bh) &&
					    !buffer_unwritten(bh))
						done = 1;
					bh = bh->b_this_page;
				} while (!done && (bh != head));
			}
			unlock_page(page);
			if (done)
				break;
			idx++;
			num++;
			if (num >= max_pages) {
				done = 1;
				break;
			}
		}
		pagevec_release(&pvec);
	}
	return num;
}

/*
 * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
 */
static void set_buffers_da_mapped(struct inode *inode,
				   struct ext4_map_blocks *map)
{
	struct address_space *mapping = inode->i_mapping;
	struct pagevec pvec;
	int i, nr_pages;
	pgoff_t index, end;

	index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
	end = (map->m_lblk + map->m_len - 1) >>
		(PAGE_CACHE_SHIFT - inode->i_blkbits);

	pagevec_init(&pvec, 0);
	while (index <= end) {
		nr_pages = pagevec_lookup(&pvec, mapping, index,
					  min(end - index + 1,
					      (pgoff_t)PAGEVEC_SIZE));
		if (nr_pages == 0)
			break;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];
			struct buffer_head *bh, *head;

			if (unlikely(page->mapping != mapping) ||
			    !PageDirty(page))
				break;

			if (page_has_buffers(page)) {
				bh = head = page_buffers(page);
				do {
					set_buffer_da_mapped(bh);
					bh = bh->b_this_page;
				} while (bh != head);
			}
			index++;
		}
		pagevec_release(&pvec);
	}
}

/*
 * The ext4_map_blocks() function tries to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of the i_data_sem and allocate blocks
 * and store the allocated blocks in the result buffer head and mark it
 * mapped.
 *
 * If file type is extents based, it will call ext4_ext_map_blocks(),
 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
 * based files
 *
 * On success, it returns the number of blocks mapped or allocated.
 * If create == 0 and the blocks are pre-allocated and uninitialized,
 * the result buffer head is unmapped. If create == 1, it will make sure
 * the buffer head is mapped.
 *
 * It returns 0 if a plain lookup failed (blocks have not been
 * allocated); in that case, the buffer head is unmapped.
 *
 * It returns the error in case of allocation failure.
 */
int ext4_map_blocks(handle_t *handle, struct inode *inode,
		    struct ext4_map_blocks *map, int flags)
{
	int retval;

	map->m_flags = 0;
	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
		  (unsigned long) map->m_lblk);
	/*
	 * Try to see if we can get the block without requesting a new
	 * file system block.
	 */
	down_read((&EXT4_I(inode)->i_data_sem));
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, flags &
					     EXT4_GET_BLOCKS_KEEP_SIZE);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, flags &
					     EXT4_GET_BLOCKS_KEEP_SIZE);
	}
	up_read((&EXT4_I(inode)->i_data_sem));

	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		int ret = check_block_validity(inode, map);
		if (ret != 0)
			return ret;
	}

	/* If it is only a block(s) look up */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
		return retval;

	/*
	 * Return if the blocks have already been allocated.
	 *
	 * Note that if blocks have been preallocated
	 * ext4_ext_get_block() handles the create == 0 case by
	 * returning with the buffer head unmapped.
	 */
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
		return retval;

	/*
	 * When we call get_blocks without the create flag, the
	 * BH_Unwritten flag could have gotten set if the blocks
	 * requested were part of an uninitialized extent.  We need to
	 * clear this flag now that we are committed to convert all or
	 * part of the uninitialized extent to be an initialized
	 * extent.  This is because we need to avoid the combination
	 * of BH_Unwritten and BH_Mapped flags being simultaneously
	 * set on the buffer_head.
	 */
	map->m_flags &= ~EXT4_MAP_UNWRITTEN;

	/*
	 * Allocating new blocks and/or writing to an uninitialized
	 * extent may result in i_data being updated, so we take
	 * the write lock of i_data_sem and call get_blocks()
	 * with the create == 1 flag.
	 */
	down_write((&EXT4_I(inode)->i_data_sem));

	/*
	 * if the caller is from delayed allocation writeout path
	 * we have already reserved fs blocks for allocation
	 * let the underlying get_block() function know to
	 * avoid double accounting
	 */
	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
		ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
	/*
	 * We need to check for EXT4 here because migrate
	 * could have changed the inode type in between
	 */
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, flags);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, flags);

		if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
			/*
			 * We allocated new blocks which will result in
			 * i_data's format changing.  Force the migrate
			 * to fail by clearing migrate flags
			 */
			ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
		}

		/*
		 * Update reserved blocks/metadata blocks after successful
		 * block allocation which had been deferred till now. We don't
		 * support fallocate for non extent files. So we can update
		 * reserve space here.
		 */
		if ((retval > 0) &&
			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
			ext4_da_update_reserve_space(inode, retval, 1);
	}
	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);

		/* If we have successfully mapped the delayed allocated blocks,
		 * set the BH_Da_Mapped bit on them. It's important to do this
		 * under the protection of i_data_sem.
		 */
		if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
			set_buffers_da_mapped(inode, map);
	}

	up_write((&EXT4_I(inode)->i_data_sem));
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		int ret = check_block_validity(inode, map);
		if (ret != 0)
			return ret;
	}
	return retval;
}

/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096

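/*
 * Common helper behind ext4_get_block() and the direct IO paths.  If a
 * create flag is passed without a running transaction (the direct IO
 * write case), a handle covering at most DIO_MAX_BLOCKS blocks is
 * started around the ext4_map_blocks() call and stopped afterwards.
 */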
static int _ext4_get_block(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh, int flags)
{
	handle_t *handle = ext4_journal_current_handle();
	struct ext4_map_blocks map;
	int ret = 0, started = 0;
	int dio_credits;

	map.m_lblk = iblock;
	map.m_len = bh->b_size >> inode->i_blkbits;

	if (flags && !handle) {
		/* Direct IO write... */
		if (map.m_len > DIO_MAX_BLOCKS)
			map.m_len = DIO_MAX_BLOCKS;
		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
		handle = ext4_journal_start(inode, dio_credits);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			return ret;
		}
		started = 1;
	}

	ret = ext4_map_blocks(handle, inode, &map, flags);
	if (ret > 0) {
		map_bh(bh, inode->i_sb, map.m_pblk);
		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
		ret = 0;
	}
	if (started)
		ext4_journal_stop(handle);
	return ret;
}

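/*
 * Standard buffer_head based ->get_block callback: translate the
 * boolean create argument into EXT4_GET_BLOCKS_CREATE.
 */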
int ext4_get_block(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh, int create)
{
	return _ext4_get_block(inode, iblock, bh,
			       create ? EXT4_GET_BLOCKS_CREATE : 0);
}

/*
 * `handle' can be NULL if create is zero
 */
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
				ext4_lblk_t block, int create, int *errp)
{
	struct ext4_map_blocks map;
	struct buffer_head *bh;
	int fatal = 0, err;

	J_ASSERT(handle != NULL || create == 0);

	map.m_lblk = block;
	map.m_len = 1;
	err = ext4_map_blocks(handle, inode, &map,
			      create ? EXT4_GET_BLOCKS_CREATE : 0);

	if (err < 0)
		*errp = err;
	if (err <= 0)
		return NULL;
	*errp = 0;

	bh = sb_getblk(inode->i_sb, map.m_pblk);
	if (!bh) {
		*errp = -EIO;
		return NULL;
	}
	if (map.m_flags & EXT4_MAP_NEW) {
		J_ASSERT(create != 0);
		J_ASSERT(handle != NULL);

		/*
		 * Now that we do not always journal data, we should
		 * keep in mind whether this should always journal the
		 * new buffer as metadata.  For now, regular file
		 * writes use ext4_get_block instead, so it's not a
		 * problem.
		 */
		lock_buffer(bh);
		BUFFER_TRACE(bh, "call get_create_access");
		fatal = ext4_journal_get_create_access(handle, bh);
		if (!fatal && !buffer_uptodate(bh)) {
			memset(bh->b_data, 0, inode->i_sb->s_blocksize);
			set_buffer_uptodate(bh);
		}
		unlock_buffer(bh);
		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
		err = ext4_handle_dirty_metadata(handle, inode, bh);
		if (!fatal)
			fatal = err;
	} else {
		BUFFER_TRACE(bh, "not a new buffer");
	}
	if (fatal) {
		*errp = fatal;
		brelse(bh);
		bh = NULL;
	}
	return bh;
}

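/*
 * Like ext4_getblk(), but additionally read the block from disk and
 * wait for it to become uptodate when it is not already in memory.
 */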
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
			       ext4_lblk_t block, int create, int *err)
{
	struct buffer_head *bh;

	bh = ext4_getblk(handle, inode, block, create, err);
	if (!bh)
		return bh;
	if (buffer_uptodate(bh))
		return bh;
	ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	put_bh(bh);
	*err = -EIO;
	return NULL;
}

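/*
 * Apply fn() to every buffer head of the page that overlaps the byte
 * range [from, to).  Buffers wholly outside the range only update
 * *partial (set when one of them is not uptodate).  The walk stops at
 * the first error from fn(), which is then returned.
 *
 * Typical use, as in ext4_write_begin() below:
 *
 *	walk_page_buffers(handle, page_buffers(page), from, to,
 *			  NULL, do_journal_get_write_access);
 */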
static int walk_page_buffers(handle_t *handle,
			     struct buffer_head *head,
			     unsigned from,
			     unsigned to,
			     int *partial,
			     int (*fn)(handle_t *handle,
				       struct buffer_head *bh))
{
	struct buffer_head *bh;
	unsigned block_start, block_end;
	unsigned blocksize = head->b_size;
	int err, ret = 0;
	struct buffer_head *next;

	for (bh = head, block_start = 0;
	     ret == 0 && (bh != head || !block_start);
	     block_start = block_end, bh = next) {
		next = bh->b_this_page;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (partial && !buffer_uptodate(bh))
				*partial = 1;
			continue;
		}
		err = (*fn)(handle, bh);
		if (!ret)
			ret = err;
	}
	return ret;
}

/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext4_get_block()
 * and the commit_write().  So doing the jbd2_journal_start at the start of
 * prepare_write() is the right place.
 *
 * Also, this function can nest inside ext4_writepage() ->
 * block_write_full_page(). In that case, we *know* that ext4_writepage()
 * has generated enough buffer credits to do the whole page.  So we won't
 * block on the journal in that case, which is good, because the caller may
 * be PF_MEMALLOC.
 *
 * By accident, ext4 can be reentered when a transaction is open via
 * quota file writes.  If we were to commit the transaction while thus
 * reentered, there can be a deadlock - we would be holding a quota
 * lock, and the commit would never complete if another thread had a
 * transaction open and was blocking on the quota lock - a ranking
 * violation.
 *
 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
 * will _not_ run commit under these circumstances because handle->h_ref
 * is elevated.  We'll still have enough credits for the tiny quotafile
 * write.
 */
static int do_journal_get_write_access(handle_t *handle,
				       struct buffer_head *bh)
{
	int dirty = buffer_dirty(bh);
	int ret;

	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	/*
	 * __block_write_begin() could have dirtied some buffers. Clean
	 * the dirty bit as jbd2_journal_get_write_access() could complain
	 * otherwise about fs integrity issues. Setting of the dirty bit
	 * by __block_write_begin() isn't a real problem here as we clear
	 * the bit before releasing a page lock and thus writeback cannot
	 * ever write the buffer.
	 */
	if (dirty)
		clear_buffer_dirty(bh);
	ret = ext4_journal_get_write_access(handle, bh);
	if (!ret && dirty)
		ret = ext4_handle_dirty_metadata(handle, NULL, bh);
	return ret;
}

static int ext4_get_block_write(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh_result, int create);
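/*
 * ->write_begin callback: start a transaction (with one extra credit
 * so the inode can be put on the orphan list if the write fails), lock
 * the page, and map or allocate the blocks backing [pos, pos+len).  In
 * data=journal mode the affected buffers are also journalled via
 * do_journal_get_write_access().
 */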
static int ext4_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	int ret, needed_blocks;
	handle_t *handle;
	int retries = 0;
	struct page *page;
	pgoff_t index;
	unsigned from, to;

	trace_ext4_write_begin(inode, pos, len, flags);
	/*
	 * Reserve one block more for addition to orphan list in case
	 * we allocate blocks but write fails for some reason
	 */
	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
	index = pos >> PAGE_CACHE_SHIFT;
	from = pos & (PAGE_CACHE_SIZE - 1);
	to = from + len;

retry:
	handle = ext4_journal_start(inode, needed_blocks);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	/* We cannot recurse into the filesystem as the transaction is already
	 * started */
	flags |= AOP_FLAG_NOFS;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page) {
		ext4_journal_stop(handle);
		ret = -ENOMEM;
		goto out;
	}
	*pagep = page;

	if (ext4_should_dioread_nolock(inode))
		ret = __block_write_begin(page, pos, len, ext4_get_block_write);
	else
		ret = __block_write_begin(page, pos, len, ext4_get_block);

	if (!ret && ext4_should_journal_data(inode)) {
		ret = walk_page_buffers(handle, page_buffers(page),
				from, to, NULL, do_journal_get_write_access);
	}

	if (ret) {
		unlock_page(page);
		page_cache_release(page);
		/*
		 * __block_write_begin may have instantiated a few blocks
		 * outside i_size.  Trim these off again. Don't need
		 * i_size_read because we hold i_mutex.
		 *
		 * Add inode to orphan list in case we crash before
		 * truncate finishes
		 */
		if (pos + len > inode->i_size && ext4_can_truncate(inode))
			ext4_orphan_add(handle, inode);

		ext4_journal_stop(handle);
		if (pos + len > inode->i_size) {
			ext4_truncate_failed_write(inode);
			/*
			 * If truncate failed early the inode might
			 * still be on the orphan list; we need to
			 * make sure the inode is removed from the
			 * orphan list in that case.
			 */
			if (inode->i_nlink)
				ext4_orphan_del(NULL, inode);
		}
	}

	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
out:
	return ret;
}

/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	set_buffer_uptodate(bh);
	return ext4_handle_dirty_metadata(handle, NULL, bh);
}

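/*
 * Common tail for the ->write_end callbacks below: commit the copy via
 * block_write_end(), advance i_size/i_disksize while the page is still
 * locked, and mark the inode dirty only after the page lock has been
 * dropped (see the comments in the body for why).
 */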
static int ext4_generic_write_end(struct file *file,
				  struct address_space *mapping,
				  loff_t pos, unsigned len, unsigned copied,
				  struct page *page, void *fsdata)
{
	int i_size_changed = 0;
	struct inode *inode = mapping->host;
	handle_t *handle = ext4_journal_current_handle();

	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold i_mutex.
	 *
	 * But it's important to update i_size while still holding page lock:
	 * page writeout could otherwise come in and zero beyond i_size.
	 */
	if (pos + copied > inode->i_size) {
		i_size_write(inode, pos + copied);
		i_size_changed = 1;
	}

	if (pos + copied >  EXT4_I(inode)->i_disksize) {
		/* We need to mark the inode dirty even if
		 * new_i_size is less than inode->i_size
		 * but greater than i_disksize (hint: delalloc).
		 */
		ext4_update_i_disksize(inode, (pos + copied));
		i_size_changed = 1;
	}
	unlock_page(page);
	page_cache_release(page);

	/*
	 * Don't mark the inode dirty under page lock. First, it unnecessarily
	 * makes the holding time of page lock longer. Second, it forces lock
	 * ordering of page lock and transaction start for journaling
	 * filesystems.
	 */
	if (i_size_changed)
		ext4_mark_inode_dirty(handle, inode);

	return copied;
}

/*
 * We need to pick up the new inode size which generic_commit_write gave us
 * `file' can be NULL - eg, when called from page_symlink().
 *
 * ext4 never places buffers on inode->i_mapping->private_list.  Metadata
 * buffers are managed internally.
 */
static int ext4_ordered_write_end(struct file *file,
				  struct address_space *mapping,
				  loff_t pos, unsigned len, unsigned copied,
				  struct page *page, void *fsdata)
{
	handle_t *handle = ext4_journal_current_handle();
	struct inode *inode = mapping->host;
	int ret = 0, ret2;

	trace_ext4_ordered_write_end(inode, pos, len, copied);
	ret = ext4_jbd2_file_inode(handle, inode);

	if (ret == 0) {
		ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
							page, fsdata);
		copied = ret2;
		if (pos + len > inode->i_size && ext4_can_truncate(inode))
			/* If we have allocated more blocks than we
			 * have copied, we will have blocks allocated
			 * outside inode->i_size, so truncate them.
			 */
			ext4_orphan_add(handle, inode);
		if (ret2 < 0)
			ret = ret2;
	} else {
		unlock_page(page);
		page_cache_release(page);
	}

	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;

	if (pos + len > inode->i_size) {
		ext4_truncate_failed_write(inode);
		/*
		 * If truncate failed early the inode might still be
		 * on the orphan list; we need to make sure the inode
		 * is removed from the orphan list in that case.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}


	return ret ? ret : copied;
}

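/*
 * ->write_end callback for data=writeback mode: unlike the ordered
 * variant above, no ext4_jbd2_file_inode() call is needed, so the
 * generic tail does most of the work.
 */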
static int ext4_writeback_write_end(struct file *file,
				    struct address_space *mapping,
				    loff_t pos, unsigned len, unsigned copied,
				    struct page *page, void *fsdata)
{
	handle_t *handle = ext4_journal_current_handle();
	struct inode *inode = mapping->host;
	int ret = 0, ret2;

	trace_ext4_writeback_write_end(inode, pos, len, copied);
	ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,